In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [3]:
# Load the DBPedia training CSV, skipping the first line of the file and
# assigning explicit names to the three columns.
# NOTE(review): skiprows=1 discards line 1 of the file; if train.csv has no
# header row this silently drops one real sample -- confirm against the file.
dbpedia_df = pd.read_csv(
    './datasets/dbpedia_csv/train.csv',
    skiprows=1,
    names=['Label', 'Name', 'Text'],
)

In [4]:
dbpedia_df.shape

(559999, 3)

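### DBPedia classes

The `Label` column is an integer from 1 to 14. The classes below are listed in label order (1 = Company, ..., 14 = WrittenWork).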

- Company
- EducationalInstitution
- Artist
- Athlete
- OfficeHolder
- MeanOfTransportation
- Building
- NaturalPlace
- Village
- Animal
- Plant
- Album
- Film
- WrittenWork

In [5]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [6]:
# Down-sample to 10,000 rows without replacement.
# random_state pins the draw so the same subset (and therefore the same
# vocabulary, split and metrics downstream) is produced on every
# Restart & Run All; without it each run reports different numbers.
dbpedia_df = dbpedia_df.sample(10000, replace=False, random_state=42)

In [7]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
500996,13,The County Chairman,The County Chairman is a 1935 comedy film dir...
40795,2,Our Lady of Mount Carmel High School (Baltimor...,Our Lady of Mount Carmel High School (OLMC HS...
131453,4,Jim Kleinsasser,Jimmy Carter Kleinsasser (/ˈklaɪnsɑːsər/; bor...
122291,4,August Klingler,August Klingler (24 February 1918 – 23 Novemb...
38965,1,Fine Fare,Fine Fare was the name of a chain of supermar...
89852,3,Desmond Devlin,Desmond Devlin is an American comedy writer. ...
481930,13,A Sister to Assist 'Er (1938 film),A Sister to Assist 'Er is a 1938 British come...
95008,3,Megumi Makihara,Megumi Makihara (槇原めぐみ or 槙原めぐみ or 慎原めぐみ Maki...
543571,14,Annals of Science,Annals of Science is a peer-reviewed academic...
356791,9,Dula Gavabar,Dula Gavabar (Persian: دولاگوابر‎ also Romani...


In [8]:
dbpedia_df.shape

(10000, 3)

In [9]:
# Model input is the 'Text' column; the prediction target is the 'Label' column.
X, Y = dbpedia_df['Text'], dbpedia_df['Label']

In [10]:
X.head()

113209     Eisuke Yoshiyuki (吉行 エイスケ Yoshiyuki Eisuke Ma...
10443      Schroders plc is a British multinational asse...
152897     Patrice Ferri is a retired French association...
485929     Honey 2 is a dance film which is a sequel to ...
312910     Marjanah is an impact crater in the northern ...
Name: Text, dtype: object

In [11]:
Y.head()

113209     3
10443      1
152897     4
485929    13
312910     8
Name: Label, dtype: int64

In [12]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    The report lists the number of test samples, the count and the fraction
    of correct predictions, and precision and recall computed with
    ``average='weighted'``.

    Parameters
    ----------
    y_test : true labels; must support ``len()``.
    y_pred : predicted labels, passed to the metric functions alongside
        ``y_test``.

    Returns
    -------
    None. Everything is written to stdout via ``print``.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    weighted_recall = recall_score(y_test, y_pred, average='weighted')

    # (caption, value) pairs, printed in this order.
    report_lines = [
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", n_correct),
        ("accuracy_score : ", fraction_correct),
        ("precision_score : ", weighted_precision),
        ("recall_score : ", weighted_recall),
    ]
    for caption, value in report_lines:
        print(caption, value)

In [13]:
# Bag-of-words vectorizer with default settings.
count_vectorizer = CountVectorizer()

# Learn the vocabulary from X and encode each document as token counts.
# NOTE(review): the vectorizer is fit on every row of X, before the
# train/test split performed later, so the vocabulary also contains terms
# that appear only in documents that end up in the test set.
feature_vector = count_vectorizer.fit_transform(X)

# (number of documents, number of vocabulary terms)
feature_vector.shape

(10000, 48133)

In [14]:
print(feature_vector[0])

  (0, 44965)	1
  (0, 45159)	1
  (0, 24477)	1
  (0, 30174)	1
  (0, 5353)	1
  (0, 2617)	2
  (0, 4213)	1
  (0, 29184)	1
  (0, 770)	1
  (0, 41370)	1
  (0, 2229)	1
  (0, 22994)	1
  (0, 44731)	1
  (0, 22447)	1
  (0, 11806)	1
  (0, 3406)	1
  (0, 29564)	1
  (0, 3087)	1
  (0, 21905)	1
  (0, 38741)	1
  (0, 19410)	3
  (0, 32904)	1
  (0, 30043)	1
  (0, 20446)	1
  (0, 6733)	1
  (0, 18881)	1
  (0, 4662)	2
  (0, 21408)	1
  (0, 44526)	4
  (0, 696)	1
  (0, 21879)	1
  (0, 653)	1
  (0, 27)	1
  (0, 26417)	1
  (0, 47763)	1
  (0, 47850)	1
  (0, 45736)	4
  (0, 13969)	3


In [15]:
# Materialize the sparse count matrix as a plain 2-D numpy.ndarray.
# .toarray() is used instead of .todense(): .todense() returns numpy.matrix,
# which current scikit-learn releases refuse as estimator input.
# NOTE: the dense array holds rows x vocabulary-size entries, so it is
# memory-hungry; keep the sampled row count modest.
X_dense = feature_vector.toarray()

In [16]:
X_dense.shape

(10000, 48133)

In [17]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split -- and the metrics reported
# from it -- are identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_dense, Y, test_size=0.2, random_state=42
)

In [18]:
x_train.shape, x_test.shape

((8000, 48133), (2000, 48133))

In [19]:
y_train.shape, y_test.shape

((8000,), (2000,))

In [20]:
# Fit a Gaussian naive Bayes classifier (default parameters) on the training split.
# NOTE(review): the features are raw token counts; MultinomialNB is the usual
# naive Bayes variant for count data -- consider comparing the two.
clf = GaussianNB().fit(x_train, y_train)

In [21]:
# Predict a label for every held-out row; the bare name on the last line
# makes the notebook display the resulting array.
y_pred = clf.predict(x_test)
y_pred

array([ 5, 10,  4, ...,  5,  1, 14])

In [22]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1464
accuracy_score :  0.732
precision_score :  0.7480649828741391
recall_score :  0.732
