In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the training CSV: skip the file's first line and name the three columns explicitly.
# NOTE(review): the loaded shape is 559,999 rows; if train.csv has no header line,
# skiprows=1 silently discards the first sample -- confirm, and drop skiprows if so.
# NOTE(review): the relative path assumes the kernel's working directory contains ./datasets.
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [3]:
# (rows, columns) of the frame exactly as loaded from disk.
dbpedia_df.shape

(559999, 3)

### DBpedia classes

The integer `Label` values 1–14 correspond to the following classes, in order:

- Company
- EducationalInstitution
- Artist
- Athlete
- OfficeHolder
- MeanOfTransportation
- Building
- NaturalPlace
- Village
- Animal
- Plant
- Album
- Film
- WrittenWork

In [4]:
# Distinct class ids present in the Label column.
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
# Continue with a 10,000-row random subsample drawn without replacement
# (presumably to keep vectorisation and training tractable).
# random_state pins the draw so Restart Kernel -> Run All reproduces the same rows
# and therefore the same vocabulary, split and metrics further down.
dbpedia_df = dbpedia_df.sample(10000, replace=False, random_state=42)

In [6]:
# Eyeball 10 random rows; this draw is unseeded, so the rows shown change on every run.
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
524463,14,Open Sesame (manga),Open Sesame is a Japanese manga series writte...
214954,6,USS Porter (DDG-78),USS Porter (DDG-78) is an Arleigh Burke-class...
369188,10,Chionodes luctuella,Chionodes luctuella is a moth of the Gelechii...
15537,1,Rashtriya Chemicals & Fertilizers,Rashtriya Chemicals & Fertilizers Ltd. (RCF) ...
434426,11,Salix gilgiana,Salix gilgiana is a species of willow native ...
315086,8,Devlins Creek,Devlins Creek an urban watercourse that is pa...
133194,4,Warren Donald,Warren Donald (born 7 October 1964) is an Eng...
138090,4,Chris Hatcher (outfielder),Christopher Kenneth Hatcher (born January 7 1...
358873,9,Królewskie Ostrzeszów County,Królewskie [kruˈlɛfskʲɛ] is a village in the ...
108048,3,Hamuera Tamahau Mahupuku,Hamuera Tamahau Mahupuku (c.1842 – 14 January...


In [7]:
# Shape of the frame after subsampling.
dbpedia_df.shape

(10000, 3)

In [8]:
# Model input: the Text column (raw strings, vectorised in a later cell).
X = dbpedia_df['Text']

# Target: the integer class id from the Label column.
Y = dbpedia_df['Label']

In [9]:
# Peek at the first few input texts.
X.head()

77514      Phillips University was a private coeducation...
341059     Nowiny [nɔˈvinɨ] is a village in the administ...
517834     Capricious Summer (Czech: Rozmarné léto) is a...
215388     Maumelle Ordnance Works Locomotive 1 is a gas...
250923     Grace Reformed Church is a historic church lo...
Name: Text, dtype: object

In [10]:
# Peek at the matching class ids; the index values line up with X.
Y.head()

77514      2
341059     9
517834    13
215388     6
250923     7
Name: Label, dtype: int64

In [22]:
def summarize_classification(y_test, y_pred):
    """Print headline metrics comparing predicted labels with the true labels.

    The report lists the number of evaluated samples, how many predictions
    match exactly (as a count and as a fraction), and precision and recall
    averaged across classes with ``average='weighted'``.

    Parameters
    ----------
    y_test : sequence of true labels.
    y_pred : sequence of predicted labels, aligned with ``y_test``.

    Returns
    -------
    None. The report is written to stdout.
    """
    # Every score is computed before anything is printed, so a scoring
    # failure never leaves a half-written report behind.
    scores = {
        "accuracy_score : ": accuracy_score(y_test, y_pred, normalize=True),
        "accuracy_count : ": accuracy_score(y_test, y_pred, normalize=False),
        "precision_score : ": precision_score(y_test, y_pred, average='weighted'),
        "recall_score : ": recall_score(y_test, y_pred, average='weighted'),
    }

    print("Length of testing data: ", len(y_test))

    # Explicit caption order: the count is reported before the fraction.
    for caption in ("accuracy_count : ", "accuracy_score : ",
                    "precision_score : ", "recall_score : "):
        print(caption, scores[caption])

In [12]:
# Bag-of-words encoder; stop_words='english' removes scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from X and encode each text as a sparse row of term counts.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split further
# down, so held-out texts shape the feature space -- fit on training rows only if that matters.
feature_vector = count_vectorizer.fit_transform(X)

# (number of texts, vocabulary size)
feature_vector.shape

(10000, 47886)

In [13]:
# Show the stored (row, column) -> count entries of the first text's sparse row.
print(feature_vector[0])

  (0, 15329)	1
  (0, 28116)	1
  (0, 39106)	1
  (0, 42005)	1
  (0, 14627)	1
  (0, 19453)	1
  (0, 34423)	1
  (0, 7888)	1
  (0, 30095)	1
  (0, 40200)	1
  (0, 19281)	1
  (0, 36984)	1
  (0, 17534)	1
  (0, 9770)	1
  (0, 42720)	1
  (0, 20178)	1
  (0, 9116)	1
  (0, 12475)	1
  (0, 9188)	1
  (0, 9127)	1
  (0, 2428)	1
  (0, 786)	1
  (0, 666)	1
  (0, 39115)	1
  (0, 42819)	1
  (0, 29834)	1
  (0, 14115)	2
  (0, 24524)	1
  (0, 13596)	1
  (0, 19001)	1
  (0, 20458)	1
  (0, 9679)	1
  (0, 32870)	1
  (0, 42838)	2
  (0, 31628)	2


In [14]:
# Densify the sparse count matrix, since GaussianNB requires dense input.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which scikit-learn
# deprecated as estimator input in 1.0 and rejects with a TypeError from 1.2 onwards.
# Memory: 10,000 x ~48k int64 counts is several GB once dense -- shrink the sample or
# cap the vocabulary with CountVectorizer(max_features=...) if this exhausts RAM.
X_dense = feature_vector.toarray()

In [15]:
# Densifying must not change the dimensions; compare with feature_vector.shape.
X_dense.shape

(10000, 47886)

In [16]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the split,
# and every metric computed from it, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2, random_state=42)

In [17]:
# Feature-matrix shapes of the training and held-out partitions.
x_train.shape, x_test.shape

((8000, 47886), (2000, 47886))

In [18]:
# Label-vector shapes; row counts must match the feature partitions.
y_train.shape, y_test.shape

((8000,), (2000,))

In [19]:
# Fit Gaussian Naive Bayes on the dense training counts.
# NOTE(review): GaussianNB models each feature as normally distributed; for word counts
# MultinomialNB is the conventional choice and accepts the sparse matrix directly,
# which would also avoid the dense conversion -- worth comparing.
clf = GaussianNB().fit(x_train, y_train)

In [20]:
# Predict a class id for every held-out row; the bare name on the last line displays the array.
y_pred = clf.predict(x_test)
y_pred

array([ 8, 13,  4, ...,  6,  4, 13])

In [23]:
# Print accuracy plus weighted precision and recall for the held-out predictions.
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1472
accuracy_score :  0.736
precision_score :  0.746912711601029
recall_score :  0.736
