In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

from sklearn.naive_bayes import GaussianNB

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [3]:
# Load the DBpedia training split and name its three columns explicitly.
# NOTE(review): skiprows=1 discards the first line of the file. If train.csv
# has no header row, that line is a real sample (the shape reported below is
# 559999 rows) -- confirm against the file before relying on the row count.
dbpedia_df = pd.read_csv(
    './datasets/dbpedia_csv/train.csv',
    names=['Label', 'Name', 'Text'],
    skiprows=1,
)

In [4]:
# (rows, columns) of the full training frame as loaded.
dbpedia_df.shape

(559999, 3)

In [5]:
# Distinct class ids present in the Label column.
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [6]:
# Keep a 10,000-row random subset (drawn without replacement) so the later
# dense bigram matrix stays tractable. The seed is fixed so that a fresh
# "Restart & Run All" draws the same subset and reproduces the same metrics.
# Note: this rebinds dbpedia_df, so re-running the cell re-samples from the
# already reduced frame; re-run the load cell first to start from the full data.
dbpedia_df = dbpedia_df.sample(10000, replace=False, random_state=42)

In [7]:
# Show ten randomly chosen rows of the subset as a quick sanity check.
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
427520,11,Guzmania remyi,Guzmania remyi is a species of plant in the B...
522145,14,Villa Aurore,Villa Aurore is a novel written in French by ...
462552,12,Monk's Casino,Monk's Casino is a live album by German free ...
2460,1,Kiss Technology,Kiss Technology is an entertainment technolog...
83743,3,S.M. Zakir,S.M. Zakir (born 4 February 1969 in Kota Bhar...
554887,14,New York Law Journal,The New York Law Journal founded in 1888 is a...
270857,7,Latham United Methodist Church,Latham United Methodist Church is a historic ...
323989,9,Nalavadi,Nalavadi is a village in Dharwad district in ...
193352,5,Richard Ottley (judge),Sir Richard Ottley was the 5th Chief Justice ...
78104,2,Umatilla High School (Oregon),Umatilla High School is a public high school ...


In [8]:
# Confirm the subset size after sampling.
dbpedia_df.shape

(10000, 3)

In [9]:
# Features come from the Text column; targets are the integer ids in Label.
X, Y = dbpedia_df['Text'], dbpedia_df['Label']

In [10]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    Parameters
    ----------
    y_test : true labels of the evaluation samples.
    y_pred : labels predicted for the same samples.

    The report contains the number of evaluation samples, the count and the
    fraction of correct predictions, and the precision and recall averaged
    with ``average='weighted'``. Nothing is returned.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    count_correct = accuracy_score(y_test, y_pred, normalize=False)
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    weighted_recall = recall_score(y_test, y_pred, average='weighted')

    # One (caption, value) pair per printed line, in display order.
    report = (
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", count_correct),
        ("accuracy_score : ", fraction_correct),
        ("precision_score : ", weighted_precision),
        ("recall_score : ", weighted_recall),
    )
    for caption, value in report:
        print(caption, value)

In [13]:
# Count only two-token n-grams (ngram_range=(2, 2)), i.e. bigrams.
count_vectorizer = CountVectorizer(ngram_range=(2, 2))

# Learn the bigram vocabulary from X and build the document-term count matrix.
feature_vector = count_vectorizer.fit_transform(X)

# (documents, distinct bigrams in the vocabulary)
feature_vector.shape

(10000, 217709)

In [14]:
# Densify the count matrix for the classifier below.
# .toarray() returns a plain numpy.ndarray; .todense() returns a numpy.matrix,
# which recent scikit-learn versions refuse in input validation (TypeError).
# NOTE(review): at the shape shown above (10000 x 217709) the dense array is
# very large -- presumably ~17 GB if the counts are 64-bit integers; confirm
# the machine has the memory or reduce the vocabulary before densifying.
X_dense = feature_vector.toarray()

In [15]:
# The dense matrix keeps the same (documents, bigrams) shape.
X_dense.shape

(10000, 217709)

In [16]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the same rows
# land in each split on every run, making the reported metrics reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_dense, Y, test_size=0.2, random_state=42
)

In [17]:
# Build a Gaussian naive Bayes classifier, then train it on the training split.
clf = GaussianNB()
# fit() trains the estimator in place; the trailing ';' keeps the cell silent,
# matching the original assignment-only cell.
clf.fit(x_train, y_train);

In [18]:
# Predict a label for every held-out row; the bare name displays the array.
y_pred = clf.predict(x_test)
y_pred

array([2, 1, 5, ..., 4, 7, 8])

In [19]:
# Print accuracy, weighted precision and weighted recall on the held-out split.
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1812
accuracy_score :  0.906
precision_score :  0.9071432275572098
recall_score :  0.906
