In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB

In [2]:
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [3]:
dbpedia_df.shape

(559999, 3)

In [4]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [6]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
521705,14,Woodstock (novel),Woodstock or The Cavalier. A Tale of the Year...
373693,10,Cryptophagidae,Cryptophagidae is a family of beetles with re...
333934,9,Dehliz-e Yek,Dehliz-e Yek (Persian: دهليزيك‎ also Romanize...
125305,4,Roland Osabutey,Roland Osabutey (born 11 March 1980) is a Gha...
100866,3,Ann Voskamp,Ann Voskamp (born August 10 1973 in Listowel ...
58320,2,Robert Bateman High School,Robert Bateman High School (also known as Rob...
419470,11,Vriesea languida,Vriesea languida is a species of the genus Vr...
164430,5,Gilbert Wellington Ostrom,Gilbert Wellington Ostrom (June 1837 – Decemb...
426098,11,Banara regia,Banara regia is a species of plant in the Sal...
281236,8,Calder River (Western Australia),For other Rivers Calder see River Calder (dis...


In [7]:
dbpedia_df.shape

(10000, 3)

In [8]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [9]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [10]:
count_vectorizer = CountVectorizer(min_df=0, max_df=80, ngram_range=(2, 2))

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 217434)

In [11]:
tfidf_transformer = TfidfTransformer()

feature_vector = tfidf_transformer.fit_transform(feature_vector)

feature_vector.shape

(10000, 217434)

In [12]:
X_dense = feature_vector.todense()

In [13]:
X_dense.shape

(10000, 217434)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [15]:
clf = GaussianNB().fit(x_train, y_train)

In [16]:
y_pred = clf.predict(x_test)
y_pred

array([ 7, 12,  6, ...,  1, 12,  3])

In [17]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1778
accuracy_score :  0.889
precision_score :  0.8902761517734769
recall_score :  0.889
