In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB

In [4]:
# Load the DBpedia training CSV under explicit column names.
# skiprows=1 discards the file's first physical line.
# NOTE(review): if train.csv ships without a header row, this drops one real
# record — confirm against the raw file.
dbpedia_df = pd.read_csv(
    './datasets/dbpedia_csv/train.csv',
    header=None,
    skiprows=1,
    names=['Label', 'Name', 'Text'],
)

In [5]:
# Row and column counts of the freshly loaded frame.
dbpedia_df.shape

(559999, 3)

In [6]:
# Distinct values found in the Label column.
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [7]:
# Continue with a 10,000-row subset drawn without replacement.
# A fixed random_state makes the subset — and every number computed from it
# further down — identical across Restart & Run All.
dbpedia_df = dbpedia_df.sample(10000, replace=False, random_state=42)

In [8]:
# Peek at ten randomly chosen rows of the subset (unseeded, so the rows differ per run).
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
252405,7,Prowers Bridge,The Prowers Bridge over the Arkansas River ne...
504457,13,Que sera sera (film),Que sera sera (Portuguese: Seja o que Deus Qu...
270494,7,The Esplanade (Kenner Louisiana),The Esplanade also known as the Esplanade Mal...
546058,14,The 1974 Annual World's Best SF,The 1974 Annual World's Best SF is an antholo...
461279,12,Just Go (album),Just Go is the ninth studio album by American...
277143,7,Steyning Methodist Church,Steyning Methodist Church is a Methodist plac...
266867,7,Smith Estate (Ridge New York),Smith Estate also known as Longwood Estate - ...
283712,8,Yr Eifl,Yr Eifl is a mountain on the north coast of t...
33861,1,IBM India,IBM India Private Limited is the Indian subsi...
70953,2,Loreto College of Rose-Hill,Loreto College Rose Hill is a private seconda...


In [9]:
# Shape after subsampling.
dbpedia_df.shape

(10000, 3)

In [10]:
# Features are the raw 'Text' column; targets are the 'Label' column.
X, Y = dbpedia_df['Text'], dbpedia_df['Label']

In [12]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    Parameters
    ----------
    y_test : sequence
        True labels of the evaluation samples.
    y_pred : sequence
        Predicted labels, aligned element-for-element with ``y_test``.

    Prints five lines: the number of evaluation samples, the count of
    correct predictions, the fraction of correct predictions, and the
    precision and recall computed with ``average='weighted'``.
    Returns ``None``.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    weighted_recall = recall_score(y_test, y_pred, average='weighted')

    # (label, value) pairs in display order; print() inserts the separating space.
    report = (
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", n_correct),
        ("accuracy_score : ", fraction_correct),
        ("precision_score : ", weighted_precision),
        ("recall_score : ", weighted_recall),
    )
    for label, value in report:
        print(label, value)

In [13]:
# Build a TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on X and transform X in a single step.
# NOTE(review): X is the whole sample here, so the documents that
# train_test_split later holds out also influence the fitted weights —
# consider fitting on the training split only.
feature_vector = tfidf_vectorizer.fit_transform(X)

# Dimensions of the resulting document-term matrix.
feature_vector.shape

(10000, 48401)

In [14]:
# Materialize the TF-IDF features as a dense array for GaussianNB.
# toarray() yields a plain ndarray; todense() yields np.matrix, which recent
# scikit-learn releases reject with a TypeError when passed to an estimator.
X_dense = feature_vector.toarray()

In [15]:
# The dense copy should have the same dimensions as feature_vector.
X_dense.shape

(10000, 48401)

In [16]:
# Hold out 20% of the documents for evaluation.
# random_state pins the shuffle so the split — and the metrics reported
# from it — are reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_dense, Y, test_size=0.2, random_state=42
)

In [17]:
# Create a GaussianNB classifier with default parameters and fit it on the training split.
clf = GaussianNB().fit(x_train, y_train)

In [18]:
# Predict a label for every held-out row; the bare name displays the array.
y_pred = clf.predict(x_test)
y_pred

array([10,  7,  7, ...,  7,  4, 12])

In [19]:
# Print sample count, accuracy, and weighted precision/recall for the held-out predictions.
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1398
accuracy_score :  0.699
precision_score :  0.7100351307095724
recall_score :  0.699
