In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB

In [3]:
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [4]:
dbpedia_df.shape

(559999, 3)

In [5]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [6]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [7]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
445879,12,Lucas (album),Lucas is the second album by Ghostly Internat...
120854,4,Pat McGehee,Patrick Henry McGehee (July 2 1888 – December...
449608,12,One Too Many Hearts,One Too Many Hearts is the third EP by Americ...
171512,5,Arken Arystanov,Arkén Kenesbékovich Arystánov (Kazakh: Аркен ...
26868,1,Cineplex Odeon Films,Cineplex Odeon Films (also known as Cineplex ...
402498,11,Steirachne,Steirachne is a genus of grass in the Poaceae...
102353,3,Harry Thumann,Harry Thumann (28 February 1952 – 2001) was a...
337385,9,Chah Sheykh,Chah Sheykh (Persian: چاه شيخ‎ also Romanized...
485176,13,Emmtan-Magan,Em Magan is a 2006 Tamil drama film directed ...
295129,8,Hot Springs Range,The Hot Springs Range is a mountain range in ...


In [8]:
dbpedia_df.shape

(10000, 3)

In [9]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [11]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [12]:
count_vectorizer = CountVectorizer()

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 48318)

In [14]:
tfidf_transformer = TfidfTransformer()

feature_vector = tfidf_transformer.fit_transform(feature_vector)

feature_vector.shape

(10000, 48318)

In [15]:
X_dense = feature_vector.todense()

In [16]:
X_dense.shape

(10000, 48318)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [18]:
clf = GaussianNB().fit(x_train, y_train)

In [19]:
y_pred = clf.predict(x_test)
y_pred

array([ 1,  1,  3, ..., 12, 10,  5])

In [20]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1438
accuracy_score :  0.719
precision_score :  0.7284791775245764
recall_score :  0.719
