In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB

In [2]:
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [3]:
dbpedia_df.shape

(559999, 3)

In [4]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [6]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
510378,13,Unknown (2006 film),Unknown is a 2006 American crime-thriller fil...
510665,13,World Without Sun,World Without Sun (French: Le Monde sans sole...
392655,10,Argyresthia pseudotsuga,Argyresthia pseudotsuga is a moth of the Ypon...
419827,11,Erythronium multiscapoideum,Erythronium multiscapoideum is a species of f...
217867,6,Simca 5,The Simca 5 is a small Franco-Italian passeng...
377408,10,Bucculatrix zophopasta,Bucculatrix zophopasta is a moth in the Buccu...
232485,6,HMS Ajax (F114),HMS Ajax was a Leander-class frigate of the R...
327195,9,Ruś Ostróda County,Ruś [ruɕ] (German Reussen) is a village in th...
478293,12,Mortal Kombat: The Album,Mortal Kombat: The Album is an album by The I...
533360,14,Makers (novel),Makers is a novel by Cory Doctorow. It was re...


In [7]:
dbpedia_df.shape

(10000, 3)

In [8]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [10]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [20]:
count_vectorizer = CountVectorizer(min_df=0, max_df=80)

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 47544)

In [21]:
tfidf_transformer = TfidfTransformer()

feature_vector = tfidf_transformer.fit_transform(feature_vector)

feature_vector.shape

(10000, 47544)

In [22]:
X_dense = feature_vector.todense()

In [23]:
X_dense.shape

(10000, 47544)

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [25]:
clf = GaussianNB().fit(x_train, y_train)

In [26]:
y_pred = clf.predict(x_test)
y_pred

array([10,  4,  1, ..., 13,  6,  2])

In [27]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1316
accuracy_score :  0.658
precision_score :  0.6650727642789968
recall_score :  0.658
