In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

from sklearn.naive_bayes import GaussianNB

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [4]:
# Load the DBpedia training CSV, naming its three columns explicitly.
# NOTE(review): skiprows=1 discards the first line of the file. The recorded
# shape below is 559999 rows; if the file has no header row, this drops one
# real record -- confirm against the raw file.
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [5]:
# Dimensions of the loaded frame as (rows, columns).
dbpedia_df.shape

(559999, 3)

In [6]:
# Distinct values present in the Label column.
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [7]:
# Keep a random 10,000-row subset, drawn without replacement.
# The fixed random_state makes the subset -- and every count, vocabulary size
# and score computed from it -- identical on each Restart & Run All.
dbpedia_df = dbpedia_df.sample(10000, replace=False, random_state=42)

In [8]:
# Show 10 randomly drawn rows as a quick look at the data.
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
293816,8,Sheep Mountain (Flathead County Montana),Sheep Mountain (8530 feet (2600 m)) is locate...
379389,10,Argyropelecus hemigymnus,Argyropelecus hemigymnus the half-naked hatch...
553602,14,Double Identity (novel),Double Identity is a 2005 young adult novel b...
104503,3,Picaflor de los Andes,Víctor Alberto Gil Mallma (1930; Huancayo - J...
338288,9,Jerry City Ohio,Jerry City is a village in Wood County Ohio U...
244140,7,John Hosford House,The John Hosford House built in 1860 is an hi...
182565,5,Hunt Downer,Major General Huntington Blair Downer Jr. kno...
252037,7,Brea City Hall and Park,Brea City Hall and Park in Brea California wa...
98721,3,Christian Jacob (musician),Christian Jacob is a lyrical jazz pianist. He...
188815,5,George Forbes (New Zealand politician),George William Forbes (12 March 1869 – 17 May...


In [9]:
# Dimensions of the frame after it was reduced to the sampled subset.
dbpedia_df.shape

(10000, 3)

In [10]:
# Split the frame into the model input (the Text column) and the
# classification target (the Label column).
X, Y = dbpedia_df['Text'], dbpedia_df['Label']

In [11]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report comparing predictions with true labels.

    Reports the number of test samples, the count and fraction of correct
    predictions, and the weighted-average precision and recall.

    Parameters:
        y_test: true labels of the evaluation samples.
        y_pred: predicted labels, in the same order as ``y_test``.

    Returns:
        None. The report is written to standard output.
    """
    # Compute every metric before printing anything, so that a metric which
    # raises leaves no partially printed report behind.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    weighted_recall = recall_score(y_test, y_pred, average='weighted')

    report_rows = (
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", n_correct),
        ("accuracy_score : ", fraction_correct),
        ("precision_score : ", weighted_precision),
        ("recall_score : ", weighted_recall),
    )
    for caption, value in report_rows:
        print(caption, value)

In [12]:
# Join every text into one newline-separated string and split it into tokens
# with NLTK's word tokenizer.
tokens = word_tokenize("\n".join(X.values))

# Total number of tokens produced.
len(tokens)

503033

In [13]:
# Count how many times each distinct token occurs, then display the counts.
freq = FreqDist(tokens)
freq

FreqDist({'the': 23584, '.': 23365, 'in': 16237, 'of': 15558, 'is': 13472, 'a': 13075, 'and': 12741, '(': 7078, ')': 7048, 'was': 6816, ...})

In [14]:
# Collect, in lower case, every token that occurs at least 100 times.
# Counts are taken on the original-case tokens, so the same lower-cased word
# can appear more than once in this list.
frequent_words = [token.lower() for token, count in freq.items() if count >= 100]

len(frequent_words)

491

In [15]:
# First 25 entries of the frequent-word list.
frequent_words[:25]

['columbia',
 '(',
 ')',
 'was',
 'an',
 'american',
 'company',
 'that',
 'from',
 '1994',
 'to',
 '2002',
 '.',
 'it',
 'operated',
 'as',
 'the',
 'third',
 'name',
 'of',
 'early',
 'studio',
 'and',
 'part',
 'second']

In [16]:
# Combine scikit-learn's built-in English stop words with the corpus-specific
# frequent words. The union of a frozenset is itself a frozenset, but
# CountVectorizer's stop_words parameter is validated as 'english', a list, or
# None in recent scikit-learn releases, so materialize a list. Sorting also
# removes duplicates' ordering ambiguity and keeps the value stable between runs.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(frequent_words))

In [17]:
# Bag-of-words vectorizer configured with the custom stop-word collection.
# NOTE(review): the recorded output contains a warning fragment mentioning
# 'stop_words' and 'inconsistent' -- presumably some supplied stop words do
# not match what the vectorizer's own tokenization produces; confirm.
count_vectorizer = CountVectorizer(stop_words=stop_words)

# Learn the vocabulary from the texts and encode each text as a row of counts.
feature_vector = count_vectorizer.fit_transform(X)

# (number of texts, vocabulary size)
feature_vector.shape

  'stop_words.' % sorted(inconsistent))


(10000, 47434)

In [18]:
# Materialize the sparse count matrix as a dense 2-D NumPy array.
# .toarray() returns an ndarray, whereas .todense() returns the legacy
# np.matrix type, which recent scikit-learn estimators reject with a TypeError.
# The dense copy holds every (text, vocabulary term) cell, so it is large.
X_dense = feature_vector.toarray()

In [19]:
# Dimensions of the dense feature matrix as (rows, columns).
X_dense.shape

(10000, 47434)

In [20]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# partition -- and therefore the reported metrics -- reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_dense, Y, test_size=0.2, random_state=42
)

In [21]:
# Fit a Gaussian naive Bayes classifier on the training split.
clf = GaussianNB().fit(x_train, y_train)

In [22]:
# Predict a label for every held-out row, then display the predictions.
y_pred = clf.predict(x_test)
y_pred

array([13,  9, 13, ...,  7, 11,  5])

In [23]:
# Print the evaluation report for the held-out predictions.
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1397
accuracy_score :  0.6985
precision_score :  0.7009864007615241
recall_score :  0.6985
