In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text 

from sklearn.naive_bayes import GaussianNB

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [2]:
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [3]:
dbpedia_df.shape

(559999, 3)

In [4]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [6]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
179975,5,Ramreddy Damodar Reddy,Ramreddy Damodar Reddy (Telugu: రాంరెడ్డి దామ...
558672,14,Sonora Review,Sonora Review is a biannual graduate student-...
503492,13,Luz en el páramo,Luz en el páramo is a 1953 Venezuelan film di...
40980,2,Oswaldo Cruz Foundation,The Oswaldo Cruz Foundation (Portuguese Funda...
199065,5,Tate Reeves,Jonathon Tate Reeves (born June 5 1974) a Rep...
20373,1,Indie Boyz,Indie Boyz is a European company based in Lon...
131192,4,Hanna Mazgunova,Hanna Mazgunova (Belarusian: Ганна Мазгунова;...
438389,11,Platystemon,Platystemon is a monotypic genus of flowering...
295233,8,Paddys River (South West Slopes New South Wales),Paddys River a watercourse of the Murray catc...
481401,13,The Ragged Edge (film),The Ragged Edge is a lost 1923 silent film So...


In [7]:
dbpedia_df.shape

(10000, 3)

In [8]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [9]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [10]:
tokens = word_tokenize("\n".join(X.values))

len(tokens)

504495

In [11]:
freq = FreqDist(tokens)
freq

FreqDist({'the': 23494, '.': 23437, 'in': 16379, 'of': 15332, 'is': 13499, 'and': 12934, 'a': 12862, '(': 7204, ')': 7199, 'was': 6950, ...})

In [12]:
frequent_words = []

for key, value in freq.items():
    if value >= 100:
        frequent_words.append(key.lower())
        
len(frequent_words)

487

In [13]:
frequent_words[:25]

['(',
 ':',
 '[',
 ']',
 '.',
 ')',
 'is',
 'a',
 'lake',
 'in',
 'located',
 'north',
 'of',
 'the',
 'town',
 'district',
 'not',
 '1',
 'the',
 'area',
 'by',
 'including',
 '2001',
 'was',
 'international']

In [15]:
stop_words = text.ENGLISH_STOP_WORDS.union(frequent_words)

In [19]:
count_vectorizer = CountVectorizer(stop_words=stop_words)

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

(10000, 47573)

In [20]:
tfidf_transformer = TfidfTransformer()

feature_vector = tfidf_transformer.fit_transform(feature_vector)

feature_vector.shape

(10000, 47573)

In [21]:
X_dense = feature_vector.todense()

In [22]:
X_dense.shape

(10000, 47573)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [24]:
clf = GaussianNB().fit(x_train, y_train)

In [25]:
y_pred = clf.predict(x_test)
y_pred

array([ 7,  7,  1, ..., 13, 14, 11])

In [26]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1335
accuracy_score :  0.6675
precision_score :  0.6803619205404001
recall_score :  0.6675
