In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import HashingVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the DBpedia training CSV, naming its three columns explicitly.
# NOTE(review): skiprows=1 drops the first line of the file. The recorded shape
# is (559999, 3); if train.csv has no header row, a real record is being
# discarded here -- confirm against the file.
dbpedia_df = pd.read_csv(
    './datasets/dbpedia_csv/train.csv',
    names=['Label', 'Name', 'Text'],
    skiprows=1,
)

In [3]:
# Size of the loaded frame as (rows, columns).
dbpedia_df.shape

(559999, 3)

### DBpedia classes

The `Label` column holds integers 1–14, corresponding to the classes below in the order listed:

- Company
- EducationalInstitution
- Artist
- Athlete
- OfficeHolder
- MeanOfTransportation
- Building
- NaturalPlace
- Village
- Animal
- Plant
- Album
- Film
- WrittenWork

In [4]:
# Distinct class ids that occur in the Label column.
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [5]:
# Down-sample to 10,000 rows drawn without replacement.
# A fixed random_state makes the subset identical on every run, so the cells
# that follow can be reproduced after a kernel restart.
dbpedia_df = dbpedia_df.sample(n=10000, replace=False, random_state=42)

In [6]:
# Show 10 randomly chosen rows of the current frame.
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
328754,9,Sadabad-e Sofla,Sadabad-e Sofla (Persian: سعدابادسفلي‎ also R...
268204,7,Westgate Mall (Spartanburg),Westgate Mall is a shopping mall in Spartanbu...
53524,2,North Rockland High School,North Rockland High School is a high school l...
435646,11,Tillandsia diguetii,Tillandsia diguetii is a species of the genus...
254752,7,Palais Wilczek,Palais Wilczek is a palace in Vienna Austria....
29074,1,Canfor,Canfor Corporation is a Canadian integrated f...
155059,4,Gerry Rioux,Gerard Rioux (born February 17 1959) is a Can...
137451,4,Jerry DePoyster,Jerry Dean DePoyster (born July 6 1946 in Oma...
363022,10,Pieris (butterfly),Pieris the whites or garden whites is a wides...
499846,13,Gunah Aur Kanoon,Gunah Aur Kanoon is a 1970 Bollywood drama fi...


In [7]:
# Frame size as (rows, columns) after down-sampling.
dbpedia_df.shape

(10000, 3)

In [8]:
# Model input is the Text column; the target is the integer Label column.
X, Y = dbpedia_df['Text'], dbpedia_df['Label']

In [9]:
# Preview the first few text samples.
X.head()

346354     Hoseynabad-e Sarzeh (Persian: حسين ابادسرزه‎ ...
198538     Milton Berkes (born September 29 1924) is a f...
550592     R.M. Williams Outback (or simply Outback) is ...
387637     Metasia carnealis is a species of moth in the...
366618     The Father Basilio's Striped Mouse or Bioko H...
Name: Text, dtype: object

In [10]:
# Preview the first few labels (same index as the X preview).
Y.head()

346354     9
198538     5
550592    14
387637    10
366618    10
Name: Label, dtype: int64

In [11]:
def summarize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    Parameters
    ----------
    y_test : true labels; must support ``len()``.
    y_pred : predicted labels, compared against ``y_test``.

    The report lists the number of test samples, the count and fraction of
    correct predictions, and precision and recall computed with
    ``average='weighted'``. Nothing is returned.
    """
    # Evaluate every metric first, then emit the report in one pass.
    report = [
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", accuracy_score(y_test, y_pred, normalize=False)),
        ("accuracy_score : ", accuracy_score(y_test, y_pred, normalize=True)),
        ("precision_score : ", precision_score(y_test, y_pred, average='weighted')),
        ("recall_score : ", recall_score(y_test, y_pred, average='weighted')),
    ]

    for label, value in report:
        print(label, value)

In [13]:
# English Snowball stemmer, plus the analyzer built from a default-configured
# HashingVectorizer; stemmed_words chains the two.
stemmer = SnowballStemmer('english')
analyzer = HashingVectorizer().build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the stemmed tokens of ``doc``.

    ``doc`` is passed through the module-level ``analyzer`` and each resulting
    token is reduced with ``stemmer.stem``.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

In [24]:
# Hash the stemmed tokens of each document into 2**10 columns, with norm='l2'.
stem_vectorizer = HashingVectorizer(
    analyzer=stemmed_words,
    norm='l2',
    n_features=2 ** 10,
)

# transform() is called directly; no fit step is performed here.
feature_vector = stem_vectorizer.transform(X)

# Expect one row per document and 2**10 columns.
feature_vector.shape

(10000, 1024)

In [25]:
# Print the first row of the hashed feature matrix; the output lists each
# stored (row, column) position with its value.
print(feature_vector[0])

  (0, 21)	0.25607375986579195
  (0, 24)	-0.12803687993289598
  (0, 27)	0.12803687993289598
  (0, 61)	-0.12803687993289598
  (0, 62)	0.12803687993289598
  (0, 69)	-0.12803687993289598
  (0, 71)	-0.25607375986579195
  (0, 145)	-0.12803687993289598
  (0, 158)	-0.12803687993289598
  (0, 215)	0.12803687993289598
  (0, 273)	0.25607375986579195
  (0, 301)	-0.12803687993289598
  (0, 304)	0.12803687993289598
  (0, 355)	0.12803687993289598
  (0, 365)	0.12803687993289598
  (0, 424)	-0.12803687993289598
  (0, 540)	0.25607375986579195
  (0, 550)	-0.12803687993289598
  (0, 569)	0.12803687993289598
  (0, 595)	-0.3841106397986879
  (0, 643)	0.12803687993289598
  (0, 659)	0.12803687993289598
  (0, 697)	0.12803687993289598
  (0, 745)	-0.12803687993289598
  (0, 758)	-0.12803687993289598
  (0, 799)	0.12803687993289598
  (0, 832)	-0.12803687993289598
  (0, 877)	0.12803687993289598
  (0, 883)	0.12803687993289598
  (0, 884)	-0.12803687993289598
  (0, 885)	0.25607375986579195
  (0, 913)	0.12803687993289598
  

In [26]:
# GaussianNB needs dense input, so materialise the sparse hashed features.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which
# scikit-learn >= 1.2 rejects with a TypeError when the estimator is fitted.
X_dense = feature_vector.toarray()

In [27]:
# Shape of the dense feature matrix as (rows, columns).
X_dense.shape

(10000, 1024)

In [28]:
# Hold out 20% of the samples for evaluation.
# The fixed random_state pins the shuffle, so the same rows land in the
# train and test partitions on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X_dense, Y, test_size=0.2, random_state=42
)

In [29]:
# Shapes of the training and test feature matrices.
x_train.shape, x_test.shape

((8000, 1024), (2000, 1024))

In [30]:
# Shapes of the training and test label vectors.
y_train.shape, y_test.shape

((8000,), (2000,))

In [31]:
# Build the Gaussian naive Bayes classifier, then fit it on the training split.
# The trailing semicolon keeps the notebook from echoing the estimator repr.
clf = GaussianNB()
clf.fit(x_train, y_train);

In [32]:
# Predict a class label for every row of the held-out test features.
y_pred = clf.predict(x_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred

array([ 3,  3, 11, ...,  2,  2,  4])

In [33]:
# Print accuracy, weighted precision and weighted recall for the test split.
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1120
accuracy_score :  0.56
precision_score :  0.5663465185830068
recall_score :  0.56
