Dataset link : https://github.com/pyk/dbpedia_csv

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import GaussianNB

In [3]:
dbpedia_df = pd.read_csv('./datasets/dbpedia_csv/train.csv', 
                       skiprows=1, names = ['Label', 'Name', 'Text'])

In [4]:
dbpedia_df.shape

(559999, 3)

### DBPedia classes

- Company
- EducationalInstitution
- Artist
- Athlete
- OfficeHolder
- MeanOfTransportation
- Building
- NaturalPlace
- Village
- Animal
- Plant
- Album
- Film
- WrittenWork

In [5]:
dbpedia_df['Label'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [6]:
dbpedia_df = dbpedia_df.sample(10000, replace=False)

In [7]:
dbpedia_df.sample(10)

Unnamed: 0,Label,Name,Text
494483,13,A Wedding Suit,A Wedding Suit (Persian: لباسی برای عروسی‎ Le...
548491,14,Into Thin Air,Into Thin Air: A Personal Account of the Mt. ...
339369,9,Banavchan,Banavchan (Persian: بناوچان‎ also Romanized a...
295828,8,Icalma Lake,Icalma Lake is a lake located in the Andes of...
416552,11,Pouteria sandwicensis,Pouteria sandwicensis is a species of floweri...
503849,13,Pepo (film),Pepo (Armenian: Պեպո) is a 1935 Soviet film d...
364384,10,Puebla deer mouse,The Puebla deer mouse (Peromyscus mekisturus)...
265465,7,Waleffe Castle,Waleffe Castle is a castle in Belgium.
528364,14,Through the Arc of the Rain Forest,Through the Arc of the Rain Forest is a novel...
423816,11,Guarianthe,Guarianthe abbreviated Gur. in the horticultu...


In [8]:
dbpedia_df.shape

(10000, 3)

In [9]:
X = dbpedia_df['Text']

Y = dbpedia_df['Label']

In [10]:
X.head()

187313     Bruce Cameron Young is an Australian Liberal ...
94935      Daniel Douglas Orlando (born May 29 1981) als...
93329      John Hultberg (February 8 1922 – April 15 200...
530642     The Oxford University Commonwealth Law Journa...
361874     Stensioella heintzi (Heintz's Little Stensio)...
Name: Text, dtype: object

In [11]:
Y.head()

187313     5
94935      3
93329      3
530642    14
361874    10
Name: Label, dtype: int64

In [12]:
def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print("Length of testing data: ", len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)

In [13]:
vectorizer = HashingVectorizer(n_features=2**10, norm='l2')

feature_vector = vectorizer.transform(X)

feature_vector.shape

(10000, 1024)

In [14]:
print(feature_vector[0])

  (0, 39)	0.15811388300841897
  (0, 53)	-0.15811388300841897
  (0, 54)	0.15811388300841897
  (0, 61)	-0.15811388300841897
  (0, 152)	0.15811388300841897
  (0, 158)	-0.4743416490252569
  (0, 175)	-0.15811388300841897
  (0, 203)	-0.15811388300841897
  (0, 300)	0.31622776601683794
  (0, 308)	-0.15811388300841897
  (0, 311)	-0.15811388300841897
  (0, 352)	-0.15811388300841897
  (0, 362)	-0.15811388300841897
  (0, 363)	0.15811388300841897
  (0, 365)	0.31622776601683794
  (0, 399)	0.15811388300841897
  (0, 426)	0.15811388300841897
  (0, 470)	0.15811388300841897
  (0, 513)	-0.15811388300841897
  (0, 534)	0.15811388300841897
  (0, 540)	-0.15811388300841897
  (0, 663)	-0.15811388300841897
  (0, 665)	-0.15811388300841897
  (0, 819)	-0.15811388300841897
  (0, 1002)	-0.15811388300841897
  (0, 1011)	-0.15811388300841897


In [15]:
X_dense = feature_vector.todense()

In [16]:
X_dense.shape

(10000, 1024)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)

In [18]:
x_train.shape, x_test.shape

((8000, 1024), (2000, 1024))

In [19]:
y_train.shape, y_test.shape

((8000,), (2000,))

In [20]:
clf = GaussianNB().fit(x_train, y_train)

In [21]:
y_pred = clf.predict(x_test)
y_pred

array([ 7, 11, 13, ...,  5,  6,  7])

In [22]:
summarize_classification(y_test, y_pred)

Length of testing data:  2000
accuracy_count :  1153
accuracy_score :  0.5765
precision_score :  0.5891110304184962
recall_score :  0.5765


In [23]:
y_test.head()

267097     7
364326    10
509624    13
466990    12
145918     4
Name: Label, dtype: int64

In [24]:
y_test = np.array(y_test)

In [25]:
pred_results = pd.DataFrame({'y_test': pd.Series(y_test),
                             'y_pred': pd.Series(y_pred)})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
1486,1,1
494,9,8
1039,3,3
985,9,9
711,8,10
532,1,1
1249,11,10
1746,8,11
766,1,7
1851,12,4
