In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import multiclass
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes

In [3]:
# Location of the news corpus; kept in one place so it is easy to change.
DATA_PATH = 'vesti.csv'
# Load the corpus into a DataFrame (later cells read its 'tekst' and
# 'kategorija' columns).
news = pd.read_csv(DATA_PATH)

In [4]:
# Feature column: the text of each news item. Missing entries are replaced
# with an empty string, because the later `astype('U')` cast would otherwise
# turn NaN into the literal string 'nan' and feed it to the vectorizer as a
# real token.
X = news['tekst'].fillna('')
# Target column: the category label of each news item.
y = news['kategorija']

### TF-IDF vektorizacija
Izvršićemo vektorizaciju tekstualnih podataka pomoću TF-IDF reprezentacije:

In [5]:
# Learn the TF-IDF vocabulary and IDF weights from the text column.
# The cast to a NumPy unicode array makes sure every entry is a string.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed later, so the IDF statistics also see the test
# documents. Consider fitting on the training portion only.
corpus = X.values.astype('U')
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer()
tfidf_vectorizer.fit(corpus)

TfidfVectorizer()

In [6]:
# Number of distinct terms in the vocabulary learned by the fitted vectorizer.
len(tfidf_vectorizer.vocabulary_)

309627

In [7]:
# Turn every document into its TF-IDF row using the already fitted vocabulary.
documents = X.values.astype('U')
X_transformed = tfidf_vectorizer.transform(documents)

In [8]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
X_transformed.shape

(165699, 309627)

### Kodiranje klasa
Izvršićemo kodiranje 37 klasa brojevima, a mapiranje ćemo sačuvati u rečniku:

In [9]:
# Convert the label column to pandas' categorical dtype so that every distinct
# category is backed by an integer code.
y_transformed = y.astype('category')

In [10]:
# Mapping {integer code -> category name}.
# It is built directly from `y` instead of `y_transformed`, so the cell keeps
# working when it is re-run after `y_transformed` has been overwritten with
# plain integer codes (which no longer expose the `.cat` accessor).
# NOTE(review): the displayed mapping contains near-duplicate labels
# ('PARENT' / 'PARENTS', 'STYLE & BEAUTY' / 'STYLE AND BEAUTY'); consider
# merging them before training.
classes = dict(enumerate(y.astype('category').cat.categories))

In [11]:
# Display the code -> category-name mapping.
classes

{0: 'ARTS & CULTURE',
 1: 'BLACK VOICES',
 2: 'BUSINESS',
 3: 'COLLEGE',
 4: 'COMEDY',
 5: 'CRIME',
 6: 'DIVORCE',
 7: 'EDUCATION',
 8: 'ENTERTAINMENT',
 9: 'ENVIRONMENT',
 10: 'FIFTY',
 11: 'FOOD & DRINK',
 12: 'GOOD NEWS',
 13: 'GREEN',
 14: 'HEALTHY LIVING',
 15: 'HOME & LIVING',
 16: 'IMPACT',
 17: 'LATINO VOICES',
 18: 'MEDIA',
 19: 'MONEY',
 20: 'PARENT',
 21: 'PARENTS',
 22: 'POLITICS',
 23: 'QUEER VOICES',
 24: 'RELIGION',
 25: 'SCIENCE',
 26: 'SPORTS',
 27: 'STYLE & BEAUTY',
 28: 'STYLE AND BEAUTY',
 29: 'TASTE',
 30: 'TECH',
 31: 'TRAVEL',
 32: 'WEDDINGS',
 33: 'WEIRD NEWS',
 34: 'WELLNESS',
 35: 'WOMEN',
 36: 'WORLD NEWS'}

In [12]:
# Integer class codes used as the classification target.
# Derived from `y` rather than from `y_transformed` itself: the original
# self-referential assignment could only be executed once, because after the
# first run `y_transformed` is an integer Series without the `.cat` accessor.
# The codes refer to the same categories as the `classes` mapping.
y_transformed = y.astype('category').cat.codes

In [13]:
# Display the encoded target (one integer class code per document).
y_transformed

0         35
1         14
2          2
3          6
4          6
          ..
165694     8
165695    20
165696     0
165697    19
165698     8
Length: 165699, dtype: int8

### Podela podataka
Nakon vektorizacije i kodiranja klasa, uradićemo podelu podataka na trening i test skupove u odnosu 2:1 (test_size = 0.33), stratifikovano po klasama:

In [14]:
# Stratified hold-out split: 33% of the documents go to the test set and the
# class proportions are preserved in both parts. The fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_transformed,
    y_transformed,
    test_size=0.33,
    stratify=y_transformed,
    random_state=4,
)

### Višeklasna klasifikacija
Klasifikator koji ćemo koristiti je linearni SVM (<i>support vector machine</i>) sa parametrom <b>multi_class</b> postavljenim na <b>'ovr'</b> (jedan-protiv-svih), kako bismo ga koristili kao višeklasni klasifikator.

In [15]:
# Linear SVM with the hinge loss, trained one-vs-rest over the classes.
# The hinge loss is solved in the dual, where the solver draws on a random
# number generator; fixing `random_state` makes the fitted model (and the
# scores reported afterwards) reproducible between runs.
SVM = svm.LinearSVC(multi_class = 'ovr', loss = 'hinge', random_state = 4)

In [16]:
# Train the linear SVM on the training split.
SVM.fit(X_train, y_train)

LinearSVC(loss='hinge')

In [16]:
# Mean accuracy of the linear SVM on the held-out test split.
SVM.score(X_test, y_test)

0.7189883140396116

In [17]:
# Per-class precision / recall / F1 of the linear SVM on the test split,
# with rows labelled by category name instead of integer code.
svm_report = metrics.classification_report(
    y_test,
    SVM.predict(X_test),
    target_names=classes.values(),
)
print(svm_report)

                  precision    recall  f1-score   support

  ARTS & CULTURE       0.76      0.69      0.72      1033
    BLACK VOICES       0.70      0.53      0.61      1364
        BUSINESS       0.63      0.54      0.58      1596
         COLLEGE       0.60      0.50      0.54       307
          COMEDY       0.70      0.66      0.68      1478
           CRIME       0.61      0.60      0.60      1048
         DIVORCE       0.81      0.79      0.80       880
       EDUCATION       0.55      0.46      0.50       236
   ENTERTAINMENT       0.76      0.82      0.79      4936
     ENVIRONMENT       0.57      0.31      0.41       348
           FIFTY       0.71      0.55      0.62       376
    FOOD & DRINK       0.73      0.80      0.76      1345
       GOOD NEWS       0.69      0.50      0.58       424
           GREEN       0.54      0.41      0.47       725
  HEALTHY LIVING       0.65      0.41      0.50      1733
   HOME & LIVING       0.86      0.88      0.87      1144
          IMP

In [18]:
# Confusion matrix of the linear SVM on the test split
# (rows: true class, columns: predicted class).
svm_confusion = metrics.confusion_matrix(y_test, SVM.predict(X_test))
print(svm_confusion)

[[ 713    7    6 ...   17    2   19]
 [   8  729    9 ...   17   11   11]
 [   8    4  864 ...   76    8   49]
 ...
 [   7    5   44 ... 3272   13   17]
 [   7   15   23 ...   60  395   14]
 [  11    6   24 ...   21    9 1861]]


Sada ćemo probati OneVsRest (jedan-protiv-svih) višeklasnu klasifikaciju sa logističkom regresijom:

In [20]:
# Binary base learner: logistic regression with an iteration cap of 1000.
log_reg = linear_model.LogisticRegression(max_iter=1000)
# One-vs-rest wrapper that trains one copy of the base learner per class.
ovr = multiclass.OneVsRestClassifier(log_reg)

In [21]:
# Train the one-vs-rest logistic regression on the training split.
ovr.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000))

In [22]:
# Per-class precision / recall / F1 of the one-vs-rest logistic regression
# on the test split, with rows labelled by category name.
ovr_report = metrics.classification_report(
    y_test,
    ovr.predict(X_test),
    target_names=classes.values(),
)
print(ovr_report)

                  precision    recall  f1-score   support

  ARTS & CULTURE       0.81      0.60      0.69      1033
    BLACK VOICES       0.71      0.44      0.54      1364
        BUSINESS       0.60      0.50      0.55      1596
         COLLEGE       0.63      0.35      0.45       307
          COMEDY       0.72      0.59      0.65      1478
           CRIME       0.64      0.53      0.58      1048
         DIVORCE       0.87      0.67      0.76       880
       EDUCATION       0.59      0.29      0.39       236
   ENTERTAINMENT       0.65      0.86      0.74      4936
     ENVIRONMENT       0.71      0.12      0.21       348
           FIFTY       0.89      0.24      0.38       376
    FOOD & DRINK       0.72      0.77      0.74      1345
       GOOD NEWS       0.81      0.38      0.52       424
           GREEN       0.53      0.30      0.38       725
  HEALTHY LIVING       0.67      0.35      0.46      1733
   HOME & LIVING       0.88      0.81      0.84      1144
          IMP

In [24]:
# Confusion matrix of the one-vs-rest logistic regression on the test split
# (rows: true class, columns: predicted class).
ovr_confusion = metrics.confusion_matrix(y_test, ovr.predict(X_test))
print(ovr_confusion)

[[ 620    7    1 ...   30    4   12]
 [   6  601    6 ...   33   10    8]
 [   5    7  801 ...  138   14   54]
 ...
 [   3    3   38 ... 3381    8   23]
 [   2    9   21 ...   95  351    8]
 [   8    4   19 ...   33   19 1756]]
