In [2]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import multiclass
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import decomposition
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPool1D, GlobalMaxPool1D, Activation, SpatialDropout1D, LSTM, SimpleRNN, GRU
from keras.optimizers import Adam, SGD, RMSprop
from keras import losses, optimizers
from keras import preprocessing
from keras.utils import to_categorical
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy

from matplotlib import pyplot as plt

import tensorflow as tf

Izvršićemo klasifikaciju podataka neurnoskim mrežama za 2 kategorije. Učitaćemo podatke i odraditi vektorizaciju, a zatim podeliti na podatke na trening, test i validacione skupove. Isprobavali smo različite vrednosti za parametre modela (units, epochs, batch_size, learning_rate) kao i broj slojeva i za ove vrednosti smo dobili najbolje moguće modele za naše podatke.

Funkcija <i>GetData</i> će nam služiti za učitavanje podataka u zavisnosti od broja kategorija.

In [3]:
def GetData(num_of_categories, categories):
    news = pd.read_csv('data/vesti.csv')
    if num_of_categories == 36:
        X = news['text']
        y = news['category']
    else:
        news = news[news.category.isin(categories)]
        X = news['text']
        y = news['category']
        
    return (X, y)

Funkcija <i>TransformData</i> izršava vektorizaciju nad skupom X i prevodi kategoričke vrednosti skupa y u indikatorske promenljive.

In [4]:
def TransformData(X, y):
    vectorizer = feature_extraction.text.TfidfVectorizer()
    vectorizer.fit(X.values.astype('U'))

    X_transformed = vectorizer.transform(X.values.astype('U'))

    y_transformed = pd.get_dummies(y).values
    
    return (X_transformed, y_transformed)

Funkcija <i>TrainTestValidationSplit</i> deli podatke na trening, validacioni (validation size: 0.2) i test (test size: 0.2) skup sa stratifikacijom po y skupu.

In [5]:
def TrainTestValidationSplit(X, y):
    X_train_validation, X_test, y_train_validation, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, y_train_validation, test_size = 0.2, stratify = y_train_validation, random_state = 4)
    
    return (X_train, X_validation, X_test, y_train, y_validation, y_test)

Funkcija <i>EvaluateModel</i> će služiti za evaluaciju podataka na test skupu i vizualizaciju dobijenih vrednosti funkcije greške i tačnosti.

In [6]:
def EvaluateModel(model, X_test, y_test, history):
    score = model.evaluate(X_test, y_test)
    print('Test loss: ', score[0])
    print('Test accuracy: ', score[1])
    
    epochs = history.epoch
    
    loss = history.history['loss']
    validation_loss = history.history['val_loss']
    
    accuracy = history.history['accuracy']
    validation_accuracy = history.history['val_accuracy']
    
    plt.figure(figsize=(12,8))
    plt.title('Loss')
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.plot(epochs, loss, c='red', label = 'training')
    plt.plot(epochs, validation_loss, c='orange', label='validation')
    plt.legend(loc = 'best')
    plt.show()
    
    plt.figure(figsize=(12,8))
    plt.title('Accuracy')
    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.plot(epochs, accuracy, c='red', label = 'training')
    plt.plot(epochs, validation_accuracy, c = 'orange', label = 'validation')
    plt.legend(loc = 'best')
    plt.show()

Funkcija <i>TransformToTensor</i> izvrsava dekompoziciju pomocu TruncatedSVD algoritma i proveravamo variansu kako bismo ocuvali smisao podataka, zatim na kraju pretvaramo transformisane podatke u tenzor kako bismo mogli da pokrenemo modele.

In [14]:
def TransformToTensor(X, n):
    pca = decomposition.TruncatedSVD(n_components=n, random_state=4)
    pca.fit(X)
    print(sum(pca.explained_variance_ratio_))
    X = pca.transform(X)
    return tf.convert_to_tensor(X)

### Klasifikacija za 2 kategorije

Učitavamo podatke za dve kategorije koje imaju najveći broj članaka u skupu.

In [24]:
categories2 = ['POLITICS', 'WELLNESS']
X2, y2 = GetData(2, categories2)
print(X2.shape)
print(y2.shape)

(50281,)
(50281,)


Delimo podatke na trening, test i validacioni skup, a zatim ih transformišemo pomoću funkcije TransformData.

In [25]:
X2_train, X2_validation, X2_test, y2_train, y2_validation, y2_test = TrainTestValidationSplit(X2, y2)

In [26]:
X2_train_transformed, y2_train_transformed = TransformData(X2_train, y2_train)
X2_validation_transformed, y2_validation_transformed = TransformData(X2_validation, y2_validation)
X2_test_transformed, y2_test_transformed = TransformData(X2_test, y2_test)

In [27]:
print(X2_train_transformed.shape)
print(y2_train_transformed.shape)

(32179, 34626)
(32179, 2)


In [28]:
print(X2_validation_transformed.shape)
print(y2_validation_transformed.shape)

(8045, 18130)
(8045, 2)


In [29]:
print(X2_test_transformed.shape)
print(y2_test_transformed.shape)

(10057, 20471)
(10057, 2)


In [30]:
X2_train_transformed = TransformToTensor(X2_train_transformed, 6000)
X2_validation_transformed = TransformToTensor(X2_validation_transformed, 6000)
X2_test_transformed = TransformToTensor(X2_test_transformed, 6000)

0.8549289578495467
0.9778819627214266
0.9524713129595088


Kreiramo model i zatim prikazujemo rezultate.

In [33]:
model1 = Sequential()
model1.add(Embedding(input_dim = 2000, output_dim = 32, input_length = X2_train_transformed.shape[1]))
model1.add(LSTM(64, dropout = 0.2, recurrent_dropout = 0.2))
model1.add(Dense(2, activation='softmax'))

model1.compile(optimizer = Adam(learning_rate = 0.00001), loss = CategoricalCrossentropy(), metrics = ['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6000, 32)          64000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 88,962
Trainable params: 88,962
Non-trainable params: 0
_________________________________________________________________


In [36]:
history1 = model1.fit(X2_train_transformed, y2_train_transformed, epochs=5, batch_size=32, validation_data=(X2_validation_transformed, y2_validation_transformed))

Epoch 1/5
  54/1006 [>.............................] - ETA: 2:58:29 - loss: 0.6861 - accuracy: 0.6580

KeyboardInterrupt: 

Pokusali smo da napravimo model sa LSTM slojem ali za jednu od pet epoha bi bilo potrebno cekati 3h kako bi se izvrsila. U svesci 07_news_nn_small_categories smo napravili dva modela ali gde smo koristile manji broj redova radi lakseg ucenja podataka tokom epoha.