# Klasifikacija teksta

U ovoj svesci proći ćemo ponovo kroz primer koji smo uveli u priči o konvolutivnim neuronskim mrežama. Bavićemo se klasifikacijom IMDB filmskih pregleda na pozitivne i negativne, ali ovoga puta uz korišćenje rekurentnih neuronskih mreža. 

In [1]:
import torch
import numpy as np

SEED = 7
# Fix the seed of every random generator in use so that repeated runs of
# the notebook produce the same numbers.
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

## Priprema skupa podataka

In [2]:
max_features = 2000 

In [3]:
from torchtext.datasets import IMDB

train_iter, test_iter = IMDB(root='.')

Očekuje se i da pre obrade sve preglede svedemo na istu dužinu. Mi ćemo se opredeliti za dužinu od 500 reči. Pregledi koji sadrže manje reči biće dopunjeni nulama, a pregledi koji sadrže više reči biće skraćeni.

In [4]:
from collections import Counter
from torchtext.data import get_tokenizer
from tqdm.notebook import tqdm

# Tokenise every training review and tally how often each token occurs.
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for label, line in train_iter:
    tokens = tokenizer(line)
    counter.update(tokens)

In [5]:
from collections import OrderedDict

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

# Order the tokens from most to least frequent.
sorted_by_freq_tuples = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)
# Build the vocabulary from the special tokens plus every token that
# occurs at least 300 times.
recnik = vocab(
    ordered_dict,
    min_freq=300,
    specials=['<unk>', '<pad>', '<bos>', '<eos>'],
)
print("Index of 'the':", recnik['the'])
print("Word corresponding to index 10:", recnik.get_itos()[10])

Index of 'the': 4
Word corresponding to index 10: to


In [6]:
def text_to_indices(recnik, texts):
    """Convert raw texts to tensors of vocabulary indices.

    Each text is split with the module-level ``tokenizer``; a token that is
    not in ``recnik`` is replaced by the index of ``'<unk>'``.

    Returns:
        A list with one 1-D tensor of indices per text.
    """
    encoded = []
    for text in texts:
        ids = []
        for token in tokenizer(text):
            key = token if token in recnik else '<unk>'
            ids.append(recnik[key])
        encoded.append(torch.tensor(ids))
    return encoded

In [7]:
# Separate each split into parallel tuples of labels and texts.
train_labels, train_texts = zip(*train_iter)
test_labels, test_texts = zip(*test_iter)

# The labels are 1 and 2 by default; shift them down to 0 and 1.
train_labels = [raw_label - 1 for raw_label in train_labels]
test_labels = [raw_label - 1 for raw_label in test_labels]

# Encode the texts as sequences of vocabulary indices.
train_indices = text_to_indices(recnik, train_texts)
test_indices = text_to_indices(recnik, test_texts)

In [8]:
max_len = 500

In [9]:
from torch.nn.utils.rnn import pad_sequence

# Truncate every review to max_len tokens *before* padding. Padding first
# would materialise a tensor as wide as the longest review only to slice
# most of it away; the resulting tensors are the same either way.
# NOTE(review): pad_sequence pads with its default value 0, which is the
# index of '<unk>' in `recnik`, not of '<pad>' - confirm this is intended.
train_indices_padded = pad_sequence([seq[:max_len] for seq in train_indices], batch_first=True)
test_indices_padded = pad_sequence([seq[:max_len] for seq in test_indices], batch_first=True)

In [10]:
train_indices_padded.shape, train_indices_padded.dtype

(torch.Size([25000, 500]), torch.int64)

In [11]:
print(test_indices_padded.shape)

torch.Size([25000, 500])


In [12]:
np.unique(train_labels)

array([0, 1])

In [13]:
negative_review, positive_review = np.bincount(train_labels)

In [14]:
print('Broj pozitivnih pregleda: ', positive_review)
print('Broj negitivnih pregleda: ', negative_review)

Broj pozitivnih pregleda:  12500
Broj negitivnih pregleda:  12500


In [24]:
from sklearn import model_selection
batch_size = 128

# Hold out 20% of the training reviews for validation, stratified on the labels.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_indices_padded,
    torch.tensor(train_labels),
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)
X_test, y_test = test_indices_padded, torch.tensor(test_labels)

# Pair inputs with labels.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
valid_dataset = torch.utils.data.TensorDataset(X_valid, y_valid)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

# Only the training batches are shuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)


Nadalje ćemo eksperimentisati sa različitim rekurentnim arhitekturama. Mreže koje ćemo konstruisati će biti plitke i trenirane u malom broju epoha tako da se za vežbu predlaže istraživanje ponašanja mreže sa većim brojem epoha, dodatnim regularizacijama i manjim koracima učenja. Takođe, mogu se varirati veličina ugnježđene reprezentacije koju dobijamo `Embedding` slojem, kao i veličina izlaza.

In [22]:
from tqdm.notebook import tqdm
from collections import defaultdict
import torch
import torch.nn as nn

def evaluate_model(model, test_loader):
    """Compute the mean loss and accuracy of a binary classifier.

    Args:
        model: module mapping a batch of inputs to one probability per
            sample (values in [0, 1], as ``binary_cross_entropy`` requires).
        test_loader: iterable of ``(inputs, labels)`` batches with 0/1 labels.

    Returns:
        Tuple ``(loss, accuracy)``, each averaged over all samples.

    Raises:
        ValueError: if ``test_loader`` yields no samples.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()

    # Running sums must exist before the first `+=` in the loop.
    test_loss, test_accuracy, test_samples = 0.0, 0.0, 0
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc='test batch', leave=True):
            inputs, labels = inputs.to(device), labels.to(device)

            batch_size = labels.size(0)
            labels = labels.to(torch.float32)

            # reshape(-1) flattens e.g. a (batch, 1) output to (batch,) and,
            # unlike squeeze(), keeps the batch dimension of a one-sample batch.
            predictions = model(inputs).reshape(-1)

            loss = nn.functional.binary_cross_entropy(predictions, labels)
            accuracy = ((predictions > 0.5) == labels).float().mean()
            # Weight by batch size so a smaller final batch does not skew the mean.
            test_loss += loss.item() * batch_size
            test_accuracy += accuracy.item() * batch_size
            test_samples += batch_size

    if test_samples == 0:
        raise ValueError('test_loader yielded no samples')

    test_loss /= test_samples
    test_accuracy /= test_samples

    return test_loss, test_accuracy

def _run_classification_epoch(model, loader, device, desc, optimizer=None):
    """Make one pass over ``loader`` and return its mean loss and accuracy.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in evaluation mode and gradient
    tracking is disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss, total_accuracy, total_samples = 0.0, 0.0, 0

    with torch.set_grad_enabled(training):
        for inputs, labels in tqdm(loader, desc=desc, leave=False):
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float32)
            batch_size = labels.size(0)

            if training:
                optimizer.zero_grad()
            # reshape(-1) flattens e.g. a (batch, 1) output to (batch,) and,
            # unlike squeeze(), keeps the batch dimension of a one-sample batch.
            predictions = model(inputs).reshape(-1)
            loss = nn.functional.binary_cross_entropy(predictions, labels)
            accuracy = ((predictions > 0.5) == labels).float().mean()
            if training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            total_loss += loss.item() * batch_size
            total_accuracy += accuracy.item() * batch_size
            total_samples += batch_size

    return total_loss / total_samples, total_accuracy / total_samples


def train_classification(model, optimizer, train_loader, valid_loader, epochs=5):
    """Train a binary classifier and record per-epoch metrics.

    Args:
        model: module mapping a batch of inputs to one probability per
            sample (values in [0, 1], as ``binary_cross_entropy`` requires).
        optimizer: optimizer built over ``model``'s parameters.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        valid_loader: iterable of ``(inputs, labels)`` validation batches.
        epochs: number of passes over ``train_loader``.

    Returns:
        Mapping with keys ``'loss'`` and ``'accuracy'``; each holds a
        ``'train'`` and a ``'valid'`` list with one value per epoch.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    metrics = defaultdict(lambda: {'train': [], 'valid': []})

    for _ in tqdm(range(epochs), leave=True, desc='Epochs'):
        train_loss, train_accuracy = _run_classification_epoch(
            model, train_loader, device, 'Train batch', optimizer)
        valid_loss, valid_accuracy = _run_classification_epoch(
            model, valid_loader, device, 'valid batch')

        metrics['loss']['train'].append(train_loss)
        metrics['accuracy']['train'].append(train_accuracy)
        metrics['loss']['valid'].append(valid_loss)
        metrics['accuracy']['valid'].append(valid_accuracy)
    return metrics


In [21]:
import matplotlib.pyplot as plt

def plot_graphs(metrics, epochs=None):
    """Plot training and validation loss and accuracy side by side.

    Args:
        metrics: mapping with keys ``'loss'`` and ``'accuracy'``, each
            holding ``'train'`` and ``'valid'`` sequences with one value
            per epoch.
        epochs: number of leading epochs to plot; defaults to every epoch
            recorded under ``metrics['loss']['train']``.
    """
    # Take the epoch count from the data instead of an undefined global.
    if epochs is None:
        epochs = len(metrics['loss']['train'])
    epoch_axis = np.arange(0, epochs)

    plt.figure(figsize=(10, 4))
    for position, key, title in ((1, 'loss', 'Loss'), (2, 'accuracy', 'Accuracy')):
        plt.subplot(1, 2, position)
        plt.title(title)
        plt.plot(epoch_axis, metrics[key]['train'][:epochs], label='train')
        plt.plot(epoch_axis, metrics[key]['valid'][:epochs], label='val')
        plt.legend(loc='best')

    plt.show()

## Jednostavna rekurentna neuronska mreža

In [17]:
embedding_size = 32
output_size = 128

In [18]:
import torch.nn as nn

class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> dropout -> linear -> sigmoid."""

    def __init__(self, embedding_size, num_tokens, hidden_size):
        super().__init__()

        # Layers with parameters.
        self.embedding = nn.Embedding(num_tokens, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        # Parameter-free layers.
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a batch of index sequences to one sigmoid output per sequence."""
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        # Keep only the RNN output at the final time step.
        last_step = outputs[:, -1, :]
        scores = self.linear(self.dropout(last_step))
        return self.sigmoid(scores)

In [19]:
# The embedding table needs one row per vocabulary entry, so its size is
# the vocabulary size, not the sequence length max_len.
model_simple = SimpleRNN(embedding_size, len(recnik), output_size)

In [None]:
from torch.optim import Adam
optimizer = Adam(params=model_simple.parameters(), lr=0.001)
metrics_simple = train_classification(model_simple, optimizer, train_loader, valid_loader)

In [None]:
plot_graphs(metrics_simple)

In [None]:
# Evaluate the model trained above (there is no variable called `model`).
loss_simple, accuracy_simple = evaluate_model(model_simple, test_loader)

In [None]:
print('Loss: ', loss_simple)

In [None]:
print('Accuracy: ', accuracy_simple)

Podsetimo se, za čuvanje modela može da se iskoristi funkcija `torch.save`.

In [25]:
# torch.save(model_simple, "models/RNN.pth")

### LSTM rekurentna neuronska mreža 

In [None]:
embedding_dimensionality = 32
units = 128

In [None]:
import torch.nn as nn

class SimpleLSTM(nn.Module):
    """Embedding -> single-layer LSTM -> dropout -> linear -> sigmoid.

    Args:
        embedding_size: dimensionality of the token embeddings.
        num_tokens: number of rows in the embedding table (vocabulary size).
        hidden_size: dimensionality of the LSTM hidden state.
    """

    def __init__(self, embedding_size, num_tokens, hidden_size):
        super(SimpleLSTM, self).__init__()

        self.embedding = nn.Embedding(num_tokens, embedding_size)
        # batch_first=True: inputs arrive as (batch, sequence) and forward()
        # indexes the time axis as dimension 1.
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, 1)
        # binary_cross_entropy expects probabilities, so squash the output.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence, shape (batch, 1)."""
        x = self.embedding(x)
        x, (h_n, c_n) = self.lstm(x)
        x = x[:, -1, :]  # LSTM output at the final time step
        x = self.dropout(x)
        x = self.linear(x)
        return self.sigmoid(x)


In [None]:
# The embedding table needs one row per vocabulary entry, so its size is
# the vocabulary size, not the sequence length max_len.
model_lstm = SimpleLSTM(embedding_size, len(recnik), output_size)
model_lstm

In [None]:
# DEFINE OPTIM ETC

In [None]:
optimizer = Adam(params=model_lstm.parameters(), lr=0.001)
# train_classification's keyword is `epochs`.
metrics_lstm = train_classification(model_lstm, optimizer, train_loader, valid_loader, epochs=3)

In [None]:
plot_graphs(metrics_lstm)

In [None]:
loss_lstm, accuracy_lstm = evaluate_model(model_lstm, test_loader)

In [None]:
print('Loss: ', loss_lstm)

In [None]:
print('Accuracy: ', accuracy_lstm)

In [None]:
# torch.save(model_lstm, "models/LSTM.pth")

### Stekovana LSTM mreža

In [None]:
embedding_dimensionality = 32
units = 128

In [None]:
import torch.nn as nn

class StackedLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear -> sigmoid.

    Args:
        embedding_size: dimensionality of the token embeddings.
        num_tokens: number of rows in the embedding table (vocabulary size).
        hidden_size: dimensionality of the LSTM hidden state.
        num_layers: number of stacked LSTM layers (defaults to 2).
    """

    def __init__(self, embedding_size, num_tokens, hidden_size, num_layers=2):
        super(StackedLSTM, self).__init__()

        self.embedding = nn.Embedding(num_tokens, embedding_size)
        # num_layers > 1 stacks LSTM layers on top of each other.
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, 1)
        # binary_cross_entropy expects probabilities, so squash the output.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence, shape (batch, 1)."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # top-layer output at the final time step
        x = self.dropout(x)
        x = self.linear(x)
        return self.sigmoid(x)


In [None]:
# Embedding size is the vocabulary size (not max_len); num_layers must be
# given explicitly for a stacked network.
model_stacked_lstm = StackedLSTM(embedding_size, len(recnik), output_size, num_layers=2)
model_stacked_lstm

In [None]:
optimizer = Adam(params=model_stacked_lstm.parameters(), lr=0.001)
# train_classification's keyword is `epochs`.
metrics_stacked_lstm = train_classification(model_stacked_lstm, optimizer, train_loader, valid_loader, epochs=5)

In [None]:
plot_graphs(metrics_stacked_lstm)

In [None]:
# nn.Module has no Keras-style .save(); persist the learned weights with torch.save.
torch.save(model_stacked_lstm.state_dict(), 'models/LSTM_stacked.pth')

### GRU rekurentna neuronska mreža 

In [None]:
embedding_dimensionality = 32
units = 128

In [None]:
import torch.nn as nn

class GRUNet(nn.Module):
    """Embedding -> single-layer GRU -> dropout -> linear -> sigmoid.

    Args:
        embedding_size: dimensionality of the token embeddings.
        num_tokens: number of rows in the embedding table (vocabulary size).
        hidden_size: dimensionality of the GRU hidden state.
    """

    def __init__(self, embedding_size, num_tokens, hidden_size):
        super(GRUNet, self).__init__()

        self.embedding = nn.Embedding(num_tokens, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, 1)
        # binary_cross_entropy expects probabilities, so squash the output.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence, shape (batch, 1)."""
        x = self.embedding(x)
        x, _ = self.gru(x)
        x = x[:, -1, :]  # GRU output at the final time step
        x = self.dropout(x)
        x = self.linear(x)
        return self.sigmoid(x)


In [None]:
# Create the GRU network; displaying the module prints its layers.
model_gru = GRUNet(embedding_size, len(recnik), output_size)
model_gru

In [None]:
# A PyTorch module has no compile(); the loss and accuracy are computed
# inside train_classification, so only the optimizer is set up here.
optimizer = Adam(params=model_gru.parameters(), lr=0.0001)

In [None]:
epoches = 5
batch_size = 128

In [None]:
from time import perf_counter

# Training takes a while - a trained model is available in the models directory.
# Uses the `optimizer` built over model_gru's parameters.
start_training_gru = perf_counter()
history_gru = train_classification(model_gru, optimizer, train_loader, valid_loader, epochs=epoches)
end_training_gru = perf_counter()

In [None]:
print('Duzina trajanja treniranja: ', end_training_gru-start_training_gru)

In [None]:
# plot_graphs takes the metrics mapping as its only required argument.
plot_graphs(history_gru)

In [None]:
# A PyTorch module has no evaluate(); use the notebook's evaluation helper.
loss_gru, accuracy_gru = evaluate_model(model_gru, test_loader)

In [None]:
print('Loss: ', loss_gru)

In [None]:
print('Accuracy: ', accuracy_gru)

In [None]:
# nn.Module has no Keras-style .save(); persist the learned weights with torch.save.
torch.save(model_gru.state_dict(), 'models/GRU.pth')

### Neuronska mreža sa pritreniranim ugnježđavanjima

Isprobaćemo i varijantu mreže koja će koristiti pritrenirane ugnježđene reprezentacije. Ovakve reprezentacije se obično uče nad većim skupom podataka i mogu biti korisne za mnoge zadatke u kojima je raspoloživi skup mnogo manjeg obima. O načinima njihovog učenja biće više reči u sekciji o nenadgledanom mašinskom učenju, a ovde ćemo iskoristiti `GloVe` reprezentacije. Ceo paket sa reprezentacijama reči različitih dužina (50, 100, 200 i 300) ukupne veličine 822MB se može preuzeti sa [zvanične adrese](https://nlp.stanford.edu/projects/glove/) Stanford grupe. Mi ćemo u radu koristiti reprezentacije dužine 100 koje se nalaze u datoteci `glove.6B.100d.txt` (347.1MB) koja se može preuzeti pojedinačno npr. sa [ove](https://www.kaggle.com/terenceliu4444/glove6b100dtxt) adrese. 

Prvo ćemo pročitati iz preuzete datoteke sve podržane reči i njihove vektorske reprezentacije. U pojedinačnim redovima datoteke se prvo nalazi reč, a potom 100 realnih vrednosti koje predstavljaju njenu vektorsku reprezentaciju.

In [None]:
# Parse the GloVe file: the first field of each line is the word, the
# remaining fields are the components of its vector.
word_embeddings = {}
with open('data/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [None]:
len(word_embeddings)

Na primer, vektorska reprezentacija reči `movie` se može dobiti sa:

In [None]:
word_embeddings['movie']

Prilikom učenja ovakvih reprezentacija obično se ne vrši priprema teksta na način koji smo opisivali u generisanju `Tf-Idf` reprezentacija (normalizacija, stemiranje, lematizacija) kako bi mreža mogla da nauči različite aspekte pojedinačnih reči. Tako za imenicu movie postoji i reprezentacija reči movies, a za glagol play i reprezentacije za plays, playing i played.

Da bismo mogli da mapiramo filmske izveštaje u reči, a potom i reči u vektorske reprezentacije, sa forme pregleda koji sadrži indekse reči trebamo preći na formu koja sadrži konkretne reči. 

Ovde ćemo simulirati ceo proces obrade teksta korišćenjem Keras podrške obrađujući originalne tekstualne sadržaje pregleda koji se mogu preuzeti sa [ove](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) adrese.

In [None]:
import pandas as pd
import re

In [None]:
reviews = pd.read_csv('data/IMDB.csv')

U pročitanom skupu se nalaze svi pregledi koje treba dalje razvrstati na skupove za treniranje, validaciju i testiranje.

In [None]:
reviews.shape

In [None]:
reviews.head()

Pozitivni sentiment ćemo mapirati u vrednost 1, a negativni u vrednost 0.

In [None]:
reviews['target'] = reviews['sentiment'].apply(lambda s: 1 if s=='positive' else 0)

In [None]:
reviews['target'].value_counts()

Zbog dalje obrade teksta, pročitaćemo i jedan originalni pregled.

In [None]:
demo_review = reviews['review'][0]

In [None]:
demo_review

Iz ovako pročitanih dokumenata možemo izbrisati zaostale HTML etikete i zameniti višestruke beline jednostrukim. Takođe, možemo sve zapisati malim slovima.

In [None]:
def prepare_review(review):
    """Strip HTML tags, collapse whitespace runs and lowercase a review.

    Args:
        review: raw review text.

    Returns:
        The text with every ``<...>`` tag replaced by a space, every run of
        whitespace reduced to a single space, and all letters lowercased.
    """
    # Raw strings keep regex escapes out of Python's own string-escape
    # processing, which warns on unknown escapes in ordinary literals.
    clean_review = re.sub(r'<.+?>', ' ', review)
    clean_review = re.sub(r'\s+', ' ', clean_review)
    return clean_review.lower()

In [None]:
prepare_review(demo_review)

Ovako napisanu funkciju ćemo dalje primeniti nad svakim pregledom.

In [None]:
reviews['prepared_review'] = reviews['review'].apply(lambda r: prepare_review(r))

Dalje, da bismo izdvojili skup za treniranje, skup za validaciju i skup za testiranje, promešaćemo indekse skupova podataka - podrazumevano prvo imamo sve pozitivne preglede uzastopno, a zatim i sve negativne.

In [None]:
indices = np.arange(reviews.shape[0])
np.random.shuffle(indices)

Prateći brojke iz prethodnih primera, u test skup ćemo smestiti polovinu pregleda, a ostale ćemo podeliti na preglede za treniranje i validaciju.

In [None]:
number_of_reviews = reviews.shape[0]
test_reviews = reviews.iloc[indices[number_of_reviews//2:]]
validation_size = int(0.2*number_of_reviews)
train_reviews = reviews.iloc[indices[0:number_of_reviews//2-validation_size]]
validation_reviews = reviews.iloc[indices[number_of_reviews//2-validation_size:number_of_reviews//2]]

In [None]:
test_reviews.shape

In [None]:
train_reviews.shape

In [None]:
validation_reviews.shape

Obradu ćemo izvršiti korišćenjem Kerasovog `Tokenizer` tokenizatora. Njime se mogu izdvojiti pojedinačne reči tj. tokeni zadatog teksta i konstruisati vokabular željene dužine. Kao i u prethodnim primerima, iskoristićemo `max_features` (2000) najfrekventnijih reči.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_reviews['prepared_review'])

Vokabular koji je kreirao tokenizator se može dobiti pomoću `word_index` svojstva. On se sastoji od reči i indeksa njihovih pozicija.

In [None]:
word_index = tokenizer.word_index
print('Broj jedinstvenih reci (tokena): ', len(word_index))

In [None]:
word_index['movie']

In [None]:
word_index['movies']

Na osnovu ovako dobijene mape i GloVe vrednosti kreiraćemo matricu ugnježđavanja nalik onoj koju je mreža učila do sada.

In [None]:
embedding_dimensionality = 100
number_of_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((number_of_words, embedding_dimensionality))

In [None]:
for word, i in word_index.items():
    if i >= max_features:
        # stop at the first index that is not below max_features
        # (presumably word_index is ordered by index - verify)
        break

    embedding_vector = word_embeddings.get(word)
    if embedding_vector is None:
        # a word without a GloVe representation keeps its all-zero row
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix.shape

Dalje ćemo korišćenjem tokenizatora transformisati tekst u niz indeksa reči. Za to ćemo koristiti njegovu metodu `texts_to_sequences`. Potom ćemo dobijene sekvence svesti na istu dužinu korišćenjem funkcije `pad_sequences`.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_sequences = tokenizer.texts_to_sequences(train_reviews['prepared_review'])
# `preprocessing` was never imported, and the length limit is named
# `max_len` in this notebook.
X_train = pad_sequences(train_sequences, maxlen=max_len)
y_train = train_reviews['target']

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

test_sequences = tokenizer.texts_to_sequences(test_reviews['prepared_review'])
# `preprocessing` was never imported, and the length limit is named
# `max_len` in this notebook.
X_test = pad_sequences(test_sequences, maxlen=max_len)
y_test = test_reviews['target']

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

validation_sequences = tokenizer.texts_to_sequences(validation_reviews['prepared_review'])
# `preprocessing` was never imported, and the length limit is named
# `max_len` in this notebook.
X_validation = pad_sequences(validation_sequences, maxlen=max_len)
y_validation = validation_reviews['target']

Ostalo je još da kreiramo i naučimo model koji koristi matricu ugnježđavanja. Prilikom zadavanja Embedding sloja ovoga puta ćemo preko parametra `weights` naglasiti koje vrednosti treba koristiti, a kako njih ne treba menjati u toku učenja, postavićemo i `trainable` parametar na vrednost `False`.

In [None]:
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten
from tensorflow.keras.models import Sequential

model_with_glove = Sequential()
# The embedding table is sized by number_of_words so that it matches the
# number of rows of embedding_matrix; the sequence length is `max_len`.
model_with_glove.add(Embedding(number_of_words, embedding_dimensionality, weights=[embedding_matrix], input_length=max_len, trainable=False))
model_with_glove.add(Flatten())
model_with_glove.add(Dropout(rate=0.3))
model_with_glove.add(Dense(32, activation='relu'))
model_with_glove.add(Dropout(rate=0.3))
model_with_glove.add(Dense(1, activation='sigmoid'))
model_with_glove.summary()

In [None]:
from tensorflow.keras.optimizers import Adam as KerasAdam

# The name `Adam` is bound to torch.optim.Adam earlier in the notebook;
# a Keras model needs the Keras optimizer.
model_with_glove.compile(optimizer=KerasAdam(learning_rate=0.000001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
epoches = 10
batch_size = 64

In [None]:
history_with_glove = model_with_glove.fit(X_train, y_train, batch_size=batch_size, epochs=epoches, validation_data=(X_validation, y_validation))

In [None]:
# plot_graphs expects a {'loss'|'accuracy': {'train': [...], 'valid': [...]}}
# mapping, so convert the Keras History object into that layout first.
glove_history = history_with_glove.history
glove_metrics = {
    'loss': {'train': glove_history['loss'], 'valid': glove_history['val_loss']},
    'accuracy': {'train': glove_history['accuracy'], 'valid': glove_history['val_accuracy']},
}
plot_graphs(glove_metrics)

In [None]:
loss_with_glove, accuracy_with_glove = model_with_glove.evaluate(X_test, y_test)

In [None]:
print('Loss: ', loss_with_glove)

In [None]:
print('Accuracy: ', accuracy_with_glove)

In [None]:
model_with_glove.save('models/glove.h5')

### Zadaci: 
- Ispitati ponašanje mreža koje se duže treniraju. 
- Ispitati tok treniranja kada se koriste manji koraci učenja i nešto drugačija veličina paketića.