In [159]:
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random
import pandas as pd
import numpy as np

torch.backends.cudnn.deterministic = True

In [160]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 5000
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
NUM_EPOCHS = 50
DROPOUT = 0.5
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
BIDIRECTIONAL = True
HIDDEN_DIM = 192
NUM_LAYERS = 2
OUTPUT_DIM = 23

In [161]:
df = pd.read_excel('../input/ciao9cci/politica.xlsx', sheet_name="Foglio1")
pd.options.display.float_format = '{:,.0f}'.format
print(f'Found {len(df)} texts.')

print(f'{df["cap_maj_master"].isnull().sum()} document(s) with no classification removed')
df=df[pd.notnull(df['cap_maj_master'])]

print(f'{df["testo"].isnull().sum()} document(s) with no text removed')
df=df[pd.notnull(df['testo'])]

classes = [int(c) for c in df['cap_maj_master']]
documents = [d for d in df['testo']]
df = df[['cap_maj_master', 'testo']]
df.columns = ['classlabel', 'content']
df['classlabel'] = df['classlabel']-1
df.classlabel = df.classlabel.astype(int)
df.head(3)

Found 5674 texts.
2 document(s) with no classification removed
424 document(s) with no text removed


Unnamed: 0,classlabel,content
0,0,: quali siano le determinazioni del Governo in...
1,0,: quali siano le valutazioni del Governo sugli...
2,0,- premesso che: la prospettata modifica degli ...


In [162]:
np.unique(df['classlabel'].values)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 22])

In [163]:
len(np.bincount(df['classlabel']))

23

In [164]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
def preprocessor(text):
    text = re.sub('<[^>]*>', ' ', str(text))
    text=re.sub('\d+',' ',str(text))
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           str(text))
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' +
            ' '.join(emoticons).replace('-', ''))
    return text
import unicodedata
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass

    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return text
def tokenizer_porter(text):
    stop=set(stopwords.words('italian'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop]
    
    stemmer = SnowballStemmer("italian", ignore_stopwords=True)
    return [stemmer.stem(word) for word in filtered_sentence]

In [165]:
df['content']=df.content.apply(preprocessor).apply(strip_accents)
df.head(3)

Unnamed: 0,classlabel,content
0,0,quali siano le determinazioni del governo in ...
1,0,quali siano le valutazioni del governo sugli ...
2,0,premesso che la prospettata modifica degli sc...


In [166]:
df[['classlabel', 'content']].to_csv('./train_prepocessed.csv', index=None)

In [167]:
del df

In [168]:
TEXT = data.Field(sequential=True,
                  tokenize='spacy',
                  include_lengths=True) # necessary for packed_padded_sequence

LABEL = data.LabelField(dtype=torch.float)

In [169]:
fields = [('classlabel', LABEL), ('content', TEXT)]

train_dataset = data.TabularDataset(
    path="./train_prepocessed.csv", format='csv',
    skip_header=True, fields=fields)



In [170]:
train_data, valid_data = train_dataset.split(
    split_ratio=[0.95, 0.05],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')

Num Train: 4986
Num Valid: 262


In [171]:
TEXT.build_vocab(train_data,
                 max_size=VOCABULARY_SIZE,
                 vectors='fasttext.simple.300d',
                 unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 5002
Number of classes: 21


In [172]:
train_loader, valid_loader = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True, # necessary for packed_padded_sequence
    sort_key=lambda x: len(x.content),
    device=DEVICE)

In [173]:

print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.content[0].size()}')
    print(f'Target vector size: {batch.classlabel.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.content[0].size()}')
    print(f'Target vector size: {batch.classlabel.size()}')
    break
    


Train
Text matrix size: torch.Size([445, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([281, 128])
Target vector size: torch.Size([128])


In [174]:

import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, bidirectional, hidden_dim, num_layers, output_dim, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim,
                           num_layers=num_layers,
                           bidirectional=bidirectional, 
                           dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * num_layers, 64)
        self.fc2 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_length):

        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        hidden = self.fc1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        return hidden

In [176]:
INPUT_DIM = len(TEXT.vocab)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

torch.manual_seed(RANDOM_SEED)
model = RNN(INPUT_DIM, EMBEDDING_DIM, BIDIRECTIONAL, HIDDEN_DIM, NUM_LAYERS, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
criterion=criterion.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [177]:
def evaluate(model, data_loader, criterion):
    model.eval()
    val_accuracy = []
    val_loss = []
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(data_loader):
            text, text_lengths = batch_data.content
            logits = model(text, text_lengths)
            loss = criterion(logits, batch_data.classlabel.long())
            val_loss.append(loss.item())
            
            _, preds = torch.max(logits, 1)
            accuracy = (preds == batch_data.classlabel.long()).cpu().numpy().mean() * 100
            val_accuracy.append(accuracy)
        val_loss = np.mean(val_loss)
        val_accuracy = np.mean(val_accuracy)
        return val_loss, val_accuracy

In [178]:

def train(model, optimizer,criterion, train_dataloader, val_dataloader=None, epochs=10):
    """Train the LSTM model."""
    
    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)
    countdown=epochs
    count=1
    while countdown>0:
        # =======================================
        #               Training
        # =======================================

        # Tracking time and loss
        t0_epoch = time.time()
        total_loss = 0
        model.train()
        step=1
        for batch_idx, batch_data in enumerate(train_loader):
            text, text_lengths = batch_data.content
        
            ### FORWARD AND BACK PROP
            logits = model(text, text_lengths)
            loss = criterion(logits, batch_data.classlabel.long())
            total_loss += loss.item()
            optimizer.zero_grad()
            step=step+1
            loss.backward()
        
            ### UPDATE MODEL PARAMETERS
            optimizer.step()
        
        avg_train_loss = total_loss / len(train_dataloader)
        
        val_loss, val_accuracy = evaluate(model, val_dataloader,criterion)
        
        # Track the best accuracy
        if val_accuracy >= best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), 'tut5-model.pt')
            countdown=epochs
        else:
            countdown=countdown-1
        time_elapsed = time.time() - t0_epoch
        print(f"{count :^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
        count=count+1
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")
    return best_accuracy
    
        

In [None]:
train(model, optimizer,criterion, train_loader, valid_loader, 10)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   2.956469   |  2.831449  |   9.64    |   10.38  
   2    |   2.865184   |  2.843997  |   9.64    |   10.41  
   3    |   2.832933   |  2.819625  |   9.64    |   10.41  
   4    |   2.800553   |  2.835640  |   9.90    |   10.35  
   5    |   2.770646   |  2.822638  |   11.98   |   10.54  
   6    |   2.705731   |  2.721217  |   13.80   |   10.38  
   7    |   2.644482   |  2.703880  |   13.80   |   10.33  
   8    |   2.589377   |  2.622101  |   15.10   |   10.37  
   9    |   2.512120   |  2.724893  |   21.18   |   10.34  
  10    |   2.429830   |  2.536421  |   22.74   |   10.41  
  11    |   2.396723   |  2.568830  |   24.57   |   10.30  
