# Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/'My Drive/Desktop/ML/ProjectsML/Sentiment_Analysis/'
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Desktop/ML/ProjectsML/Sentiment_Analysis_multilabel
[0m[01;34mdata[0m/         Pytorch_torchtext.ipynb       Sentiment_Preprocessing.ipynb
[01;34molder_data[0m/   Semantic_Training.ipynb       utils.py
[01;34m__pycache__[0m/  Semantic_Training_LSTM.ipynb


# Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from utils import CleanText, contractions

import torch
from torchtext import data
import torch.nn as nn
import torch.optim as optim
pd.set_option('display.max_colwidth', None)

# Data Preparation

### TEXT and FIELD

In [3]:
SEED = 0
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
TEXT = data.Field(lower=True, include_lengths = True, preprocessing=None, tokenize='spacy')
LABEL = data.Field(sequential=False,is_target=True, use_vocab=False, pad_token=None, unk_token=None)

In [5]:
datafields = [('comment_text', TEXT), 
              ("identity_hate", LABEL), 
              ("negative", LABEL),
              ('obscene', LABEL), 
              ]

In [6]:
train_ds,val_ds = data.TabularDataset.splits(path='data/', 
                                      train = "train.csv",
                                      validation = "val.csv",
                                      format='csv', 
                                      fields=datafields, 
                                      skip_header = True)

### Single example

In [7]:
print(type(train_ds[0]))
print(vars(train_ds[0]))

<class 'torchtext.data.example.Example'>
{'comment_text': ['updated', 'shall', 'give', 'thanks', 'cyclonebiskit'], 'identity_hate': '0', 'negative': '0', 'obscene': '0'}


### Create Vocab

In [8]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_ds, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.zero_)

In [9]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 25002


In [10]:
print(TEXT.vocab.freqs.most_common(20))

[('not', 42888), ('article', 15245), ('wikipedia', 13630), ('page', 12675), ('talk', 10259), ('would', 9787), ('s', 9757), ('like', 8734), ('fuck', 8451), ('no', 8342), ('one', 7924), ('please', 7915), ('see', 5783), ('nigger', 5750), ('think', 5607), ('also', 5602), ('know', 5584), ('people', 5365), ('edit', 4915), ('use', 4533)]


In [11]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'not', 'article', 'wikipedia', 'page', 'talk', 'would', 's', 'like']


In [12]:
print(TEXT.vocab.stoi)



In [13]:
word_list = ["not", "no", "fucker", "motherfucker", "jews", "muslim", "black", "bloody", "blahibubwubfbivwbvf", "fuck"]
for word in word_list:
  print("{}|{}|{}".format(word, TEXT.vocab.stoi[word], TEXT.vocab.itos[TEXT.vocab.stoi[word]]))

not|2|not
no|11|no
fucker|555|fucker
motherfucker|1812|motherfucker
jews|306|jews
muslim|1084|muslim
black|569|black
bloody|1184|bloody
blahibubwubfbivwbvf|0|<unk>
fuck|10|fuck


### Batch and Iterator

In [14]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device,
    sort_key=lambda x: len(x.comment_text))

In [15]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim//4)
        self.fc2 = nn.Linear(hidden_dim//4, hidden_dim//8)
        self.fc3 = nn.Linear(hidden_dim//8, output_dim)

        self.relu = nn.ReLU()
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(0.2)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
        
        out = self.dropout(self.relu(self.fc1(hidden)))
        out = self.dropout(self.relu(self.fc2(out)))
        out = self.fc3(out)

        #out = self.fc(hidden)
            
        return out

In [16]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 3
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,846,894 trainable parameters


In [18]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([25002, 100])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1910,  0.1760,  0.3692,  ..., -0.5968,  0.0808,  0.2787],
        ...,
        [-0.9174, -0.2646,  1.4420,  ...,  0.2034, -0.0067,  0.0205],
        [-0.0427, -0.1145, -0.9168,  ...,  1.0597, -0.3603, -0.5449],
        [ 1.0068, -0.1827,  0.1469,  ..., -0.3260, -0.1167, -0.1910]])

In [19]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1910,  0.1760,  0.3692,  ..., -0.5968,  0.0808,  0.2787],
        ...,
        [-0.9174, -0.2646,  1.4420,  ...,  0.2034, -0.0067,  0.0205],
        [-0.0427, -0.1145, -0.9168,  ...,  1.0597, -0.3603, -0.5449],
        [ 1.0068, -0.1827,  0.1469,  ..., -0.3260, -0.1167, -0.1910]])


In [20]:
model.to(device) #CPU to GPU

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters())

In [21]:
def cal_score(y, preds):
  
  #round predictions to the closest integer
  rounded_preds = torch.round(torch.sigmoid(preds))
  try:rc = roc_auc_score(y.cpu(), rounded_preds.detach().cpu().numpy(), average="micro")
  except Exception: rc = 0.0
  return rc

In [22]:
# training function 
def train(model, iterator):
    
    epoch_loss = 0
    epoch_rc = 0
    
    model.train()
    
    for batch in iterator:
        y = torch.stack([batch.identity_hate,
                         batch.negative,
                         batch.obscene],dim=1).float()
        text, text_lengths = batch.comment_text
        optimizer.zero_grad()
        predictions = model(text, text_lengths.cpu()).squeeze(1)
        loss = criterion(predictions, y)
        rc = cal_score(y, predictions)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_rc += rc
        

    return epoch_loss / len(iterator), epoch_rc / len(iterator)

In [23]:
def evaluate(model, iterator):
    
    epoch_rc = 0
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            y = torch.stack([batch.identity_hate,
                         batch.negative,
                         batch.obscene],dim=1).float()
            text, text_lengths = batch.comment_text
            predictions = model(text, text_lengths.cpu()).squeeze(1)
            rc = cal_score(y, predictions)
    
            epoch_rc += rc
        
    return epoch_rc / len(iterator)

In [24]:
num_epochs = 10

for epoch in range(num_epochs):
    
    train_loss, train_rc = train(model, train_iterator)
    valid_rc = evaluate(model, valid_iterator)
    
    print(f'{epoch} Train Loss: {train_loss:.3f} | Train ROC AUC score: {train_rc:.2f}')
    print(f'\tValidation ROC AUC score: {valid_rc:.2f}')

0 Train Loss: 0.201 | Train ROC AUC score: 0.77
	Validation ROC AUC score: 0.82
1 Train Loss: 0.130 | Train ROC AUC score: 0.85
	Validation ROC AUC score: 0.85
2 Train Loss: 0.112 | Train ROC AUC score: 0.86
	Validation ROC AUC score: 0.91
3 Train Loss: 0.100 | Train ROC AUC score: 0.89
	Validation ROC AUC score: 0.90
4 Train Loss: 0.090 | Train ROC AUC score: 0.90
	Validation ROC AUC score: 0.94
5 Train Loss: 0.083 | Train ROC AUC score: 0.92
	Validation ROC AUC score: 0.93
6 Train Loss: 0.076 | Train ROC AUC score: 0.93
	Validation ROC AUC score: 0.93
7 Train Loss: 0.070 | Train ROC AUC score: 0.94
	Validation ROC AUC score: 0.93
8 Train Loss: 0.064 | Train ROC AUC score: 0.95
	Validation ROC AUC score: 0.95
9 Train Loss: 0.060 | Train ROC AUC score: 0.95
	Validation ROC AUC score: 0.93


In [25]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.squeeze().detach().cpu().numpy().round(decimals=3)

In [26]:
sentences =["queer wow ur fucking gay u go die",
          "muslim boy has sweet tooth",
          "hem nigger nigger nigger nigger nigger nigger nigger thank",
          "thank you very much",
          "jews are nice people",
          "go stuff islam evil",
          "leave country not want arabs",
          "Jews are not dumb",
          "He is not a fool",
          "fuck stupid muslim ass", 
          "Americans are nicer"]

for sentence in sentences:
  print(predict_sentiment(model, sentence.lower()), "|", sentence)

[0.861 1.    0.999] | queer wow ur fucking gay u go die
[0.1   0.213 0.11 ] | muslim boy has sweet tooth
[0.889 1.    0.995] | hem nigger nigger nigger nigger nigger nigger nigger thank
[0. 0. 0.] | thank you very much
[0. 0. 0.] | jews are nice people
[0.712 0.75  0.283] | go stuff islam evil
[0.767 0.799 0.271] | leave country not want arabs
[0. 0. 0.] | Jews are not dumb
[0.006 0.339 0.317] | He is not a fool
[0.858 1.    1.   ] | fuck stupid muslim ass
[0. 0. 0.] | Americans are nicer
