<a href="https://colab.research.google.com/github/nicostanw/NLP_Toxic_Comment_Classification/blob/main/NLP_Project_Dimanche.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import re
import string 
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [4]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd drive/MyDrive/NLP

/content/drive/MyDrive/NLP


In [6]:
# Read the training CSV from the current working directory and print its (rows, columns) shape.
train = pd.read_csv('train.csv', sep=',')
print(train.shape)

(159571, 8)


In [7]:
train.head(15)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [8]:
tokenizer = get_tokenizer('basic_english')

# Tokenization: sentences are broken down into simple pieces

def clean_text(text):
  """Normalize a raw comment into lowercase words separated by single spaces.

  URLs and digit runs are removed, every punctuation character becomes a
  space, newlines become spaces, runs of spaces are collapsed to one and the
  result is trimmed on both ends.
  """
  lowered = text.lower()
  # Links go first, while "://" and "." are still there to match on.
  without_urls = re.compile(r'https?://\S+|www\.\S+').sub(r'', lowered)
  without_digits = re.sub(r'\d+', '', without_urls)
  # Punctuation is mapped to a space (not deleted) so "can't" splits into "can t".
  punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  single_line = re.sub(r'\n', ' ', without_digits.translate(punct_to_space))
  return re.sub(' +', ' ', single_line).strip()

In [9]:
X_train=train["comment_text"].apply(clean_text)
X_train.head()

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i m se...
2    hey man i m really not trying to edit war it s...
3    more i can t make any real suggestions on impr...
4    you sir are my hero any chance you remember wh...
Name: comment_text, dtype: object

In [12]:
# One label vector per comment, stored as a Series of 1-D arrays.
# The label columns are converted to a NumPy matrix once and split into rows,
# which avoids a Python-level call per row (DataFrame.apply with axis=1) while
# producing the same values under the same index.
label_columns = train.columns[2:]   # every column after id / comment_text
y_train = pd.Series(list(train[label_columns].to_numpy()), index=train.index)
print(y_train)
print(len(y_train))
num_class = y_train[0].size   # length of one label vector
print(num_class)
print(y_train.sum())   # element-wise sum of the label vectors: positives per label

0         [0, 0, 0, 0, 0, 0]
1         [0, 0, 0, 0, 0, 0]
2         [0, 0, 0, 0, 0, 0]
3         [0, 0, 0, 0, 0, 0]
4         [0, 0, 0, 0, 0, 0]
                 ...        
159566    [0, 0, 0, 0, 0, 0]
159567    [0, 0, 0, 0, 0, 0]
159568    [0, 0, 0, 0, 0, 0]
159569    [0, 0, 0, 0, 0, 0]
159570    [0, 0, 0, 0, 0, 0]
Length: 159571, dtype: object
159571
6
[15294  1595  8449   478  7877  1405]


In [13]:
# Count how often each token occurs across all cleaned comments, then build the vocabulary from those counts.
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for text in X_train:
  counter.update(tokenizer(text))
vocab = Vocab(counter, min_freq=3)   # Vocabulary of tokens that appear at least 3 times

In [14]:
vocab_size=len(vocab)
print(vocab_size)  # In total, 59,868 words appear at least 3 times

59868


In [15]:
max_len=X_train.map(lambda x:len(tokenizer(x))).max()    # Token count of the longest text
print(max_len)

1403


In [17]:
def text_pipeline(x):
    """Tokenize ``x`` and return the list of ``vocab`` entries, one per token."""
    return [vocab[token] for token in tokenizer(x)]


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [100]:
class ToxicCommentDataset(Dataset):
    """Pairs each comment text with its label vector.

    ``X`` and ``y`` are stored as given (no copy). Items are looked up by
    *position*, so pandas objects whose index is not ``0..n-1`` (for example
    after filtering or a shuffled split) behave the same as plain lists or
    arrays.

    Args:
        X: sequence of texts (list, array or pandas Series).
        y: sequence of labels, same length as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` have different lengths.
    """

    def __init__(self, X, y):
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(len(X), len(y))
            )
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    @staticmethod
    def _at(container, position):
        """Return the element at ``position``, going through ``.iloc`` when the container has one."""
        # pandas ``obj[i]`` is label-based; ``obj.iloc[i]`` is always positional.
        return getattr(container, "iloc", container)[position]

    def __getitem__(self, idx):
        return (self._at(self.X, idx), self._at(self.y, idx))


def collate_batch(batch):
    """Turn a list of ``(text, label)`` pairs into padded index and label tensors.

    Each text is converted with ``text_pipeline`` and written left-aligned
    into a row of width ``max_len``; the remainder of the row holds the
    padding index. Texts that produce more than ``max_len`` indices are
    clipped instead of raising on the slice assignment.

    Args:
        batch: iterable of ``(text, label)`` pairs as yielded by the dataset.

    Returns:
        ``(seq_tensor, labels)``: an int64 tensor of shape
        ``(len(batch), max_len)`` and an int64 tensor built from the labels,
        both moved to ``device``.
    """
    pad_index = 1  # same value the embedding layer is given as padding_idx
    vectorized_seqs = []
    list_label = []
    for text, label in batch:
        vectorized_seqs.append(text_pipeline(text)[:max_len])
        list_label.append(label)

    seq_tensor = torch.full((len(batch), max_len), pad_index, dtype=torch.int64)
    for row, seq in enumerate(vectorized_seqs):
        seq_tensor[row, :len(seq)] = torch.tensor(seq, dtype=torch.int64)

    # Stack the labels into one ndarray first: building a tensor straight from
    # a Python list of ndarrays converts element by element.
    labels = torch.as_tensor(np.asarray(list_label), dtype=torch.int64)
    return seq_tensor.to(device), labels.to(device)

In [19]:
from torch import nn
from torch import autograd
from torch.nn import functional as F
class TextClassificationModel(nn.Module):
    """Single-layer bidirectional LSTM that outputs one probability per label.

    The input is a batch of right-padded index sequences. Sequences are
    packed on their true length before entering the LSTM, so the final hidden
    states are taken at the last real token and the prediction does not
    depend on how much padding a batch carries.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size per direction.
        num_class: number of output labels.
        batch_size: stored on the instance for backward compatibility; not
            used by ``forward``.
        pad_idx: index used for padding in the input (default 1).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class, batch_size, pad_idx=1):
        super().__init__()
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(input_size=embed_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        # Forward and backward final states are concatenated, hence 2 * hidden_dim.
        self.fc = nn.Linear(2 * hidden_dim, num_class)

    def _sequence_lengths(self, text):
        """Return, per row, the 1-based position of the last non-padding index (at least 1)."""
        positions = torch.arange(1, text.size(1) + 1, device=text.device)
        last_real = ((text != self.pad_idx) * positions).max(dim=1).values
        # pack_padded_sequence rejects zero lengths; an all-padding row keeps one step.
        return last_real.clamp(min=1)

    def forward(self, text):
        """Map an int64 tensor of shape (batch, seq_len) to probabilities of shape (batch, num_class)."""
        embed_text = self.embedding(text)
        # Lengths must live on the CPU for pack_padded_sequence.
        lengths = self._sequence_lengths(text).cpu()
        packed = pack_padded_sequence(embed_text, lengths,
                                      batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        # hn[0] is the forward direction's final state, hn[1] the backward one.
        x = torch.cat((hn[0, :, :], hn[1, :, :]), 1)
        return torch.sigmoid(self.fc(x))

In [20]:
# Embedding size 100, hidden size 50 per direction, 8 passed as batch_size.
# The parameters are moved to `device` because collate_batch places every batch there.
model = TextClassificationModel(vocab_size, 100, 50, num_class, 8).to(device)

In [30]:
# NOTE(review): `toxic_train` is only assigned in a later cell, and ToxicCommentDataset stores
# its texts as `X`, not `X_` — this cell looks stale; confirm before re-running.
toxic_train.X_


0         explanation why the edits made under my userna...
1         d aww he matches this background colour i m se...
2         hey man i m really not trying to edit war it s...
3         more i can t make any real suggestions on impr...
4         you sir are my hero any chance you remember wh...
                                ...                        
159566    and for the second time of asking when your vi...
159567    you should be ashamed of yourself that is a ho...
159568    spitzer umm theres no actual article for prost...
159569    and it looks like it was actually you who put ...
159570    and i really don t think you understand i came...
Name: comment_text, Length: 159571, dtype: object

In [143]:
toxic_train = ToxicCommentDataset(X_train, y_train)
dataloaders = DataLoader(toxic_train, batch_size=8, shuffle=False, collate_fn=collate_batch)
print(dataloaders)

criterion1 = nn.BCELoss()
criterion2 = nn.CrossEntropyLoss()

# Sanity check on the first batch only: threshold the outputs at 0.5 and
# compare them label by label with the targets. No gradients are needed for
# an inspection pass, so the forward call runs under no_grad.
u, l = next(iter(dataloaders))
with torch.no_grad():
    outputs = model(u)
preds2 = torch.where(outputs < 0.5, 0, 1).float()
print((preds2 == l).float())
print((preds2 == l))
print(l, preds2)


<torch.utils.data.dataloader.DataLoader object at 0x7f94b7551710>
tensor([[1., 1., 0., 1., 1., 1.],
        [1., 0., 0., 1., 0., 1.],
        [1., 1., 0., 1., 1., 1.],
        [1., 1., 0., 1., 1., 1.],
        [1., 1., 0., 0., 1., 1.],
        [1., 1., 0., 1., 1., 1.],
        [0., 0., 1., 1., 0., 1.],
        [1., 1., 0., 1., 1., 1.]], device='cuda:0')
tensor([[ True,  True, False,  True,  True,  True],
        [ True, False, False,  True, False,  True],
        [ True,  True, False,  True,  True,  True],
        [ True,  True, False,  True,  True,  True],
        [ True,  True, False, False,  True,  True],
        [ True,  True, False,  True,  True,  True],
        [False, False,  True,  True, False,  True],
        [ True,  True, False,  True,  True,  True]], device='cuda:0')
tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 0,

In [None]:
# Shape check with NumPy: the two (2, 10) slices taken along axis 0 of a (2, 2, 10) array,
# concatenated along axis 1, give a (2, 20) array.
x=np.arange(40).reshape(2,2,10)
print(x.shape)
print(x[0,:,:],x[1,:,:])
print(np.concatenate((x[0,:,:],x[1,:,:]),axis=1).shape)

(2, 2, 10)
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]] [[20 21 22 23 24 25 26 27 28 29]
 [30 31 32 33 34 35 36 37 38 39]]
(2, 20)


In [105]:
from __future__ import print_function
from __future__ import division
import argparse
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torch.autograd import Variable
from torchvision import datasets, models, transforms
from tqdm import tqdm
import numpy as np
import torchvision
import matplotlib.pyplot as plt
import time
import os
import copy
from numba import cuda
from tqdm import tqdm  

In [118]:
11//3

3

In [117]:
#Now we set our training function 
def train_model(model, trainloader, valloader, optimizer, num_epochs=9):

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        running_train_loss = 0.0
        running_val_loss = 0.0
        running_corrects = 0
        model.train()
        # Iterate over data.
        for j,(inputs, labels) in enumerate(trainloader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                outputs = model(inputs)
                loss = criterion(outputs, labels.float())
        
                      #running_corrects += torch.sum(preds == labels.data[:,z])
                    # backward + optimize only if in training phase
                loss.backward()
                optimizer.step()
                running_train_loss += loss.item()
                if j%4000==0:
                  model.eval()   
                  with torch.no_grad():   
                    for val_inp,val_lab in valloader:
                      val_out = model(val_inp)
                      loss = criterion(val_out, val_lab.float())
                      running_val_loss += loss.item() * inputs.size(0)
                      preds=torch.where(outputs<0.5,0,1)
                      map=((preds==val_lab).float().mean()
        epoch_loss = running_loss / len(dataloaders.dataset)
        epoch_acc = running_corrects.double() / len(dataloaders.dataset)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == val and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            model_file = model_name + '/model_' + str(epoch) + '.pth'
            print('Saved model to ' + model_file + '. You can run `python evaluate.py --model ' + model_file + '` to generate the Kaggle formatted csv file\n')
            print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
 
# Loss: binary cross-entropy on the sigmoid outputs.
criterion = nn.BCELoss()

# The model has to sit on the same device as the batches built by collate_batch.
model = model.to(device)

# Optimizer: stochastic gradient descent with momentum.
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
#optimizer_ft_2 = optim.SGD(model_ft_2.parameters(), lr=0.001, momentum=0.9)

# Hold out 10% of the comments for validation; the fixed seed gives the same split on every run.
full_dataset = ToxicCommentDataset(X_train, y_train)
n_val = len(full_dataset) // 10
train_subset, val_subset = torch.utils.data.random_split(
    full_dataset,
    [len(full_dataset) - n_val, n_val],
    generator=torch.Generator().manual_seed(0),
)
train_loader = DataLoader(train_subset, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_subset, batch_size=8, shuffle=False, collate_fn=collate_batch)

# train_model takes (model, trainloader, valloader, optimizer); the previous call
# passed `criterion` in the trainloader slot and no validation loader at all.
model = train_model(model, train_loader, val_loader, optimizer_ft, num_epochs=10)
#model_ft_2 = train_model(model_ft_2, criterion, optimizer_ft_2, num_epochs=10)

#For each epoch, we will have the training loss and accuracy, as well as the validation loss and accuracy 

IndentationError: ignored