In [1]:
import pandas as pd
import nltk
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [2]:
train_data = 'data/train.csv'
test_data = 'data/test.csv'
valid_data = 'data/test.csv'

file = "multiclass_dataset.csv"

def save_load_df(file:str):
    if os.path.exists(file):
        df = pd.read_csv(file, index_col= 0)
    else:
        df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
        df.to_csv(file, columns= ['id', 'text', 'label', 'sentiment'])
        df = pd.read_csv(file, index_col= 0)
    return df

df = save_load_df(file=file)
df

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative
...,...,...,...,...
41639,10277,Fuck no internet damn time warner!,0,negative
41640,8610,Looking forward to android 1.5 being pushed t...,1,neutral
41641,8114,Not good. Wasted time.,0,negative
41642,3034,"U were great, as always. But, can`t we do an ...",2,positive


## TP1, TP2

In [3]:
class PreProcess():
    def __init__(self):
        self.text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
        )
        self.emoji_pattern = re.compile(
            "[\U0001F600-\U0001F64F"  # Emoticons
            "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
            "\U0001F700-\U0001F77F"  # Alchemical Symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"  # Enclosed characters
            "]+",
            flags=re.UNICODE,
        )
        self.punctuation = set(string.punctuation)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = self.text_pattern.sub('', str(text))
        text = self.emoji_pattern.sub('', str(text))
        text = text.lower()
        return text
    
    def get_tokens(self, text):
        sentences = nltk.sent_tokenize(text)
        tokens = []
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            for word in words:
                if word not in self.stop_words:
                    word = ''.join([c for c in word if c not in self.punctuation])
                    if word == '':
                        continue
                    tokens.append(word)
        return tokens

    def lemmetize_with_pos(self, tokens):
        pos_tags = nltk.pos_tag(tokens)
        lemmes = [] 
        pos_tag = []
        for token, pos in pos_tags:
            if pos.startswith('J'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'a')
            elif pos.startswith('V'):
                lemma =  self.lemmatizer.lemmatize(token, pos = 'v')
            elif pos.startswith('RB'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'r')
            elif pos.startswith('N'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'n')
            else:
                lemma = self.lemmatizer.lemmatize(token)
            lemmes.append(lemma)
            pos_tag.append(pos)
        return lemmes, pos_tag
    
    def get_lemmes(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, _ = self.lemmetize_with_pos(tokens)
        return lemmes
    
    def visualize_data(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, pos_tag = self.lemmetize_with_pos(tokens)
        data = [[token, lemme, pos] for token, lemme, pos in zip(tokens, lemmes, pos_tag)]
        return data
    
texts = list(df['text'])
labels = list(df['label'])
process_text = PreProcess()

In [4]:
corpus = pd.DataFrame(data = [(process_text.get_lemmes(text), label) for text, label in zip(texts, labels)], columns=['sentence', 'label'])
corpus

Unnamed: 0,sentence,label
0,"[cook, microwave, pizza, yummy]",2
1,"[plan, allow, sub, task, show, widget]",1
2,"[love, humor, reword, like, say, group, therap...",2
3,"[naw, idk, ur, talkin]",1
4,"[suck, hear, hate, day, like]",0
...,...,...
41639,"[fuck, internet, damn, time, warner]",0
41640,"[look, forward, android, 15, push, g1]",1
41641,"[good, waste, time]",0
41642,"[u, great, always, east, germany, noko, least,...",2


In [5]:
# drop the empty rows , and sort the DF by length of the text 
corpus = corpus[corpus['sentence'].str.len() != 0]
corpus = corpus.sort_values(by = 'sentence', key = lambda sent: sent.str.len(), ascending=False)
corpus = corpus.reset_index(drop = True)
corpus

Unnamed: 0,sentence,label
0,"[true, nonpremium, version, soon, loose, way, ...",1
1,"[rereview, could, best, habit, tracker, goal, ...",1
2,"[one, major, shortcoming, app, habit, tracker,...",2
3,"[one, major, shortcoming, app, habit, tracker,...",2
4,"[bought, pro, version, week, ago, great, conce...",2
...,...,...
41502,[laggy],1
41503,[ibood],1
41504,[miss],0
41505,[home],1


In [6]:
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from gensim.models import KeyedVectors

sentences = list(corpus['sentence'])
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary= True)

def sentence_to_vec(sentence):
    sentence_to_vec = [word2vec_model[word] for word in sentence if word in word2vec_model]
    return sentence_to_vec

df1 = pd.DataFrame(data = [(sentence_to_vec(sentence), label) for sentence, label in zip(sentences, list(corpus['label']))], 
                   columns=['vec', 'label'])

df1 = df1[df1['vec'].str.len() != 0]
df1 = df1.sort_values(by = 'vec', key = lambda vec: vec.str.len(), ascending=False)
df1 = df1.reset_index(drop= True)
df1

Unnamed: 0,vec,label
0,"[[0.12792969, 0.047851562, 0.106933594, 0.0017...",1
1,"[[0.123535156, 0.031982422, 0.15039062, 0.1523...",1
2,"[[0.045654297, -0.14550781, 0.15625, 0.1660156...",2
3,"[[0.045654297, -0.14550781, 0.15625, 0.1660156...",2
4,"[[0.16699219, -0.05419922, -0.087402344, 0.019...",2
...,...,...
41352,"[[0.040527344, 0.0625, -0.017456055, 0.0786132...",0
41353,"[[-0.05102539, 0.045898438, -0.2734375, -0.259...",1
41354,"[[-0.07763672, -0.15722656, -0.15136719, 0.292...",1
41355,"[[-0.1796875, 0.057128906, 0.14160156, -0.0771...",2


In [7]:
# [len(vec) for vec in list(df1['vec']) if len(vec) <= 64]
df1 = df1[df1['vec'].str.len() <= 64]
df1 = df1.reset_index(drop= True)
df1

Unnamed: 0,vec,label
0,"[[-0.31835938, 0.059570312, -0.22949219, 0.087...",1
1,"[[0.048583984, 0.14355469, 0.22851562, 0.26953...",1
2,"[[0.048583984, 0.14355469, 0.22851562, 0.26953...",1
3,"[[0.040527344, 0.0625, -0.017456055, 0.0786132...",0
4,"[[-0.10546875, -0.13574219, -0.12402344, 0.042...",0
...,...,...
41299,"[[0.040527344, 0.0625, -0.017456055, 0.0786132...",0
41300,"[[-0.05102539, 0.045898438, -0.2734375, -0.259...",1
41301,"[[-0.07763672, -0.15722656, -0.15136719, 0.292...",1
41302,"[[-0.1796875, 0.057128906, 0.14160156, -0.0771...",2


In [8]:
X = pad_sequence([torch.from_numpy(np.array(vec)) for vec in list(df1['vec'])], batch_first=True)
X.shape

torch.Size([41304, 64, 300])

In [9]:
Y = torch.from_numpy(np.array(list(df1['label'])))
Y.shape

torch.Size([41304])

In [10]:
df1['label'].value_counts()


label
1    15302
2    13862
0    12140
Name: count, dtype: int64

In [11]:
(41644 - 41304) # dropped rows 

340

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=42, shuffle=True)
x_valid, x_test, y_valid, y_test = train_test_split(x_temp, y_temp, test_size=0.5, train_size=0.5, random_state=42, shuffle=True)

In [13]:
from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(42)
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
valid_dataset = TensorDataset(x_valid, y_valid)

batch_size = 16
train_dl = DataLoader(train_dataset, batch_size= batch_size, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_dataset, batch_size = batch_size, shuffle=True, drop_last=True)
test_dl  = DataLoader(test_dataset, batch_size= batch_size, shuffle=True, drop_last= True)

In [17]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class Model(nn.Module):
    def __init__(self, 
                 embed_dim = 300, 
                 hidden_size = 128, 
                 bidirectional = True,
                 num_layers = 2,
                 dropout = 0.4
        ):
        super(Model, self).__init__()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size= embed_dim, 
                            hidden_size = hidden_size, 
                            num_layers=num_layers,
                            bias= False, 
                            bidirectional=bidirectional, 
                            batch_first=True, 
                            dropout=dropout
        )
        self.fc2 = nn.Linear(num_layers * hidden_size, 3)

    def forward(self, x):
        B, T, C = x.shape
        D = 1 if self.bidirectional == False else 2
        h0 = torch.zeros((D, self.num_layers, B, self.hidden_size), device=device).view(-1, B, self.hidden_size)
        c0 = torch.zeros((D, self.num_layers, B, self.hidden_size), device=device).view(-1, B, self.hidden_size)
        out, (h1, c1) = self.lstm(x, (h0, c0))
        out = F.softmax(self.fc2(out), dim = -1)
        return out

In [18]:
torch.manual_seed(1337) # for the distribution of the gradient
model = Model()
model.to(device=device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
print(sum(p.numel() for p in model.parameters()),'parameters')

832259 parameters


In [19]:
for i in range(100):
    model.train()
    train_loss = 0
    train_acc = 0
    for xb_train, yb_train in train_dl:
        optimizer.zero_grad(set_to_none=False)

        # forward pass
        xb_train = xb_train.to(device)
        yb_train = yb_train.to(device)
        logits= model(xb_train)[:, -1, :]
        loss = F.cross_entropy(logits, yb_train)

        # backward pass
        loss.backward()
        train_loss += loss.item() * batch_size
        train_acc += (torch.argmax(logits, dim = 1) == yb_train).float().sum().item()
        # update the gradient
        optimizer.step()

    train_loss /= len(train_dl.dataset)
    train_acc /= len(train_dl.dataset)

    model.eval()
    with torch.no_grad():
        valid_loss = 0
        valid_acc = 0
        for xb_valid, yb_valid in valid_dl:
            xb_train = xb_train.to(device)
            yb_train = yb_train.to(device)
            logits = model(xb_valid)[:, -1, :]
            loss = F.cross_entropy(logits, yb_valid)
            valid_loss += loss.item() * batch_size
            valid_acc += (torch.argmax(logits, dim = 1) == yb_train).float().sum().item()
    
    valid_loss /= len(valid_dl.dataset)
    valid_acc /= len(valid_dl.dataset)
    if i % 10 == 0:
        print(f'epoch {i} || train_loss {train_loss:4f}, valid_loss {valid_loss:4f}, train_accu {train_acc:4f} , valid_accu {valid_acc:4f} ')
print(f'epoch {i} || train_loss {train_loss:4f}, valid_loss {valid_loss:4f}, train_accu {train_acc:4f} , valid_accu {valid_acc:4f} ')

epoch 0 || train_loss 1.095434, valid_loss 1.094753, train_accu 0.367309 , valid_accu 0.374818 


KeyboardInterrupt: 