In [1]:
import pandas as pd
import nltk
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# /teamspace/studios/this_studio/sentiment_analysis-project
train_data = './sentiment_analysis-project/data/train.csv'
test_data = './sentiment_analysis-project/data/test.csv'
valid_data = './sentiment_analysis-project/data/validation.csv'

file = './sentiment_analysis-project/multiclass_dataset.csv'

def save_load_df(file:str):
    if os.path.exists(file):
        df = pd.read_csv(file, index_col= 0)
    else:
        df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
        df.to_csv(file, columns= ['id', 'text', 'label', 'sentiment'])
        df = pd.read_csv(file, index_col= 0)
    return df

df = save_load_df(file=file)
df

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative
...,...,...,...,...
41639,10277,Fuck no internet damn time warner!,0,negative
41640,8610,Looking forward to android 1.5 being pushed t...,1,neutral
41641,8114,Not good. Wasted time.,0,negative
41642,3034,"U were great, as always. But, can`t we do an ...",2,positive


## TP1, TP2

In [3]:
class PreProcess():
    def __init__(self):
        self.text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'(#|@)\w+'  # @ and # words
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
        )
        self.emoji_pattern = re.compile(
            "[\U0001F600-\U0001F64F"  # Emoticons
            "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
            "\U0001F700-\U0001F77F"  # Alchemical Symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"  # Enclosed characters
            "]+",
            flags=re.UNICODE,
        )
        self.punctuation = set(string.punctuation)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = self.text_pattern.sub('', str(text))
        text = self.emoji_pattern.sub('', str(text))
        text = text.lower()
        return text
    
    def get_tokens(self, text):
        sentences = nltk.sent_tokenize(text)
        tokens = []
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            for word in words:
                if word not in self.stop_words:
                    # clean all the punctuation and the StopWords
                    word = ''.join([c for c in word if c not in self.punctuation])
                    if word == '':
                        continue
                    tokens.append(word)
        return tokens

    def lemmetize_with_pos(self, tokens):
        pos_tags = nltk.pos_tag(tokens)
        lemmes = [] 
        pos_tag = []
        for token, pos in pos_tags:
            if pos.startswith('J'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'a')
            elif pos.startswith('V'):
                lemma =  self.lemmatizer.lemmatize(token, pos = 'v')
            elif pos.startswith('RB'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'r')
            elif pos.startswith('N'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'n')
            else:
                lemma = self.lemmatizer.lemmatize(token)
            lemmes.append(lemma)
            pos_tag.append(pos)
        return lemmes, pos_tag
    
    def get_lemmes(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, _ = self.lemmetize_with_pos(tokens)
        return lemmes
    
    def visualize_data(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, pos_tag = self.lemmetize_with_pos(tokens)
        data = [[token, lemme, pos] for token, lemme, pos in zip(tokens, lemmes, pos_tag)]
        return data
    
texts = list(df['text'])
labels = list(df['label'])
process_text = PreProcess()

In [4]:
corpus = pd.DataFrame(data = [(process_text.get_lemmes(text), label) for text, label in zip(texts, labels)], columns=['sentence', 'label'])
corpus

Unnamed: 0,sentence,label
0,"[cook, microwave, pizza, yummy]",2
1,"[plan, allow, sub, task, show, widget]",1
2,"[love, humor, reword, like, say, group, therap...",2
3,"[naw, idk, ur, talkin]",1
4,"[suck, hear, hate, day, like]",0
...,...,...
41639,"[fuck, internet, damn, time, warner]",0
41640,"[look, forward, android, 15, push, g1]",1
41641,"[good, waste, time]",0
41642,"[u, great, always, east, germany, noko, least,...",2


In [5]:
# drop the empty rows
corpus = corpus[corpus['sentence'].str.len() != 0]
corpus = corpus.reset_index(drop = True)
corpus

Unnamed: 0,sentence,label
0,"[cook, microwave, pizza, yummy]",2
1,"[plan, allow, sub, task, show, widget]",1
2,"[love, humor, reword, like, say, group, therap...",2
3,"[naw, idk, ur, talkin]",1
4,"[suck, hear, hate, day, like]",0
...,...,...
41502,"[fuck, internet, damn, time, warner]",0
41503,"[look, forward, android, 15, push, g1]",1
41504,"[good, waste, time]",0
41505,"[u, great, always, east, germany, noko, least,...",2


In [6]:
import numpy as np
import io

fasttext_file = './sentiment_analysis-project/wiki-news-300d-1M-subword.vec'

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

fasttext_model = load_vectors(fasttext_file)

In [7]:
sentences = list(corpus['sentence'])

vocab = set(fasttext_model.keys())
def sentence_to_vec(sentence):
    sentence_to_vec = [np.array(list(fasttext_model[word])) for word in sentence if word in fasttext_model]
    return sentence_to_vec

df1 = pd.DataFrame(data = [(sentence_to_vec(sentence), label) for sentence, label in zip(sentences, list(corpus['label']))], 
                   columns=['vec', 'label'])

df1

Unnamed: 0,vec,label
0,"[[-0.0501, -0.0601, -0.0258, 0.0095, 0.0216, -...",2
1,"[[0.0492, 0.0081, -0.0267, 0.0038, -0.0188, -0...",1
2,"[[0.0022, 0.0199, 0.0331, 0.0033, -0.0508, 0.0...",2
3,"[[-0.0342, -0.0388, -0.0075, 0.0027, -0.0401, ...",1
4,"[[-0.0187, -0.0129, -0.0114, -0.0113, -0.0239,...",0
...,...,...
41502,"[[], [], [], [], []]",0
41503,"[[], [], [], [], [], []]",1
41504,"[[], [], []]",0
41505,"[[], [], [], [], [], [], [], [], [], []]",2


In [8]:
df1 = df1[df1['vec'].str.len() != 0]
df1 = df1.sort_values(by = 'vec', key = lambda vec: vec.str.len(), ascending=False)
df1 = df1.reset_index(drop= True)
df1

Unnamed: 0,vec,label
0,"[[], [], [], [], [], [], [], [], [], [], [], [...",1
1,"[[0.0065, -0.014, 0.0056, -0.0009, 0.0008, 0.0...",1
2,"[[], [], [], [], [], [], [], [], [], [], [], [...",2
3,"[[], [], [], [], [], [], [], [], [], [], [], [...",2
4,"[[], [], [], [], [], [], [], [], [], [], [], [...",2
...,...,...
41381,[[]],1
41382,[[]],1
41383,[[]],1
41384,[[]],2


In [None]:
# [len(vec) for vec in list(df1['vec']) if len(vec) <= 64]
df1 = df1[df1['vec'].str.len() <= 64]
df1 = df1.reset_index(drop= True)
df1

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

X = pad_sequence([torch.from_numpy(np.array(vec)) for vec in list(df1['vec'])], batch_first=True)
X.shape

In [None]:
Y = torch.from_numpy(np.array(list(df1['label'])))
Y.shape

In [None]:
df1['label'].value_counts()


In [None]:
from sklearn.model_selection import train_test_split

x_train, x_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=42, shuffle=True)
x_valid, x_test, y_valid, y_test = train_test_split(x_temp, y_temp, test_size=0.5, train_size=0.5, random_state=42, shuffle=True)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
valid_dataset = TensorDataset(x_valid, y_valid)

batch_size = 64
train_dl = DataLoader(train_dataset, batch_size= batch_size, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_dataset, batch_size = batch_size, shuffle=False, drop_last=True)
test_dl  = DataLoader(test_dataset, batch_size= batch_size, shuffle=False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# (258, 195, 300)
class Model(nn.Module):
    def __init__(self,
                 embed_dim = 300,
                 hidden_size = 64,
                 bidirectional = False,
                 num_layers = 2,
                 dropout = 0.2
        ):
        super(Model, self).__init__()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size= embed_dim,
                            hidden_size = hidden_size,
                            num_layers=num_layers,
                            bias= False,
                            bidirectional=bidirectional,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0
        )
        self.D = 1 if self.bidirectional == False else 2
        self.fc2 = nn.Linear(self.D * hidden_size , 3)

    def forward(self, x):
        B, T, C = x.shape
        out, (h1, c1) = self.lstm(x) 
        out = F.softmax(self.fc2(out), dim = -1)
        return out

torch.manual_seed(42) # for the distribution of the gradient

torch.set_float32_matmul_precision('high')

model = Model()
model.to(device=device)
# model = torch.compile(model)

optimizer = optim.Adam(model.parameters(), lr = 3e-4)
print(sum(p.numel() for p in model.parameters()),'parameters')

import time
num_iter = 200
for i in range(num_iter):
    train_loss = 0
    train_acc = 0
    t1 = time.time()
    model.train()
    for xb_train, yb_train in train_dl:
        optimizer.zero_grad()
        # forward pass
        xb_train = xb_train.to(device)
        yb_train = yb_train.to(device)
        with torch.autocast(device_type = 'cuda', dtype = torch.float16):
            logits= model(xb_train)[:, -1, :]
            loss = F.cross_entropy(logits, yb_train)
        # backward pass
        loss.backward()
        train_loss += loss.item() * batch_size
        train_acc += (torch.argmax(logits, dim = 1) == yb_train).float().sum().item()
        # update the gradient
        optimizer.step()

    train_loss /= len(train_dl.dataset)
    train_acc /= len(train_dl.dataset)
     
    model.eval()
    valid_loss = 0
    valid_acc = 0
    with torch.no_grad():
        for xb_valid, yb_valid in valid_dl:
            xb_valid = xb_valid.to(device)
            yb_valid = yb_valid.to(device)
            logits = model(xb_valid)[:, -1, :]
            loss = F.cross_entropy(logits, yb_valid)
            valid_loss += loss.item() * batch_size
            valid_acc += (torch.argmax(logits, dim = 1) == yb_valid).float().sum().item()
    
    valid_loss /= len(valid_dl.dataset)
    valid_acc /= len(valid_dl.dataset)
    t2 = time.time()
    dt = (t2 - t1)
    print(f'epoch {i} || train_loss {train_loss:.3f}, valid_loss {valid_loss:.3f}, train_accu {train_acc:.3f} , valid_accu {valid_acc:.3f}, dt= {dt:.2f}')

In [None]:
test_acc = 0
with torch.no_grad():
    for xb_test, yb_test in test_dl:
        xb_test = xb_test.to(device)
        yb_test = yb_test.to(device)
        logits = model(xb_test)[:, -1, :]
        test_acc += (torch.argmax(logits, dim = 1) == yb_test).float().sum().item()

test_acc /= len(test_dl.dataset)
test_acc