In [1]:
import pandas as pd
import nltk
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [3]:
!ls

DQL-Example  main.py  nltk_data  sentiment_analysis-project


In [4]:

# train_data = path + '/train.csv'
# test_data = path + '/test.csv'
# valid_data = path + '/test.csv'

file = "./sentiment_analysis-project/multiclass_dataset.csv"

# df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
df = pd.read_csv(file, index_col= 0)
df

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative
...,...,...,...,...
41639,10277,Fuck no internet damn time warner!,0,negative
41640,8610,Looking forward to android 1.5 being pushed t...,1,neutral
41641,8114,Not good. Wasted time.,0,negative
41642,3034,"U were great, as always. But, can`t we do an ...",2,positive


## TP1, TP2

In [5]:
class PreProcess():
    def __init__(self):
        self.text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
        )
        self.emoji_pattern = re.compile(
            "[\U0001F600-\U0001F64F"  # Emoticons
            "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
            "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
            "\U0001F700-\U0001F77F"  # Alchemical Symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"  # Enclosed characters
            "]+",
            flags=re.UNICODE,
        )
        self.punctuation = set(string.punctuation)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = self.text_pattern.sub('', str(text))
        text = self.emoji_pattern.sub('', str(text))
        text = text.lower()
        return text

    def get_tokens(self, text):
        sentences = nltk.sent_tokenize(text)
        tokens = []
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            for word in words:
                if word not in self.stop_words:
                    word = ''.join([c for c in word if c not in self.punctuation])
                    if word == '':
                        continue
                    tokens.append(word)
        return tokens

    def lemmetize_with_pos(self, tokens):
        pos_tags = nltk.pos_tag(tokens)
        lemmes = []
        pos_tag = []
        for token, pos in pos_tags:
            if pos.startswith('J'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'a')
            elif pos.startswith('V'):
                lemma =  self.lemmatizer.lemmatize(token, pos = 'v')
            elif pos.startswith('RB'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'r')
            elif pos.startswith('N'):
                lemma = self.lemmatizer.lemmatize(token, pos = 'n')
            else:
                lemma = self.lemmatizer.lemmatize(token)
            lemmes.append(lemma)
            pos_tag.append(pos)
        return lemmes, pos_tag

    def get_lemmes(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, _ = self.lemmetize_with_pos(tokens)
        return lemmes

    def visualize_data(self, text):
        text = self.clean_text(text)
        tokens = self.get_tokens(text)
        lemmes, pos_tag = self.lemmetize_with_pos(tokens)
        data = [[token, lemme, pos] for token, lemme, pos in zip(tokens, lemmes, pos_tag)]
        return data

texts = list(df['text'])
labels = list(df['label'])
process_text = PreProcess()

In [6]:
corpus = pd.DataFrame(data = [(process_text.get_lemmes(text), label) for text, label in zip(texts, labels)], columns=['sentence', 'label'])
corpus

Unnamed: 0,sentence,label
0,"[cook, microwave, pizza, yummy]",2
1,"[plan, allow, sub, task, show, widget]",1
2,"[love, humor, reword, like, say, group, therap...",2
3,"[naw, idk, ur, talkin]",1
4,"[suck, hear, hate, day, like]",0
...,...,...
41639,"[fuck, internet, damn, time, warner]",0
41640,"[look, forward, android, 15, push, g1]",1
41641,"[good, waste, time]",0
41642,"[u, great, always, east, germany, noko, least,...",2


In [7]:
# drop the empty rows , and sort the DF by length of the text
corpus = corpus[corpus['sentence'].str.len() != 0]
corpus = corpus.sort_values(by = 'sentence', key = lambda sent: sent.str.len(), ascending=False)
corpus = corpus.reset_index(drop = True)
corpus

Unnamed: 0,sentence,label
0,"[true, nonpremium, version, soon, loose, way, ...",1
1,"[rereview, could, best, habit, tracker, goal, ...",1
2,"[one, major, shortcoming, app, habit, tracker,...",2
3,"[one, major, shortcoming, app, habit, tracker,...",2
4,"[bought, pro, version, week, ago, great, conce...",2
...,...,...
41502,[byeeeee],1
41503,[shut],0
41504,[twappy],1
41505,[sorry],2


In [11]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

# model_name = 'bert-base-uncased'
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

sentences = list(corpus['sentence'])
sentences = [' '.join(sentence) for sentence in sentences]
data = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

labels = list(corpus['label'])
labels = torch.tensor(labels, dtype = torch.long)
data['labels'] = labels

train_data, valid_data, test_data = {}, {}, {}
for k, v in data.items():
    train_data[k], test_data[k] = train_test_split(v, test_size=0.3, train_size=0.7, random_state=42, shuffle=True)
    test_data[k], valid_data[k] = train_test_split(test_data[k], test_size=0.5, train_size=0.5, random_state=42, shuffle=True)

class DictDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.data['input_ids'][idx],
            'attention_mask': self.data['attention_mask'][idx],
            # 'token_type_ids': self.data['token_type_ids'][idx]
              }, self.data['labels'][idx]

train_data = DictDataset(train_data)
valid_data = DictDataset(valid_data)
test_data = DictDataset(test_data)

batch_size = 32
train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_data, batch_size=batch_size, shuffle=True, drop_last=True)
test_dl = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

print(sum([p.numel() for p in bert_model.parameters()])// 1e6, 'M')


66.0 M


In [12]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = 'cuda' if torch.cuda.is_available() else 'cpu'

class LSTModel(nn.Module):
    def __init__(self,
                 embed_dim = 768,
                 hidden_size = 64,
                 bidirectional = False,
                 num_layers = 1,
                 dropout = 0.4
        ):
        super(LSTModel, self).__init__()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size= embed_dim,
                            hidden_size = hidden_size,
                            num_layers=num_layers,
                            bias= False,
                            bidirectional=bidirectional,
                            batch_first=True,
                            # dropout=dropout
        )
        self.D = 1 if self.bidirectional == False else 2
        self.fc2 = nn.Linear(self.D * num_layers * hidden_size, 3)

    def forward(self, x):
        B, T, C = x.shape
        h0 = torch.zeros((self.D, self.num_layers, B, self.hidden_size), device=device).view(-1, B, self.hidden_size)
        c0 = torch.zeros((self.D, self.num_layers, B, self.hidden_size), device=device).view(-1, B, self.hidden_size)
        out, (h1, c1) = self.lstm(x, (h0, c0))
        out = F.softmax(self.fc2(out), dim = -1)
        return out

torch.manual_seed(1337) # for the distribution of the gradient
bert_model.to(device)

torch.set_float32_matmul_precision('high')

model = LSTModel()
model.to(device=device)

optimizer = optim.Adam(model.parameters(), lr = 3e-4)
print(sum(p.numel() for p in model.parameters()),'parameters')



213187 parameters


In [9]:
import time

for i in range(100):
    t1 = time.time()
    train_loss = 0
    train_acc = 0

    model.train()
    for xb_train, yb_train in train_dl:
        x = {k: v.to(device) for k, v in xb_train.items()}
        y = yb_train.to(device)

        with torch.no_grad():
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                embeddings = bert_model(**x)
                embeddings = embeddings.last_hidden_state

        optimizer.zero_grad()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(embeddings)[:, -1, :]
            loss = F.cross_entropy(logits, y)
        loss.backward()
        train_loss += loss.item() * y.size(0)
        train_acc += (torch.argmax(logits, dim = 1) == y).float().sum().item()
        optimizer.step()
        break
    train_loss /= len(train_dl.dataset)
    train_acc /= len(train_dl.dataset)
    t2 = time.time()
    dt = (t2 - t1)* 1000
    print(f'epoch {i} || train_loss {train_loss:4f}, train_accu {train_acc:4f}, time {dt:.2f}ms')



epoch 0 || train_loss 0.001214, train_accu 0.000207, time 5734.94ms
epoch 1 || train_loss 0.001197, train_accu 0.000516, time 4322.61ms
epoch 2 || train_loss 0.001212, train_accu 0.000310, time 4190.39ms
epoch 3 || train_loss 0.001213, train_accu 0.000344, time 4166.23ms
epoch 4 || train_loss 0.001211, train_accu 0.000379, time 4119.93ms
epoch 5 || train_loss 0.001209, train_accu 0.000310, time 4043.61ms
epoch 6 || train_loss 0.001196, train_accu 0.000447, time 4211.08ms
epoch 7 || train_loss 0.001218, train_accu 0.000344, time 4188.95ms


KeyboardInterrupt: 

In [None]:
# for i in range(100):
#     model.train()
#     train_loss = 0
#     train_acc = 0
#     for xb_train, yb_train in train_dl:
#         optimizer.zero_grad(set_to_none=False)

#         # forward pass
#         xb_train = xb_train.to(device)
#         yb_train = yb_train.to(device)
#         logits= model(xb_train)[:, -1, :]
#         loss = F.cross_entropy(logits, yb_train)

#         # backward pass
#         loss.backward()
#         train_loss += loss.item() * batch_size
#         train_acc += (torch.argmax(logits, dim = 1) == yb_train).float().sum().item()
#         # update the gradient
#         optimizer.step()

#     train_loss /= len(train_dl.dataset)
#     train_acc /= len(train_dl.dataset)

#     model.eval()
#     with torch.no_grad():
#         valid_loss = 0
#         valid_acc = 0
#         for xb_valid, yb_valid in valid_dl:
#             xb_train = xb_train.to(device)
#             yb_train = yb_train.to(device)
#             logits = model(xb_valid)[:, -1, :]
#             loss = F.cross_entropy(logits, yb_valid)
#             valid_loss += loss.item() * batch_size
#             valid_acc += (torch.argmax(logits, dim = 1) == yb_train).float().sum().item()

#     valid_loss /= len(valid_dl.dataset)
#     valid_acc /= len(valid_dl.dataset)
#     if i % 10 == 0:
#         print(f'epoch {i} || train_loss {train_loss:4f}, valid_loss {valid_loss:4f}, train_accu {train_acc:4f} , valid_accu {valid_acc:4f} ')
# print(f'epoch {i} || train_loss {train_loss:4f}, valid_loss {valid_loss:4f}, train_accu {train_acc:4f} , valid_accu {valid_acc:4f} ')