In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext
import re
import string

# https://github.com/rsreetech/PyTorchTextClassificationCustomDataset/blob/main/PyTorchTweetTextClassification.ipynb

In [2]:
def remove_url(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` where each match of the URL pattern has been
        replaced by the empty string (surrounding whitespace is kept).
    """
    # Raw string literal: the previous non-raw literal contained the invalid
    # escape sequences "\(" and "\)", which trigger a DeprecationWarning
    # (SyntaxWarning on recent Python versions). The compiled pattern is
    # exactly the same.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('', text)

def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The previous character class contained the range U+24C2..U+1F251, which
    spans most of the Basic Multilingual Plane and therefore also deleted
    ordinary text (CJK ideographs, Hangul, full-width forms, ...). That range
    is replaced here by the specific emoji-related blocks, so non-emoji text
    is preserved. Coverage is intentionally not exhaustive.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the characters listed in the pattern.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled latin capital letter M
        "\U0001F200-\U0001F251"  # part of the enclosed ideographic supplement
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)

def clean_text(text):
    """Normalise a piece of text for tokenisation.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. split on whitespace;
      3. drop words that are purely digits or that have two characters or
         fewer;
      4. re-join the remaining words with single spaces and lower-case them.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    kept_words = []
    for word in without_punctuation.split():
        if word.isdigit() or len(word) <= 2:
            continue
        kept_words.append(word)

    return ' '.join(kept_words).lower()

# Dataset

In [107]:
class MyDataset(Dataset):
    """In-memory dataset of ``(text, target)`` pairs.

    Args:
        data: A 2-tuple ``(text, target)``. ``text`` is a sequence of strings.
            ``target`` is a sequence of labels of the same length, or ``None``
            (or an empty sequence) for unlabelled data, in which case every
            sample receives the placeholder label ``-1``.
        name: Display name printed by :meth:`get_stats`.

    Raises:
        ValueError: If a non-empty ``target`` does not have one label per text.
    """

    def __init__(self, data, name="dataset") -> None:
        super().__init__()
        self.name = name
        text, target = data
        self.text = np.array(text)
        self.n_samples = len(self.text)
        # Explicit None / emptiness check: the former `if target:` raised
        # "truth value of an array is ambiguous" for numpy arrays and pandas
        # Series holding more than one label.
        if target is None or len(target) == 0:
            self.target = (-1) * np.ones(self.n_samples, dtype=np.int32)
        else:
            self.target = np.array(target)
            if len(self.target) != self.n_samples:
                raise ValueError(
                    f"text and target must have the same length, "
                    f"got {self.n_samples} and {len(self.target)}"
                )

    def __getitem__(self, index):
        """Return the ``(text, target)`` pair stored at ``index``."""
        return (self.text[index], self.target[index])

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

    def get_stats(self):
        """Print the dataset name followed by the count of each target value."""
        print('--------------------')
        print(self.name)
        for val, nb in np.array(np.unique(self.target , return_counts=True)).T:
            print(f"{val}   {nb}")
        print('--------------------')


In [108]:
def splitTrainVal(path, valid_size, random_state=0):
    """Load the labelled CSV, clean its text and split it into train / validation.

    The ``id``, ``keyword`` and ``location`` columns are dropped, then the
    ``text`` column goes through ``remove_url``, ``remove_emoji`` and
    ``clean_text`` (in that order). The split is shuffled and stratified on
    the ``target`` column.

    Args:
        path: Path of the CSV file; it must contain the columns ``id``,
            ``keyword``, ``location``, ``text`` and ``target``.
        valid_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 0,
            the value that used to be hard-coded.

    Returns:
        A tuple ``(X_train, X_valid, Y_train, Y_valid)`` of lists.
    """
    # Non in-place drop: no frame is mutated after it has been created.
    df = pd.read_csv(path).drop(columns=["id", "keyword", "location"])
    for cleaner in (remove_url, remove_emoji, clean_text):
        df["text"] = df["text"].apply(cleaner)

    texts = df["text"].to_list()
    targets = df["target"].to_list()  # built once, reused for stratification

    X_train, X_valid, Y_train, Y_valid = train_test_split(
        texts,
        targets,
        test_size=valid_size,
        shuffle=True,
        stratify=targets,
        random_state=random_state,
    )
    return (X_train, X_valid, Y_train, Y_valid)

In [109]:
# Train and validation datasets (80 / 20 split of the labelled CSV)
X_train, X_valid, Y_train, Y_valid = splitTrainVal("data/train.csv", 0.2)
train_dataset = MyDataset((X_train, Y_train), name="Train")
val_dataset = MyDataset((X_valid, Y_valid), name="Val")

# Test dataset: same column drop and text cleaning as the training data,
# but no labels are passed (MyDataset receives None as target).
df_test = pd.read_csv("data/test.csv")
df_test.drop(columns=["id", "keyword", "location"], inplace=True)
df_test["text"] = (
    df_test["text"]
    .apply(remove_url)
    .apply(remove_emoji)
    .apply(clean_text)
)
test_dataset = MyDataset((df_test["text"].to_list(), None), name="Test")


In [110]:
# Print the label distribution of each split, in train / val / test order.
for split in (train_dataset, val_dataset, test_dataset):
    split.get_stats()

--------------------
Train
0   3473
1   2617
--------------------
--------------------
Val
0   869
1   654
--------------------
--------------------
Test
-1   3263
--------------------


# Tokenisation

In [5]:
# torchtext's 'basic_english' tokenizer, shared by every helper below.
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Yield the token list of each text in an iterable of (text, label) pairs."""
    yield from (tokenizer(text) for text, _label in data_iter)


# The vocabulary is built from the training split only; "<unk>" is the single
# special token and is also registered as the default index.
vocab = build_vocab_from_iterator(yield_tokens(train_dataset), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenise ``x`` and map its tokens through ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a label to a Python int."""
    return int(x)

In [30]:
def tokeniseDataset(sentenses, tokenizer):
    """Apply ``tokenizer`` to every sentence.

    Args:
        sentenses: Iterable of sentences.
        tokenizer: Callable applied to each sentence.

    Returns:
        A list holding ``tokenizer(sentence)`` for each sentence, in order.
    """
    tokenised = []
    for sentense in sentenses:
        tokenised.append(tokenizer(sentense))
    return tokenised


def getMaxLenTweet(sentenses, tokenizer):
    """Return the largest number of tokens found in any sentence.

    Args:
        sentenses: Iterable of sentences.
        tokenizer: Callable turning one sentence into a sequence of tokens.

    Returns:
        The maximum token count, or 0 when ``sentenses`` is empty (the
        previous implementation raised ``ValueError`` in that case).
    """
    # Generator expression: token lists are measured one at a time instead of
    # materialising the whole tokenised dataset first.
    return max((len(tokenizer(sentense)) for sentense in sentenses), default=0)


def transformSenToIntMat(sentenses, tokenizer, vocab, max_num_token,
                         dtype=np.float32, pad_index=0):
    """Encode sentences as a fixed-width matrix of vocabulary indices.

    Each sentence is tokenised, mapped through ``vocab`` and written into one
    row of the result. Rows shorter than ``max_num_token`` are right-padded
    with ``pad_index``; longer rows are truncated. Previously an over-long
    sentence produced ragged rows and ``np.array`` raised.

    Args:
        sentenses: Iterable of sentences.
        tokenizer: Callable turning one sentence into a list of tokens.
        vocab: Callable turning a list of tokens into a list of indices.
        max_num_token: Number of columns of the returned matrix.
        dtype: dtype of the returned matrix. Defaults to ``np.float32`` (the
            historical behaviour); pass ``np.int64`` when the matrix feeds an
            index-based layer such as ``nn.Embedding``.
        pad_index: Value used for padding. Defaults to 0.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(sentenses), max_num_token)``,
        including when ``sentenses`` is empty.
    """
    sentenses = list(sentenses)
    matrix = np.full((len(sentenses), max_num_token), pad_index, dtype=dtype)
    for row, sentense in enumerate(sentenses):
        indices = vocab(tokenizer(sentense))[:max_num_token]
        if indices:
            matrix[row, :len(indices)] = indices
    return matrix

In [154]:
# Width of the input feature vector: the longest tokenised text found in any
# of the three splits (train, validation, test).
nb_features = max(
    getMaxLenTweet(split_dataset.text, tokenizer)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

print(f"Our data set (train and test) has {nb_features} tokens max. This will be the shape of the text vector")

Our data set (train and test) has 27 tokens max. This will be the shape of the text vector


# Dataloader

In [150]:
def collate_batch(batch):
    """Collate a list of ``(text, label)`` tuples into two tensors.

    Args:
        batch: List of ``(text, label)`` tuples, as produced by the dataset.

    Returns:
        A tuple ``(sentenses_mat, label)``: the texts encoded by
        ``transformSenToIntMat`` with ``nb_features`` columns, and the labels
        reshaped into a ``(len(batch), 1)`` column tensor.
    """
    texts, targets = [], []
    for sample in batch:
        texts.append(sample[0])
        targets.append(sample[1])

    encoded = transformSenToIntMat(texts, tokenizer, vocab, nb_features)
    target_column = np.array(targets).reshape(len(texts), 1)
    return torch.from_numpy(encoded), torch.from_numpy(target_column)

In [151]:
# A seeded generator drives the shuffling so that the batch order is
# reproducible after a kernel restart (it was unseeded before).
dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
    generator=torch.Generator().manual_seed(0),
)

In [152]:
# Sanity check: display the first batch only, then stop iterating.
for idx, first_batch in enumerate(dataloader):
    text, label = first_batch
    print(idx, "\n", text, "\n", label)
    break


0 
 tensor([[1.0000e+00, 4.5730e+03, 1.2000e+02, 5.0000e+00, 8.1630e+03, 3.4090e+03,
         5.0550e+03, 2.5100e+02, 1.8000e+02, 4.2100e+02, 5.1000e+01, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.9500e+02, 6.3300e+02, 2.2200e+02, 8.0000e+00, 1.3000e+02, 2.0700e+02,
         3.6000e+02, 2.0000e+00, 1.0000e+00, 9.4000e+01, 1.4100e+02, 4.1000e+01,
         4.3000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.6299e+04, 3.2200e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.00

# Reste à faire

Créer le modèle, avec un embedding, suivi d'un LSTM

In [None]:
# Reprendre https://github.com/rsreetech/PyTorchTextClassificationCustomDataset/blob/main/PyTorchTweetTextClassification.ipynb