<a href="https://colab.research.google.com/github/mthsansu/MLNLP/blob/main/Code/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **In all your experiments you should compare the performance of your models with simpler models (baselines). A baseline should be the simplest model you can build to solve your task: it could be a rule-based system, or a simple machine-learning / deep-learning system.**

In [2]:
# Import modules and frameworks
#! pip install transformers
#! pip install torchinfo

In [4]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
import pandas as pd
from termcolor import colored
from collections import Counter
from nltk.tokenize import TweetTokenizer
from tqdm.notebook import tqdm

In [24]:
# Load the labelled dataset from the project's GitHub repository
git_url = "https://raw.githubusercontent.com/mthsansu/MLNLP/main/Data/"
df = pd.read_csv(f"{git_url}df_label_sentiment.csv")


In [25]:
# Keep only the tweet text and its label, then preview the first rows
df = df.loc[:, ["text", "Label"]]
df.head()

Unnamed: 0,text,Label
0,"Article de Var Matin, ce jour, sur la proposit...",0
1,Je me réjouis de l’annonce du @gouvernementFR ...,1
2,"Une fois guéris, les enfants hospitalisés au @...",1
3,« Progressivement nous devons avoir une foncti...,0
4,"Réaction commune, avec ma collègue députée Sab...",0


In [26]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle (and therefore every downstream score) reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [27]:
# Turn each split into a list of {column: value} dicts, one dict per row
train, test = [split.to_dict(orient='records') for split in (train, test)]

In [None]:
# Module-level NLTK TweetTokenizer instance; tokenize_pad_numericalize reads it
# as the global `tok` to split each text into tokens.
tok = TweetTokenizer()

['i', 'have', 'a', 'new', 'gpu', '!']

In [None]:
# Define useful functions
def tokenize_pad_numericalize(entry, vocab_stoi, max_length=20):
  """Convert one text into a list of vocabulary indices of length ``max_length``.

  The text is lower-cased, split with the module-level tokenizer ``tok``, and
  every token is replaced by its index in ``vocab_stoi``. Tokens missing from
  the vocabulary use the index stored under ``'<unk>'``. Short sequences are
  right-padded with the index stored under ``'<pad>'``; long ones are cut.

  Args:
    entry: the text to encode (must support ``.lower()``).
    vocab_stoi: mapping from token string to integer index.
    max_length: length of the returned list.

  Returns:
    A list of indices, truncated or padded to ``max_length``.
  """
  indices = []
  for token in tok.tokenize(entry.lower()):
    # Out-of-vocabulary tokens share the '<unk>' entry
    key = token if token in vocab_stoi else '<unk>'
    indices.append(vocab_stoi[key])

  missing = max_length - len(indices)
  if missing > 0:
    # Too short: fill the tail with the padding index
    return indices + [vocab_stoi['<pad>']] * missing
  # Exactly right or too long: the slice is a no-op or a truncation
  return indices[:max_length]

def tokenize_all(entries, vocab_stoi, max_length=200):
  """Encode every text of a batch and carry its labels along unchanged.

  Args:
    entries: mapping with a ``'text'`` iterable of strings and a ``'label'``
      entry.
    vocab_stoi: mapping from token string to integer index, forwarded to
      ``tokenize_pad_numericalize``.
    max_length: length every encoded text is padded or truncated to.
      Defaults to 200, the value that used to be hard-coded here.

  Returns:
    A dict with ``'text'`` (list of index lists) and ``'label'`` (the
    original ``entries['label']`` object).
  """
  # NOTE(review): the DataFrame loaded in this notebook names its column
  # 'Label' (capital L), while this function reads 'label' -- confirm that the
  # caller renames the key before calling, otherwise this raises KeyError.
  return {
    'text': [
      tokenize_pad_numericalize(entry, vocab_stoi, max_length=max_length)
      for entry in entries['text']
    ],
    'label': entries['label'],
  }

In [None]:
# defining the model 
class MinimalExampleModel(torch.nn.Module):
    """Minimal two-layer network: Linear(D_in, H) -> ReLU -> Linear(H, D_out)."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two linear layers and store them as member variables.

        Args:
            D_in: size of each input vector.
            H: size of the hidden representation.
            D_out: size of each output vector (the number of target classes
                when the last layer is used as a classification layer).
        """
        super().__init__()
        # Linear layer documentation: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear
        # Hidden layer: projects the input onto H dimensions
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=H, bias=True)
        # Output (classification) layer: projects the hidden vector onto D_out dimensions
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out, bias=True)

    def forward(self, x):
        """
        Run the network on a tensor of inputs and return the tensor of outputs.

        The first layer is followed by a ReLU non-linearity (together they form
        a dense layer); the second layer is applied without any activation.
        """
        # ReLU documentation: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html#torch.nn.ReLU
        hidden = torch.nn.functional.relu(self.linear1(x))
        return self.linear2(hidden)

In [None]:
# Instantiate the model with
# N:      batch size
# D_in:   input dimension
# H:      hidden dimension
# D_out:  output dimension
N = 2
D_in = 10
H = 10
D_out = 2

# Build the network from the class defined above
# Note: all the parameters are initialized at this point
model = MinimalExampleModel(D_in=D_in, H=H, D_out=D_out)
# Display the layers of the model
model

MinimalExampleModel(
  (linear1): Linear(in_features=10, out_features=10, bias=True)
  (linear2): Linear(in_features=10, out_features=2, bias=True)
)

In [None]:
# Use torchinfo for a per-layer view of the model (output shapes, parameter counts): https://github.com/TylerYep/torchinfo
from torchinfo import summary
# Input shape handed to the summary, (batch_size, D_in). It is built from N and
# D_in instead of literals so it always matches the sizes the model was created with.
dummy_input_size = (N, D_in)
summary(model, dummy_input_size)

Layer (type:depth-idx)                   Output Shape              Param #
MinimalExampleModel                      --                        --
├─Linear: 1-1                            [2, 10]                   110
├─Linear: 1-2                            [2, 2]                    22
Total params: 132
Trainable params: 132
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [None]:
# Random tensors standing in for a batch of inputs, shape (N, D_in), and of targets, shape (N, D_out)
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
  print('DEVICE = ', colored(torch.cuda.get_device_name(0), "green" ) )
else:
  print('DEVICE = ', colored('CPU', "blue"))

# Move the model parameters to the selected device
model = model.to(device)

# Forward pass: calling the model is almost equivalent to model.forward(x);
# the input has to live on the same device as the model
y_pred = model(x.to(device))
# Show the predictions
y_pred

DEVICE =  [34mCPU[0m


tensor([[-0.4117, -0.1531],
        [-0.2486, -0.3666]], grad_fn=<AddmmBackward0>)

In [None]:
# Create the model to classify tweets
class TweetModel(torch.nn.Module):
    """Bag-of-embeddings classifier.

    Token indices are mapped to frozen pretrained vectors, the vectors are
    averaged over dimension 1, passed through one ReLU hidden layer, projected
    to ``output_dim`` scores and normalised with a softmax.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, pretrained_vectors=None):
        """
        Build the embedding, hidden, classification and softmax layers.

        Args:
            input_dim: accepted for interface compatibility. The width of the
                hidden layer's input is read from ``pretrained_vectors``, so
                this value is not used.
            hidden_dim: number of units of the hidden layer.
            output_dim: number of target classes.
            pretrained_vectors: 2-D float tensor of shape
                (vocabulary size, embedding dimension). Required.

        Raises:
            ValueError: if ``pretrained_vectors`` is not provided.
        """
        super(TweetModel, self).__init__()
        if pretrained_vectors is None:
            # Fail early with an explicit message instead of an opaque error
            # raised from inside Embedding.from_pretrained.
            raise ValueError("TweetModel requires `pretrained_vectors` (a 2-D tensor of embeddings).")

        # Frozen pretrained embeddings: turn token indices into vectors
        self.ebd = torch.nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)
        # The hidden layer consumes the averaged embeddings, so its input width
        # must be the embedding dimension. Using hidden_dim here only worked
        # when the two happened to be equal.
        self.hidden_linear_layer = torch.nn.Linear(self.ebd.embedding_dim, hidden_dim, bias=True)
        self.classification_layer = torch.nn.Linear(hidden_dim, output_dim, bias=True)
        # Softmax layer to compute class probabilities
        # https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax#torch.nn.Softmax
        # Referenced through `torch.nn`, like every other layer of this class,
        # so the class does not depend on a separate `nn` import.
        self.softmax = torch.nn.Softmax(dim=1)

        # Dropout is not used here but is worth keeping in mind to limit
        # overfitting, e.g. torch.nn.Dropout(p=0.2) applied after the hidden layer.

    def forward(self, x):
        """
        Return the class probabilities for a batch of token-index sequences.

        Args:
            x: tensor of token indices -- assumed to be (batch, sequence length),
                since dimension 1 is averaged away and the softmax runs over
                dimension 1 of the result. TODO confirm against the data loader.

        Returns:
            Tensor whose dimension 1 sums to 1 (softmax over the class scores).
        """
        # Look up the pretrained vector of every token
        embedded = self.ebd(x)
        # Average over dimension 1 to get one vector per example
        pooled = embedded.mean(1)
        hidden = torch.relu(self.hidden_linear_layer(pooled))
        scores = self.classification_layer(hidden)
        # NOTE(review): the returned values are probabilities, not raw scores.
        # torch.nn.CrossEntropyLoss expects raw scores and applies its own
        # log-softmax; if the training loop uses it, confirm this is intended.
        probabilities = self.softmax(scores)
        return probabilities