In [1]:
#Modelo DistilBETO Cleaned FINAL

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
import torch.optim
import numpy as np
import pandas as pd
import time
import datetime
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import pandas as pd
from textwrap import wrap
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
import re,string
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
from string import punctuation
from nltk.corpus import wordnet


MAX_LEN = 85

# Select cpu or cuda
run_on = 'cpu'
device = torch.device(run_on)

df_train = pd.read_csv('/Users/nfanlo/dev/spanish-classifier-tfg/dataset/80-10-10/train.csv')
print(df_train.shape)
df_train.isnull().sum()
df_train.sentiment.replace("P" , 2 , inplace = True)
df_train.sentiment.replace("N" , 0 , inplace = True)
df_train.sentiment.replace("NEU" , 1, inplace = True)
df_train.head()
df_train['review'] = df_train['text']
df_train.drop('text', axis=1, inplace=True)
df_train['label'] = df_train['sentiment']
df_train.drop('sentiment', axis=1, inplace=True)

df_dev = pd.read_csv('/Users/nfanlo/dev/spanish-classifier-tfg/dataset/80-10-10/dev.csv')
print(df_dev.shape)
df_dev.isnull().sum()
df_dev.sentiment.replace("P" , 2 , inplace = True)
df_dev.sentiment.replace("N" , 0 , inplace = True)
df_dev.sentiment.replace("NEU" , 1, inplace = True)
df_dev['review'] = df_dev['text']
df_dev.drop('text', axis=1, inplace=True)
df_dev['label'] = df_dev['sentiment']
df_dev.drop('sentiment', axis=1, inplace=True)

df_test = pd.read_csv('/Users/nfanlo/dev/spanish-classifier-tfg/dataset/80-10-10/test.csv')
print(df_dev.shape)
df_test.isnull().sum()
df_test.sentiment.replace("P" , 2 , inplace = True)
df_test.sentiment.replace("N" , 0 , inplace = True)
df_test.sentiment.replace("NEU" , 1, inplace = True)
df_test['review'] = df_test['text']
df_test.drop('text', axis=1, inplace=True)
df_test['label'] = df_test['sentiment']
df_test.drop('sentiment', axis=1, inplace=True)


(5787, 2)
(723, 2)
(723, 2)


In [2]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS


#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [3]:
review_new_train = []
review_new_dev = []
review_new_test = []

for t in df_train.review:
    review_new_train.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

for t in df_dev.review:
    review_new_dev.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

for t in df_test.review:
    review_new_test.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

In [4]:
df_train['review'] = review_new_train
df_dev['review'] = review_new_dev
df_test['review'] = review_new_test

X_train = df_train.iloc[:, 0]
y_train = df_train.iloc[:, 1]
X_dev = df_dev.iloc[:, 0]
y_dev = df_dev.iloc[:, 1]
X_test = df_dev.iloc[:, 0]
y_test = df_dev.iloc[:, 1]

In [5]:
# Tokenization

tokenizer = DistilBertTokenizer.from_pretrained('dccuchile/distilbert-base-spanish-uncased',
            do_lower_case=True)

def preprocessing(dataset):
    input_ids = []
    attention_mask = []
    for doc in dataset:
        encoded_doc = tokenizer.encode_plus(doc,
                   add_special_tokens=True, max_length=MAX_LEN,
                   truncation=True ,pad_to_max_length=True,
                   return_token_type_ids = False,
                   return_attention_mask = True,)
        input_ids.append(encoded_doc['input_ids'])
        attention_mask.append(encoded_doc['attention_mask'])
    return (torch.tensor(input_ids),
           torch.tensor(attention_mask))

# Apply preprocessing to dataset
X_train_inputs, X_train_masks = preprocessing(X_train)
X_dev_inputs, X_dev_masks = preprocessing(X_dev)

# Report max n° tokens in a sentence
max_len = max([torch.sum(sen) for sen in X_train_masks])
print('Max n°tokens in a sentence: {0}'.format(max_len))



Max n°tokens in a sentence: 85


In [6]:
#Data loaders
batch_size = 8

y_train_labels = torch.tensor(y_train.values)
y_dev_labels = torch.tensor(y_dev.values)

def dataloader(x_inputs, x_masks, y_labels, is_train_split):
    data = TensorDataset(x_inputs, x_masks, y_labels)
    if is_train_split:
	    sampler = RandomSampler(data)
    else:
    	sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size, num_workers=4)
    
    return dataloader

train_dataloader = dataloader(X_train_inputs, X_train_masks, y_train_labels, True)
val_dataloader = dataloader(X_dev_inputs, X_dev_masks, y_dev_labels, False)

In [7]:
#Cargamos el modelo + optimizador + definimos EPOCHS + Scheduler
#Modelo
model = DistilBertForSequenceClassification.from_pretrained('dccuchile/distilbert-base-spanish-uncased', num_labels=3,
 output_attentions=False, output_hidden_states=False)

optimizer = AdamW(model.parameters(), lr = 5e-5)

epochs=2

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps= total_steps)

Some weights of the model checkpoint at dccuchile/distilbert-base-spanish-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/distilbert-base-spanish-uncased and are newly initialized: ['pre_classifier.wei

In [36]:
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.01)


training_args = TrainingArguments(
    output_dir='/Users/nfanlo/dev/spanish-classifier-tfg/final-models/neural-models/80-10-10/final-model/results-final',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=64,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir='/Users/nfanlo/dev/spanish-classifier-tfg/final-models/neural-models/80-10-10/final-model/logs-final',
    logging_steps=10,
    evaluation_strategy='steps',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    lr_scheduler_type='linear',
    learning_rate=5e-5
)

In [38]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    callbacks=[early_stopping_callback]
)

trainer.train()



  0%|          | 0/182 [00:00<?, ?it/s]

TypeError: 'DataLoader' object is not subscriptable