# 1. Activate GPU and Install Dependencies

In [1]:
# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

True

In [2]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-a

#2. Preprocess data

In [3]:
from google.colab import files
uploaded = files.upload()

Saving google_reviews_balanced.csv to google_reviews_balanced.csv


In [4]:
import pandas as pd

In [5]:
import io
Xpandas = pd.read_csv(io.BytesIO(uploaded['google_reviews_balanced.csv']), sep='|')
Xpandas = Xpandas.rename(columns={"review": "text", "class": "label"})
Xpandas

Unnamed: 0,text,label
0,lifts poorly located in the far corner. kids d...,0
1,i experienced racism! it's so bad.. the cashie...,0
2,very unprofessional staff. manager kitty was s...,0
3,"unprofessional staff, not all of them but the ...",0
4,i bought 3 items at this store at around 8:30p...,0
...,...,...
1429,"staff helpful, clean.",1
1430,cute clothes for a college student like myself,1
1431,nice place. clean,1
1432,good mall but i wish there was more of a varie...,1


In [6]:
ypandas = Xpandas.pop('label')
ypandas

0       0
1       0
2       0
3       0
4       0
       ..
1429    1
1430    1
1431    1
1432    1
1433    1
Name: label, Length: 1434, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xpandas, ypandas, test_size=0.2, random_state=42, stratify=ypandas)

In [8]:
X_train['label'] = y_train
X_test['label'] = y_test

In [9]:
from datasets import Dataset
Xd_train = Dataset.from_pandas(X_train, preserve_index=False)
Xd_test = Dataset.from_pandas(X_test, preserve_index=False)

In [10]:
Xd_train[0]

{'text': 'good variety of clothes at a good price', 'label': 1}

In [11]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = Xd_train.map(preprocess_function, batched=True)
tokenized_test = Xd_test.map(preprocess_function, batched=True)

Map:   0%|          | 0/1147 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

In [13]:
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1147
})

In [14]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [15]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

In [16]:
# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [19]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [20]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/lourdesLB/finetuning-sentiment-model-3000-samples into local empty directory.


Download file pytorch_model.bin:   0%|          | 7.40k/255M [00:00<?, ?B/s]

Download file runs/Mar04_11-32-03_4f2e1212baec/1677929526.0841775/events.out.tfevents.1677929526.4f2e1212baec.…

Download file runs/Mar04_11-32-03_4f2e1212baec/events.out.tfevents.1677929526.4f2e1212baec.509.2: 100%|#######…

Download file runs/Mar04_11-13-07_4f2e1212baec/events.out.tfevents.1677928403.4f2e1212baec.509.0: 100%|#######…

Download file runs/Mar04_11-32-03_4f2e1212baec/events.out.tfevents.1677931875.4f2e1212baec.509.4: 100%|#######…

Download file runs/Mar04_11-13-07_4f2e1212baec/1677928403.954577/events.out.tfevents.1677928403.4f2e1212baec.5…

Clean file runs/Mar04_11-32-03_4f2e1212baec/1677929526.0841775/events.out.tfevents.1677929526.4f2e1212baec.509…

Clean file runs/Mar04_11-32-03_4f2e1212baec/events.out.tfevents.1677929526.4f2e1212baec.509.2:  25%|##4       …

Clean file runs/Mar04_11-32-03_4f2e1212baec/events.out.tfevents.1677931875.4f2e1212baec.509.4: 100%|##########…

Clean file runs/Mar04_11-13-07_4f2e1212baec/1677928403.954577/events.out.tfevents.1677928403.4f2e1212baec.509.…

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Clean file runs/Mar04_11-13-07_4f2e1212baec/events.out.tfevents.1677928403.4f2e1212baec.509.0:  27%|##7       …

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [None]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1147
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 144
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Saving model checkpoint to finetuning-sentiment-model-3000-samples/checkpoint-72
Configuration saved in finetuning-sentiment-model-3000-samples/checkpoint-72/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/checkpoint-72/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-3000-samples/checkpoint-72/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/checkpoint-72/special_tokens_map.json
tokenizer config file saved in finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/special_tokens_map.json
Saving model checkpoint to finetuning-sentiment-model-3000-samples/checkpoint-144
Configuration saved in finetuning-sentiment-model-3000-samples/checkpoint-144/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/checkpoint-144/pytorch_model.bin


In [62]:
# Compute the evaluation metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 287
  Batch size = 16


  load_accuracy = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.18583238124847412,
 'eval_accuracy': 0.9372822299651568,
 'eval_f1': 0.9387755102040817,
 'eval_runtime': 101.2604,
 'eval_samples_per_second': 2.834,
 'eval_steps_per_second': 0.178,
 'epoch': 2.0}

# 4. Analyzing new data with the model

In [63]:
# Upload the model to the Hub
trainer.push_to_hub()

Saving model checkpoint to finetuning-sentiment-model-3000-samples
Configuration saved in finetuning-sentiment-model-3000-samples/config.json
Model weights saved in finetuning-sentiment-model-3000-samples/pytorch_model.bin
tokenizer config file saved in finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in finetuning-sentiment-model-3000-samples/special_tokens_map.json
remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/lourdesLB/finetuning-sentiment-model-3000-samples
   832aa6f..2ebce8b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/lourdesLB/finetuning-sentiment-model-3000-samples
   832aa6f..2ebce8b  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.937282

'https://huggingface.co/lourdesLB/finetuning-sentiment-model-3000-samples/commit/2ebce8b945cb504a01f1ab31f196e4b87522ac8f'

In [66]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="lourdesLB/finetuning-sentiment-model")

sentiment_model(["I love this move", "This movie sucks!"])

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/config.json
Model config DistilBertConfig {
  "_name_or_path": "lourdesLB/finetuning-sentiment-model-3000-samples",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-mo

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at lourdesLB/finetuning-sentiment-model-3000-samples.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--lourdesLB--finetuning-sentiment-model-3000-samples/snapshots/14e745592ffa1f1a8dd9ea1df0b3f0634131d9d9/tokenizer_config.json


[{'label': 'LABEL_1', 'score': 0.9032949209213257},
 {'label': 'LABEL_0', 'score': 0.9534971714019775}]

In [80]:
from google.colab import files
uploaded = files.upload()

Saving google_news_cleaned.csv to google_news_cleaned (1).csv


In [83]:
Xnews = pd.read_csv(io.BytesIO(uploaded['google_news_cleaned.csv']), sep='|').iloc[:,0]
Xnews

0     Five-star bathrooms with these H&M Home access...
1     The H&M midi skirt that influencers over 50 we...
2     The H&M padded jacket that is a hit among infl...
3     H&M has the accessory that turns any look into...
4          Irina Shayk has the most worn jacket in 2023
                            ...                        
75    H&M, under scrutiny due to doubts about the su...
76    The eight most beautiful H&M garments that are...
77    H&M jumps into the world of immersive gaming w...
78       Unions and H&M agree on an extra to compensate
79    All the storage accessories that we have found...
Name: 0, Length: 80, dtype: object

In [84]:
news = Xnews.to_list() # con las noticias va regu habra que probar con los tuits
news

['Five-star bathrooms with these H&M Home accessories',
 'The H&M midi skirt that influencers over 50 wear with Adidas: satin and elegant XL print',
 'The H&M padded jacket that is a hit among influencers over 50: loose, pink and perfect for between-seasons',
 'H&M has the accessory that turns any look into a trend for less than 10 euros',
 'Irina Shayk has the most worn jacket in 2023',
 'The dress with the definitive guy effect is this one from H&M that Anna Padilla has made us want',
 'Smile from ear to ear when finding the H&M shoes worn by the most elegant women',
 'It looks like Chanel, it is H&M: the quilted bag to give a touch of luxury to your look for less than 20 euros',
 'H&M will lay off 1,500 workers',
 'H&M has the ideal complement to order the bathroom without taking up space (and it costs less than 10 euros)',
 'H&M announces expansion plan: it will open almost a dozen new',
 'H&M Home has the most elegant sink accessories to elevate the design of the bathroom',
 'In t

In [88]:
for new in news:
  print(new)
  print(sentiment_model(new))

Five-star bathrooms with these H&M Home accessories
[{'label': 'LABEL_0', 'score': 0.5410690307617188}]
The H&M midi skirt that influencers over 50 wear with Adidas: satin and elegant XL print
[{'label': 'LABEL_1', 'score': 0.7606040239334106}]
The H&M padded jacket that is a hit among influencers over 50: loose, pink and perfect for between-seasons
[{'label': 'LABEL_1', 'score': 0.9555617570877075}]
H&M has the accessory that turns any look into a trend for less than 10 euros
[{'label': 'LABEL_0', 'score': 0.9240064024925232}]
Irina Shayk has the most worn jacket in 2023
[{'label': 'LABEL_0', 'score': 0.9045273065567017}]
The dress with the definitive guy effect is this one from H&M that Anna Padilla has made us want
[{'label': 'LABEL_1', 'score': 0.7977362275123596}]
Smile from ear to ear when finding the H&M shoes worn by the most elegant women
[{'label': 'LABEL_0', 'score': 0.6699050068855286}]
It looks like Chanel, it is H&M: the quilted bag to give a touch of luxury to your look 