size: 205888
type: <class 'str'>


#### 1. Activate GPU and Install Dependencies

In [5]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 22 not upgraded.


True

####2. Preprocess data

In [6]:
# Load data
from datasets import load_dataset
imdb = load_dataset("imdb")

# Create a smaller training dataset for faster training times
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])
print(small_train_dataset[0])
print(small_test_dataset[0])

# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, 

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

#### 3. Training the model

In [12]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

access_token = "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"

tokenizer = AutoTokenizer.from_pretrained("davidlandeo/finetuning-sentiment-model-3000-samples", use_auth_token=access_token)

model = AutoModelForSequenceClassification.from_pretrained("davidlandeo/finetuning-sentiment-model-3000-samples", use_auth_token=access_token)

# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [9]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [13]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "davidlandeo/finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/255M [00:00<?, ?B/s]

Download file runs/Mar04_01-24-11_63df751e49af/events.out.tfevents.1677893501.63df751e49af.2486.2: 100%|######…

Download file runs/Mar04_01-24-11_63df751e49af/events.out.tfevents.1677893069.63df751e49af.2486.0: 100%|######…

Clean file runs/Mar04_01-24-11_63df751e49af/events.out.tfevents.1677893501.63df751e49af.2486.2: 100%|#########…

Clean file runs/Mar04_01-24-11_63df751e49af/events.out.tfevents.1677893069.63df751e49af.2486.0:  25%|##4      …

Download file runs/Mar04_01-24-11_63df751e49af/1677893069.5719197/events.out.tfevents.1677893069.63df751e49af.…

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Clean file runs/Mar04_01-24-11_63df751e49af/1677893069.5719197/events.out.tfevents.1677893069.63df751e49af.248…

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/255M [00:00<?, ?B/s]

In [14]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 376
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Saving model checkpoint to davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-188
Configuration saved in davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-188/config.json
Model weights saved in davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-188/pytorch_model.bin
tokenizer config file saved in davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-188/tokenizer_config.json
Special tokens file saved in davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-188/special_tokens_map.json
tokenizer config file saved in davidlandeo/finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in davidlandeo/finetuning-sentiment-model-3000-samples/special_tokens_map.json
Saving model checkpoint to davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-376
Configuration saved in davidlandeo/finetuning-sentiment-model-3000-samples/checkpoint-376/config.json
Model weights saved in davidlandeo/finetuning-sent

TrainOutput(global_step=376, training_loss=0.11227244519172831, metrics={'train_runtime': 323.2512, 'train_samples_per_second': 18.561, 'train_steps_per_second': 1.163, 'total_flos': 783875831546880.0, 'train_loss': 0.11227244519172831, 'epoch': 2.0})

In [16]:
# Compute the evaluation metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 300
  Batch size = 16


{'eval_loss': 0.48243340849876404,
 'eval_accuracy': 0.89,
 'eval_f1': 0.8925081433224756,
 'eval_runtime': 6.6202,
 'eval_samples_per_second': 45.316,
 'eval_steps_per_second': 2.87,
 'epoch': 2.0}

#### 4. Analyzing new data with the model

In [17]:
# Upload the model to the Hub
trainer.push_to_hub()

Saving model checkpoint to davidlandeo/finetuning-sentiment-model-3000-samples
Configuration saved in davidlandeo/finetuning-sentiment-model-3000-samples/config.json
Model weights saved in davidlandeo/finetuning-sentiment-model-3000-samples/pytorch_model.bin
tokenizer config file saved in davidlandeo/finetuning-sentiment-model-3000-samples/tokenizer_config.json
Special tokens file saved in davidlandeo/finetuning-sentiment-model-3000-samples/special_tokens_map.json


Upload file runs/Mar06_14-49-16_bfffc849c3c2/events.out.tfevents.1678115015.bfffc849c3c2.145.0: 100%|#########…

Upload file runs/Mar06_14-49-16_bfffc849c3c2/events.out.tfevents.1678115346.bfffc849c3c2.145.2: 100%|#########…

remote: Scanning LFS files of refs/heads/main for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples
   96eb8da..46eb602  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples
   96eb8da..46eb602  main -> main

To https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples
   46eb602..972bcbb  main -> main

   46eb602..972bcbb  main -> main



'https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples/commit/46eb60237cf6388141d680fa21496140e3004a50'

In [21]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="davidlandeo/finetuning-sentiment-model-3000-samples")

loading configuration file davidlandeo/finetuning-sentiment-model-3000-samples/config.json
Model config DistilBertConfig {
  "_name_or_path": "davidlandeo/finetuning-sentiment-model-3000-samples",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading configuration file davidlandeo/finetuning-sentiment-model-3000-samples/config.json
Model config DistilBertConfig {
  "_name_or_path": "davidlandeo/finetuning-sentiment-model-3000-samples",
  "activation": "gelu",
  "architectur

In [26]:
# We get the whole chat

with open('WhatsApp Big Data.txt',mode='r', encoding="utf8") as f2:
    chat_completo = f2.read()
    print('size:',len(chat_completo))
    print('type:',type(chat_completo))
    # print(chat_completo)

size: 205888
type: <class 'str'>


In [47]:
# We get the whole chat

with open('WhatsApp Big.txt',mode='r', encoding="utf8") as f2:
    chat_completo_2 = f2.read()
    print('size:',len(chat_completo_2))
    print('type:',type(chat_completo_2))
    # print(chat_completo)

size: 265421
type: <class 'str'>


In [38]:
new_chat = []
a = 0
b = 0

while chat_completo.find('\n', a+1) != -1:
    a = chat_completo.find('\n',b)
    b = chat_completo.find('\n',a+1)
    new_chat.append(chat_completo[a+1:b])

datos_no_validos = 0
chat_sin_2puntos = []
for i in range(len(new_chat)):
    if new_chat[i].count(':') == 1:
        datos_no_validos = datos_no_validos + 1
    else:
        a = new_chat[i].find(':')
        b = new_chat[i].find(':',a+1)
        chat_sin_2puntos.append(new_chat[i][b+2:])

chat_str = " ".join(chat_sin_2puntos)
print(len(chat_str))
# print(chat_str)

106441


In [49]:
new_chat_2 = []
a = 0
b = 0

while chat_completo_2.find('\n', a+1) != -1:
    a = chat_completo_2.find('\n',b)
    b = chat_completo_2.find('\n',a+1)
    new_chat_2.append(chat_completo_2[a+1:b])

datos_no_validos = 0
chat_sin_puntos = []
for i in range(len(new_chat_2)):
    if new_chat_2[i].count(':') == 1:
        datos_no_validos = datos_no_validos + 1
    else:
        a = new_chat_2[i].find(':')
        b = new_chat_2[i].find(':',a+1)
        chat_sin_puntos.append(new_chat_2[i][b+2:])

chat_str = " ".join(chat_sin_puntos)
print(len(chat_str))
# print(chat_str)

139979


In [35]:
print((datos_no_validos))
print(len(new_chat))
print(len(chat_sin_2puntos))

59
3174
3115


In [51]:
print((datos_no_validos))
print(len(new_chat_2))
print(len(chat_sin_puntos))

88
4122
4034


In [67]:
i = 2058
val = sentiment_model(chat_sin_puntos[i])
print(chat_sin_puntos[i])
val

I'm giving him a Jupyter notebook with comments


[{'label': 'LABEL_0', 'score': 0.9207105040550232}]

In [84]:
i = 3360
val = sentiment_model(chat_sin_puntos[i])
print(chat_sin_puntos[i])
val

We all are frustrated about max, we probably cant do much about that


[{'label': 'LABEL_0', 'score': 0.9932522177696228}]

In [113]:
i = 1990
val = sentiment_model(chat_sin_puntos[i])
print(chat_sin_puntos[i])
val

I took a screenshot of the pdf


[{'label': 'LABEL_1', 'score': 0.531906008720398}]

In [91]:
sentiment_model('Guys')

[{'label': 'LABEL_0', 'score': 0.5428915023803711}]

In [103]:
#test before we discuss about Max
negative_values = []
positive_values = []
for mssge in chat_sin_2puntos:
  val = sentiment_model(mssge)
  # if val[0].get('score') < 0.54:
  #   print(mssge) 
  if val[0].get('score') >= 0.55: 
    if val[0].get('label') == 'LABEL_0':
      negative_values.append(val[0].get('score'))
    else:
      positive_values.append(val[0].get('score'))

print("Negative average:", np.mean(negative_values))
print("Positive average:", np.mean(positive_values))

Negative average: 0.7814023067191435
Positive average: 0.7566614862903


In [100]:
#test after we discuss about Max
negative_values = []
positive_values = []
for mssge in chat_sin_puntos:
  val = sentiment_model(mssge)
  # if val[0].get('score') < 0.54:
  #   print(mssge) 
  if val[0].get('score') >= 0.55: 
    if val[0].get('label') == 'LABEL_0':
      negative_values.append(val[0].get('score'))
    else:
      positive_values.append(val[0].get('score'))

In [101]:
print("Negative average:", np.mean(negative_values))
print("Positive average:", np.mean(positive_values))

Negative average: 0.7813059425070172
Positive average: 0.7545798528508773


In [102]:
print(len(negative_values))
print(len(positive_values))

2436
986


In [90]:
print("Negative average:", np.mean(negative_values))
print("Positive average:", np.mean(positive_values))

Negative average: 0.7557750808088134
Positive average: 0.6955309773391147
