#### 1. Activate GPU and Install Dependencies

In [1]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading r

True

In [15]:
# Load data
from datasets import load_dataset
imdb = load_dataset("imdb")

# Create a smaller training dataset for faster training times
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])
print(small_train_dataset[0])
print(small_test_dataset[0])

# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



  0%|          | 0/3 [00:00<?, ?it/s]



{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, 

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

#### 3. Training the model

In [17]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# access_token = "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"

# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [4]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [19]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "finetuning-sentiment-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/davidlandeo/finetuning-sentiment-model-3000-samples into local empty directory.


In [20]:
# Train the model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=376, training_loss=0.282241719834348, metrics={'train_runtime': 324.44, 'train_samples_per_second': 18.493, 'train_steps_per_second': 1.159, 'total_flos': 783875831546880.0, 'train_loss': 0.282241719834348, 'epoch': 2.0})

In [21]:
# Compute the evaluation metrics
trainer.evaluate()

{'eval_loss': 0.3319162130355835,
 'eval_accuracy': 0.87,
 'eval_f1': 0.8712871287128714,
 'eval_runtime': 5.6311,
 'eval_samples_per_second': 53.276,
 'eval_steps_per_second': 3.374,
 'epoch': 2.0}

#### 4. Analyzing new data with the model

In [24]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="federicopascual/finetuning-sentiment-model-3000-samples")

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [25]:
# We get the whole chat from the text file exported from WhatsApp

with open('Chats_2/WhatsApp BTS.txt',mode='r', encoding="utf8") as f2:
    chat = f2.read()
    print('size of chat:',len(chat))
    print('variable type:',type(chat))
    print(chat[:700])

# We print just the first 700 characters to see how the file information is distributed

size of chat: 151966
variable type: <class 'str'>
9/30/22, 21:56 - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
9/30/22, 21:56 - You created group "D"
9/30/22, 21:56 - You removed Amandeep Idealista
9/30/22, 21:58 - You changed the subject from "D" to "🍻Hppy Beer day🇪🇸"
9/30/22, 22:00 - You changed the subject from "🍻Hppy Beer day🇪🇸" to "🍻Beer/dinner🍔/beach🏖️"
9/30/22, 22:40 - Sebastian Vila joined using this group's invite link
9/30/22, 22:41 - Sebastian Vila: Hola David! Gran idea
9/30/22, 22:58 - david: Gracias Sebastián
9/30/22, 22:58 - david: De qué programa eres?
9/30/22, 23:01 - Sebastian Vila: Ux/ui
9/30/22, 23:01 - Sebastian Vila: Vos?
9/


#### 5. Function to store the comments in a database

In [26]:
import sqlite3

def insert_database(name, comment, pos_score, neg_score):
    database_name = 'Comments.db' 
    connection = sqlite3.connect(database_name)
    cursor = connection.cursor()

    cursor.execute(f"CREATE TABLE IF NOT EXISTS {name}(Comment TEXT, Positive_Score INT, Negative_Score INT)")
    cursor.execute(f'''
                    INSERT INTO {name}(Comment, Positive_Score, Negative_Score)
                    VALUES (?,?,?)
                    ''',
                    [comment, pos_score, neg_score])
    connection.commit()
    cursor.close()
    connection.close()
    return

#### 6. Data preparation

In [27]:
# This new list (new_chat) is gonna store individual messages after every new line(\n)
new_chat = []
a = 0
b = 0

# Splitting the whole string on each \n  
while chat.find('\n', a+1) != -1:
    a = chat.find('\n',b)
    b = chat.find('\n',a+1)
    new_chat.append(chat[a+1:b])

# create an empty set to store the names of every member of the whatsapp Group
names = set()

no_valid_data = 0

# new list that will not include the time and date
cleaned_chat = []

for i in range(len(new_chat)):
  # we verify if there is a colon 
    if (new_chat[i].count(':')):
      # Messages has at least 2 colons, that's why I'm filtering if there is only one
        if (new_chat[i].count(':') == 1) :
            no_valid_data = no_valid_data + 1
        else:
            # Findind the positions of the colons, slash, and the hyphen
            first_colon = new_chat[i].find(':')
            second_colon = new_chat[i].find(':',first_colon+1)
            slash = new_chat[i].find('/')
            hyphen = new_chat[i].find('-')
            
            # With the positions of the colons and the slash, we can evaluate if the message is valid to assign to a person
            if (first_colon>=9) and (first_colon<=13) and (slash<4):    
              nombre = new_chat[i][hyphen+2:second_colon]
              nombre = nombre.replace(" ","_")
              if (nombre.find('+')<0):
                names.add(nombre)
              cleaned_chat.append(new_chat[i][hyphen+2:])

# Show the results after cleaning data
print('Amount of rows that were deleted:', (no_valid_data))
print('Size of the file before be cleaned',len(new_chat))
print('Size of the file after be cleaned' ,len(cleaned_chat))

Amount of rows that were deleted: 147
Size of the file before be cleaned 2476
Size of the file after be cleaned 2118


#### 7. Analyzing the sentiment of the whole Whatsapp group

In [28]:
# We are gonna calculate the average of the whole chat
# Depending on whether it's negative or positive
negative_values = []
positive_values = []

for row in cleaned_chat:
  comment = row[first_colon+2:]
  val = sentiment_model(comment)                        # Analyze the sentiment of every message

  if val[0].get('score') >= 0.55:                     # The score has to be at least 0.55 to be counted
                                                      # Less than 0.55 doesn't make much sense
    if val[0].get('label') == 'LABEL_0':
      negative_values.append(val[0].get('score'))     # LABEL_0 means it's negative
    else:
      positive_values.append(val[0].get('score'))     # LABEL_1 means it's negative

print('The number of negative messages found',len(negative_values))
print('The number of positive messages found',len(positive_values))

print("Negative average:", np.mean(negative_values))
print("Positive average:", np.mean(positive_values))

The number of negative messages found 1018
The number of positive messages found 577
Negative average: 0.662192004427226
Positive average: 0.7273065938486797


#### 8. Analyzing the sentiment of the every member of the group

In [34]:
# Create a dictionary with the names of all the members of the whatsapp group
# For negative and positive messages
import pandas as pd
name_lists_positive = {name: [] for name in names}
name_lists_negative = {name: [] for name in names}

# Get the sentiment analysis of each person
for row in cleaned_chat:
  first_colon = row.find(':')
  comment = row[first_colon+2:]
  if (comment.count('/')<3) & (comment.find('Media omitted')<0): 
    val = sentiment_model(comment)                                  # Evaluate the sentiment of each message
    nombre = row[:first_colon]                                  # Get only the name of the person who sent the message
    nombre = nombre.replace(" ","_")

    # Save the result of its sentiment of each person in the dictionary
    if (nombre in names):
      if (val[0].get('label')== 'LABEL_0'):
        name_lists_negative[nombre].append(round(val[0].get('score'),3))
        insert_database(nombre, comment, '', round(val[0].get('score'),3))
      elif (val[0].get('label')== 'LABEL_1'):
        name_lists_positive[nombre].append(round(val[0].get('score'),3))
        insert_database(nombre, comment, round(val[0].get('score'),3), '')

  # Get the average of all the messages' sentiment that are positive of everyone

  average_positive = {name: [] for name in names}
  for name in names:
    average_positive[name].append(np.mean(name_lists_positive[name]))

  # Get the average of all the messages' sentiment that are positive of everyone

  average_negative = {name: [] for name in names}
  for name in names:
    average_negative[name].append(np.mean(name_lists_negative[name]))

# Define a function to get the value of the list
def rounding(x):
    return round(x[0],3)

# Convert to dataframe to show the results 

Positive_results = pd.DataFrame.from_dict(average_positive.items())
Positive_results = Positive_results.rename(columns={0: 'Name', 1: 'Positive Sentiment Score'})
Positive_results['Positive Sentiment Score'] = Positive_results['Positive Sentiment Score'].apply(lambda row: rounding(row))

Negative_results = pd.DataFrame.from_dict(average_negative.items())
Negative_results = Negative_results.rename(columns={0: 'Name', 1: 'Negative Sentiment Score'})
Negative_results['Negative Sentiment Score'] = Negative_results['Negative Sentiment Score'].apply(lambda row: rounding(row))


Mean of empty slice.


invalid value encountered in double_scalars



#### 9. Some positive results

In [35]:
Positive_results.head()

Unnamed: 0,Name,Positive Sentiment Score
0,Sebastian_Vila,0.666
1,Fernanda_Orviz,0.691
2,Nishit,0.794
3,Kim_Jimenez,0.728
4,Ghandi,0.662


#### 10. Some negative results

In [42]:
Negative_results.head()

Unnamed: 0,Name,Negative Sentiment Score
0,Sebastian_Vila,0.611
1,Fernanda_Orviz,0.569
2,Nishit,0.588
3,Kim_Jimenez,0.688
4,Ghandi,0.586


#### 11. Graphs

In [32]:
import plotly.express as px

fig = px.bar(Positive_results, y='Positive Sentiment Score',x='Name',template='simple_white')
fig.update_layout(title_text='Positive Sentiment Score per person', title_x=0.5)
fig

<p>
  <img src="./Screenshots/plotly_positive_2.png" width="800">
</p>

#### 12. Results to analyze

In [37]:
i = 1260
val = sentiment_model(cleaned_chat[i])
print(cleaned_chat[i])
val

Taz: Congrats 🎊


[{'label': 'LABEL_1', 'score': 0.6571516394615173}]

In [38]:
i = 476
val = sentiment_model(cleaned_chat[i])
print(cleaned_chat[i])
val

Gabriel: Anyone who would like to go play beach volley in barceloneta today?


[{'label': 'LABEL_1', 'score': 0.5230283737182617}]

In [39]:
i = 233
val = sentiment_model(cleaned_chat[i])
print(cleaned_chat[i])
val

Vicky: The gym is really nice, pools are big, there’s a coworking space also, lots of equipment 👌


[{'label': 'LABEL_1', 'score': 0.9279422163963318}]

#### 13. Best comment

In [40]:
highest_positive = 0
best_comment = "There isn't"
index_comment_1 = 0
index_best_comment = 0

# Get the sentiment analysis of each person
for mssge in cleaned_chat:
  first_colon = mssge.find(':')
  val = sentiment_model(mssge)                                  # Evaluate the sentiment of each message
  nombre = mssge[:first_colon]                                  # Get only the name of the person who sent the message
  
  # Save the result of its sentiment of each person in the dictionary
  if (nombre in names):
    if (val[0].get('label')== 'LABEL_1'):
      if val[0].get('score') > highest_positive:
        highest_positive = val[0].get('score')
        best_comment = mssge
        index_best_comment = index_comment_1

  index_comment_1 = index_comment_1 + 1

In [41]:
best_comment

'Alessio: Phenomena is the best cinema in town, I recommend it'

### 14. Conclusions

* The model has an overfitting problem because it has a good performance with the training data, but no with data the model hasn't seen before. 

* Some sentiment analysis libraries determine whether data is positive, negative or neutral. This model doesn't give off neutral outputs.

* Not all the comments can be positive or negative, this model should implement neutral outputs.

