In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv
/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv


In [2]:
data = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv")
data["Positive"].unique

<bound method Series.unique of 0        Positive
1        Positive
2        Positive
3        Positive
4        Positive
           ...   
74676    Positive
74677    Positive
74678    Positive
74679    Positive
74680    Positive
Name: Positive, Length: 74681, dtype: object>

In [3]:
# Rename the columns
data.columns = ['id', 'topic', 'sentiment', 'tweet']

In [4]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Convert "Irrelevant" class to "Neutral"
data['sentiment'] = data['sentiment'].replace('Irrelevant', 'Neutral')

# Encode sentiment labels to integers
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
data['label'] = data['sentiment'].str.lower().map(label_mapping)

# Drop any rows with missing labels or tweets
data = data.dropna(subset=['label', 'tweet'])

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data[['tweet', 'label']])
val_dataset = Dataset.from_pandas(val_data[['tweet', 'label']])

# Create a DatasetDict for the Trainer
dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset})

# Step 2: Fine-tune the CardiffNLP Twitter model

# Load the tokenizer and model
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=3,
)

# Tokenize the data
def preprocess_function(examples):
    # Tokenizer expects a list of texts for batching
    return tokenizer(examples['tweet'], truncation=True, padding='max_length', max_length=128)

# Apply the tokenizer to the datasets
encoded_train_dataset = train_dataset.map(preprocess_function, batched=True)
encoded_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # Save model checkpoints at each epoch,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
)

# Fine-tune the model
trainer.train()

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Map:   0%|          | 0/59196 [00:00<?, ? examples/s]

Map:   0%|          | 0/14799 [00:00<?, ? examples/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.5964,0.47763
2,0.3101,0.323425
3,0.1923,0.313151


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=5550, training_loss=0.40124616090241855, metrics={'train_runtime': 3833.4021, 'train_samples_per_second': 46.326, 'train_steps_per_second': 1.448, 'total_flos': 1.1681446406870016e+16, 'train_loss': 0.40124616090241855, 'epoch': 3.0})

In [5]:
# Save the fine-tuned model and tokenizer
model_save_path = "/kaggle/working/fine_tuned_model"
trainer.save_model(model_save_path)  # Saves the model
tokenizer.save_pretrained(model_save_path)  # Saves the tokenizer

('/kaggle/working/fine_tuned_model/tokenizer_config.json',
 '/kaggle/working/fine_tuned_model/special_tokens_map.json',
 '/kaggle/working/fine_tuned_model/sentencepiece.bpe.model',
 '/kaggle/working/fine_tuned_model/added_tokens.json',
 '/kaggle/working/fine_tuned_model/tokenizer.json')

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
model_save_path = "/kaggle/working/fine_tuned_model"  # Path where the model was saved
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

In [7]:
import torch

# Load the validation dataset
validation_data_path = "/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv"
validation_data = pd.read_csv(validation_data_path)

# Add columns as specified
validation_data.columns = ['id', 'topic', 'sentiment', 'tweet']

# Convert "Irrelevant" class to "Neutral"
validation_data['sentiment'] = validation_data['sentiment'].replace('Irrelevant', 'Neutral')

# Encode sentiment labels to integers
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
validation_data['label'] = validation_data['sentiment'].str.lower().map(label_mapping)

# Drop any rows with missing labels or tweets
validation_data = validation_data.dropna(subset=['label', 'tweet'])

# Move the model to the same device as the input tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize the validation data
def preprocess_function(examples):
    return tokenizer(examples['tweet'].tolist(), truncation=True, padding='max_length', max_length=128, return_tensors='pt')

# Tokenize the validation tweets
encoded_validation_data = preprocess_function(validation_data)

# Move the input tensors to the same device as the model
encoded_validation_data = {key: value.to(device) for key, value in encoded_validation_data.items()}

# Run predictions
with torch.no_grad():  # Disables gradient calculation to save memory
    outputs = model(**encoded_validation_data)
    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()  # Move predictions back to CPU

# Map the predicted labels back to their corresponding sentiment classes
label_reverse_mapping = {0: 'Positive', 1: 'Negative', 2: 'Neutral'}
validation_data['predicted_sentiment'] = [label_reverse_mapping[pred] for pred in predictions]

# Display the results
print("Predictions on Validation Data:")
print(validation_data[['tweet', 'sentiment', 'predicted_sentiment']].head(20))  # Display 20 example predictions

Predictions on Validation Data:
                                                tweet sentiment  \
0   BBC News - Amazon boss Jeff Bezos rejects clai...   Neutral   
1   @Microsoft Why do I pay for WORD when it funct...  Negative   
2   CSGO matchmaking is so full of closet hacking,...  Negative   
3   Now the President is slapping Americans in the...   Neutral   
4   Hi @EAHelp I’ve had Madeleine McCann in my cel...  Negative   
5   Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...  Positive   
6   Rocket League, Sea of Thieves or Rainbow Six: ...  Positive   
7   my ass still knee-deep in Assassins Creed Odys...  Positive   
8   FIX IT JESUS ! Please FIX IT ! What In the wor...  Negative   
9   The professional dota 2 scene is fucking explo...  Positive   
10  Itching to assassinate \n\n#TCCGif #AssassinsC...  Positive   
11  @FredTJoseph hey fred, Comcast cut the cable a...  Negative   
12  CSGO WIngman (Im Silver dont bully) twitch.tv/...   Neutral   
13  @NBA2K game sucks... down 

In [8]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
true_labels = validation_data['label'].values
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy of the fine-tuned model on the validation dataset: {accuracy:.4f}")


Accuracy of the fine-tuned model on the validation dataset: 0.9630


In [9]:
# Load the original pre-trained model and tokenizer
original_model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
original_tokenizer = AutoTokenizer.from_pretrained(original_model_name)
original_model = AutoModelForSequenceClassification.from_pretrained(original_model_name)

# Move the original model to the same device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
original_model.to(device)

# Tokenize the validation data using the original model's tokenizer
encoded_validation_data = preprocess_function(validation_data)

# Move the input tensors to the same device as the model
encoded_validation_data = {key: value.to(device) for key, value in encoded_validation_data.items()}

# Run predictions with the original model
with torch.no_grad():  # Disables gradient calculation to save memory
    original_outputs = original_model(**encoded_validation_data)
    original_predictions = original_outputs.logits.argmax(dim=-1).cpu().numpy()  # Move predictions back to CPU

# Calculate accuracy for the original model
true_labels = validation_data['label'].values
original_accuracy = accuracy_score(true_labels, original_predictions)
print(f"Accuracy of the original model on the validation dataset: {original_accuracy:.4f}")




Accuracy of the original model on the validation dataset: 0.2382
