In [None]:
# install libraries
!pip install -U transformers datasets accelerate
!apt-get install git-lfs

In [None]:
import os
# ** note **
# ** Google Drive has to be mounted before this cell is run **
# Create both Drive folders up front: one for the training results,
# one for the exported model. `output_dir` ends up holding the second path.
for output_dir in (
    "/content/drive/My Drive/Colab Notebooks/TrainingResults",
    "/content/drive/My Drive/Colab Notebooks/SADistilBERTModel",
):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# import libraries
import torch
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
import numpy as np
# check if cuda is available for GPU acceleration
# (this is the last expression of the cell, so the notebook displays its result)
torch.cuda.is_available()

In [None]:
# load the dataset from a CSV file (the file has no header row)
df = pd.read_csv('/content/drive/MyDrive/SeniorPortfolio/sentiments.csv', encoding='ISO-8859-1', header=None)

# rename columns according to the dataset structure
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# map target to {0: negative, 4: positive}; any other raw value (e.g. a neutral
# class) has no entry in the mapping and becomes NaN
df['target'] = df['target'].map({0: 0, 4: 1})

# drop rows that cannot be used for binary training:
#  - unmapped targets (NaN), which would otherwise turn the label column into floats
#  - missing text (read_csv parses cells such as "NA" or "null" as NaN), which the
#    tokenizer cannot process
df = df.dropna(subset=['target', 'text']).reset_index(drop=True)
df['target'] = df['target'].astype(int)

# convert the pandas dataframe to a Hugging Face dataset for compatibility
# (preserve_index=False keeps the pandas index out of the dataset columns)
dataset = Dataset.from_pandas(df, preserve_index=False)

# split the dataset into training and testing sets; the fixed seed keeps the
# split identical across kernel restarts, so a model trained in one session is
# never evaluated on rows it was trained on in another
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [None]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# define the preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: batch mapping supplied by `dataset.map(..., batched=True)`;
            it must provide a 'text' list and a 'target' list.

    Returns:
        The tokenizer output for 'text' (truncated to at most 128 tokens, not
        padded) with an extra 'labels' entry copied from 'target'.
    """
    # No padding at this stage: padding=True would pad each map batch to its own
    # longest sequence and store those pad tokens in the dataset. Padding is left
    # to DataCollatorWithPadding, which pads every training batch on the fly.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, max_length=128)

    # the Trainer expects the label column to be called 'labels'
    tokenized_inputs['labels'] = list(examples['target'])

    return tokenized_inputs

# apply the preprocessing function to both training and testing datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# initialize the model for sequence classification with 2 labels (positive and negative)
# id2label/label2id are stored in the model config so predictions are reported as
# NEGATIVE/POSITIVE instead of the default LABEL_0/LABEL_1
# (0 = negative, 1 = positive, matching the target mapping applied to the dataframe)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)

In [None]:
# initialize the data collator for dynamic padding
# (built from the `tokenizer` loaded earlier; it is handed to the Trainer as `data_collator`)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# define a function to compute accuracy, f1 score and recall
# `datasets.load_metric` was removed in datasets 3.0, which `pip install -U datasets`
# installs; the scikit-learn scorers are used directly instead.
from sklearn.metrics import accuracy_score, f1_score, recall_score

def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into scalar metrics.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class of each row is
            the argmax of `logits` over its last axis.

    Returns:
        dict with 'accuracy', 'f1' and 'recall' as Python floats. f1 and recall
        use binary averaging, i.e. they are computed for the positive class (1).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": float(accuracy_score(labels, predictions)),
        "f1": float(f1_score(labels, predictions, average='binary')),
        "recall": float(recall_score(labels, predictions, average='binary')),
    }

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# TrainingArguments with both learning rate scheduling and early stopping configurations
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/Colab Notebooks/TrainingResults",
    eval_strategy="steps",  # evaluate every `eval_steps` (this argument was named `evaluation_strategy` before transformers 4.41)
    eval_steps=5000,  # evaluation happens every 5000 steps
    save_strategy="steps",  # save on the same schedule as evaluation
    save_steps=5000,  # align saving with evaluation frequency
    learning_rate=2e-5,
    lr_scheduler_type="reduce_lr_on_plateau",  # learning rate scheduler
    # the plateau scheduler is stepped with `metric_for_best_model` (accuracy, where
    # higher is better); torch's ReduceLROnPlateau defaults to mode="min", which
    # would treat rising accuracy as "no improvement" and cut the learning rate
    lr_scheduler_kwargs={"mode": "max"},
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # only keep the last 2 checkpoints
    load_best_model_at_end=True,  # load the best model at the end of training
    metric_for_best_model="accuracy",  # metric to identify the best model
    greater_is_better=True,  # a higher accuracy means a better checkpoint
    warmup_steps=500,  # NOTE(review): ReduceLROnPlateau takes no warmup argument, so this likely has no effect with the scheduler above; confirm
)

# Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,  # current name of the former `tokenizer=` argument (transformers >= 4.46)
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # stop once 3 evaluations in a row fail to improve `metric_for_best_model`
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# train the model
# (runs the `trainer` configured in the previous cell; the value returned by the
#  call is displayed as the cell output)
trainer.train()

In [None]:
# evaluate the model
# (no dataset is passed here; the Trainer was constructed with
#  eval_dataset=tokenized_datasets["test"]; the result is displayed as the cell output)
trainer.evaluate()

In [None]:
# write the model and the tokenizer into the same Drive folder so that both can
# be restored from it later with from_pretrained
export_dir = "/content/drive/My Drive/Colab Notebooks/SADistilBERTModel"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# folder on the mounted Google Drive that holds the saved model and tokenizer
model_directory = "/content/drive/My Drive/Colab Notebooks/SADistilBERTModel"

# restore the tokenizer and the model from that folder
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForSequenceClassification.from_pretrained(model_directory)

# wrap both in a sentiment-analysis pipeline
sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

# run two sample sentences through the pipeline
sample_texts = ["Cool!", "I hate that!"]
results = sentiment_pipeline(sample_texts)

# print every predicted label together with its score rounded to 4 decimals
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")


In [None]:
from transformers import Trainer, TrainingArguments

# set minimal training arguments for evaluation
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/Colab Notebooks/EvalOutput",
    per_device_eval_batch_size=8,
)

# initialize the trainer instance for evaluation only (no train_dataset);
# `model` and `tokenizer` are whatever the preceding cells left in scope
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,  # current name of the former `tokenizer=` argument (transformers >= 4.46)
    compute_metrics=compute_metrics,
)

# evaluate the model on the test split and keep the metrics dict
evaluation_results = trainer.evaluate()

print(evaluation_results)
