In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(
    "final_cleaned_df.csv",
    delimiter=",",
    quotechar='"',
    escapechar="\\",
    engine="python"
)

In [6]:
from datasets import Dataset
from transformers import BertTokenizer, set_seed
import random
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [7]:
import os
os.environ["PYTHONHASHSEED"] = "42"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
set_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(False)

In [8]:
print(df.iloc[0]['label'].astype(int))

1


In [9]:
df['label'] = df['label'].astype(int)

In [10]:
print(f"Dataset has {len(df)} rows after processing")
print("Label counts:")
print(df['label'].value_counts())

Dataset has 30635 rows after processing
Label counts:
label
1    15351
0    15284
Name: count, dtype: int64


In [11]:
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['tconst', 'movie_title', 'year', 'numVotes', 'label', 'genre', 'content_rating', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'audience_status', 'audience_rating', 'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'],
    num_rows: 30635
})


In [12]:
dataset = Dataset.from_pandas(df)
print(dataset)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def tokenize(examples):
    # Make sure we have valid texts in the 'reviews' column
    texts = examples['reviews']

    # Replace any None or NaN values with empty strings to avoid tokenization errors
    texts = [str(text) if text is not None else "" for text in texts]

    return tokenizer(texts, padding='max_length', truncation=True)

Dataset({
    features: ['tconst', 'movie_title', 'year', 'numVotes', 'label', 'genre', 'content_rating', 'production_company', 'tomatometer_status', 'tomatometer_rating', 'audience_status', 'audience_rating', 'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'],
    num_rows: 30635
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
tokenized = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/30635 [00:00<?, ? examples/s]

In [14]:
train_test_val = tokenized.train_test_split(test_size=0.2)
train_dataset = train_test_val['train']
val_dataset = train_test_val['test']

In [15]:
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup, EarlyStoppingCallback

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

num_epochs = 2
batch_size = 16

total_steps = len(train_dataset) * num_epochs // batch_size
warmup_steps = int(0.1 * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



In [17]:
training_args = TrainingArguments(
    report_to="none",
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    #fp16=True,
    weight_decay=0.01,
)
# Define Trainer with model, arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5264,0.546452,0.737392,0.681575,0.846192,0.570577
2,0.4189,0.455643,0.786519,0.779501,0.793411,0.76607


TrainOutput(global_step=3064, training_loss=0.519610775979941, metrics={'train_runtime': 5183.2386, 'train_samples_per_second': 9.457, 'train_steps_per_second': 0.591, 'total_flos': 1.289665148952576e+16, 'train_loss': 0.519610775979941, 'epoch': 2.0})

In [21]:
metrics

{'eval_loss': 0.45564281940460205,
 'eval_accuracy': 0.7865186877754202,
 'eval_f1': 0.7795010114632501,
 'eval_precision': 0.7934111187371311,
 'eval_recall': 0.7660702451954937,
 'eval_runtime': 198.7024,
 'eval_samples_per_second': 30.835,
 'eval_steps_per_second': 1.928,
 'epoch': 2.0}

In [18]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.45564281940460205, 'eval_accuracy': 0.7865186877754202, 'eval_f1': 0.7795010114632501, 'eval_precision': 0.7934111187371311, 'eval_recall': 0.7660702451954937, 'eval_runtime': 198.7024, 'eval_samples_per_second': 30.835, 'eval_steps_per_second': 1.928, 'epoch': 2.0}


In [None]:
output_dir = "./sentiment_model/"
model_path = os.path.join(output_dir, "model")
tokenizer_path = os.path.join(output_dir, "tokenizer")

In [None]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [24]:
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)
print(f"Model saved to {model_path}")
print(f"Tokenizer saved to {tokenizer_path}")

Model saved to ./sentiment_model/model
Tokenizer saved to ./sentiment_model/tokenizer


In [25]:
import shutil
shutil.make_archive("sentiment_model", 'zip', output_dir)
from google.colab import files
files.download("sentiment_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TEXT TO USE THE MODEL DOWNLOADED

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("./sentiment_model/model")
loaded_tokenizer = BertTokenizer.from_pretrained("./sentiment_model/tokenizer")

# Use the model for inference
inputs = loaded_tokenizer("This movie was great!", return_tensors="pt")
outputs = loaded_model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)  # [negative_prob, positive_prob]