In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks

Mounted at /content/drive
/content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks


In [None]:
!pip install datasets transformers evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch
import evaluate
import os

# Load the preprocessed (already tokenized) IMDB dataset from Drive
TOKENIZED_DS_PATH = "/content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks/tokenized_imdb"
tokenized_ds = load_from_disk(TOKENIZED_DS_PATH)


In [None]:
# DistilBERT backbone with a fresh sequence-classification head.
# IMDB sentiment is binary (pos/neg), hence two output labels.
model_checkpoint = "distilbert-base-uncased"
NUM_LABELS = 2

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Reduce GPU usage: train and validate on a 20% random sample of each split.
SUBSET_FRACTION = 0.2
SHUFFLE_SEED = 42

train_split = tokenized_ds["train"]
test_split = tokenized_ds["test"]

n_train = int(SUBSET_FRACTION * len(train_split))
n_test = int(SUBSET_FRACTION * len(test_split))

# Shuffle with a fixed seed, then keep the first n rows of the shuffled split.
small_train_ds = train_split.shuffle(seed=SHUFFLE_SEED).select(range(n_train))
small_test_ds = test_split.shuffle(seed=SHUFFLE_SEED).select(range(n_test))

In [None]:
# Runtime switches, set through environment variables before training starts:
#   WANDB_DISABLED           - turns off Weights & Biases logging
#   PYTORCH_CUDA_ALLOC_CONF  - passes "expandable_segments:True" to PyTorch's CUDA allocator
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

In [None]:
# Fine-tuning configuration: 2 epochs, batch size 8, evaluation and a checkpoint
# at the end of every epoch. `./results` and `./logs` are relative to the
# current working directory.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy` argument
    save_strategy="epoch",
    logging_dir='./logs',
    # Mixed precision lowers memory use but needs a CUDA device; fall back to
    # full precision so the cell still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    learning_rate=2e-5,
    disable_tqdm=False,
    # Explicitly turn off external logging integrations (wandb etc.); this is the
    # supported replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Trainer wires together the model, the arguments above and the reduced
# train / validation subsets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_ds,
    eval_dataset=small_test_ds,
    # `processing_class` supersedes the deprecated `tokenizer=` keyword.
    processing_class=tokenizer,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
# Train the model
# Runs the fine-tuning loop configured on `trainer`. The call returns a
# TrainOutput (global step, training loss and runtime metrics), which is
# displayed as this cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3129,0.286791
2,0.2025,0.369405


TrainOutput(global_step=1250, training_loss=0.23568124389648437, metrics={'train_runtime': 194.7893, 'train_samples_per_second': 51.338, 'train_steps_per_second': 6.417, 'total_flos': 1324673986560000.0, 'train_loss': 0.23568124389648437, 'epoch': 2.0})

In [None]:
# Save the fine-tuned model together with its tokenizer, so the output
# directory is self-contained and can be reloaded for inference without
# going back to the base checkpoint for the tokenizer files.
MODEL_OUTPUT_DIR = "/content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks/distilbert_sentiment_model"
model.save_pretrained(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

In [None]:
# Clear GPU memory
# Asks PyTorch's CUDA caching allocator to release cached blocks that are no
# longer in use.
# NOTE(review): `model` and `trainer` are still referenced at this point, so the
# memory backing the model itself is presumably not released here — confirm if
# more headroom is needed before prediction.
torch.cuda.empty_cache()

In [None]:
# Accuracy metric plus a small evaluation sample: the first 100 reviews of the
# seed-42 shuffled test split.
EVAL_SAMPLE_SIZE = 100

metric = evaluate.load("accuracy")

shuffled_test = tokenized_ds["test"].shuffle(seed=42)
small_eval = shuffled_test.select(range(EVAL_SAMPLE_SIZE))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Run the fine-tuned model over the evaluation sample, then pick the
# highest-scoring class for every review.
preds = trainer.predict(small_eval)
logits = torch.tensor(preds.predictions)
predictions = logits.argmax(dim=1)

In [None]:
# Compare predicted classes against the gold labels of the evaluation sample
gold_labels = small_eval["label"]
results = metric.compute(references=gold_labels, predictions=predictions)
print("Accuracy:", results["accuracy"])


Accuracy: 0.9


In [None]:
# Show a few reviews side by side with their true and predicted sentiment
def _sentiment_name(class_id):
    """Return "Positive" for class id 1 and "Negative" for anything else."""
    return "Positive" if class_id == 1 else "Negative"


print("\nSample Sentiment Analysis Results:")
for i in range(3):  # Adjust number of samples shown
    row = small_eval[i]  # fetch the example once and reuse it below
    review = row['text'][:300]  # First 300 characters
    label = _sentiment_name(row['label'])
    prediction = _sentiment_name(preds.predictions[i].argmax())

    print(f"Review:\n{review}...\n")
    print(f"True Sentiment: {label}")
    print(f"Predicted Sentiment: {prediction}")
    print("-" * 100)


Sample Sentiment Analysis Results:
Review:
when i unsuspectedly rented a thousand acres i thought i was in for an entertaining king lear story and of course michelle pfeiffer was in it so what could go wrong  very quickly however i realized that this story was about a thousand other things besides just acres i started crying and couldnt stop...

True Sentiment: Positive
Predicted Sentiment: Positive
----------------------------------------------------------------------------------------------------
Review:
this is the latest entry in the long series of films with the french agent oss  the french answer to james bond the series was launched in the early s and spawned at least eight films none of which was ever released in the us osscaironest of spies is a  eezy little comedy that should notrepeat not b...

True Sentiment: Positive
Predicted Sentiment: Positive
----------------------------------------------------------------------------------------------------
Review:
this movie was so 