**Download the IMDB Dataset**

In [17]:
# Install and import necessary libraries (if needed)
# Comment: Here we install Kaggle (if we choose to use Kaggle API) and import necessary libraries.

!pip install pandas scikit-learn --quiet
!pip install kaggle --quiet

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
from google.colab import files
uploaded = files.upload()


In [20]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [21]:
# Comment: This command downloads the IMDB dataset (as a .zip file)
# from the Kaggle dataset repository: lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews


Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:02<00:00, 21.2MB/s]
100% 25.7M/25.7M [00:02<00:00, 11.8MB/s]


In [22]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [23]:
# The dataset file is usually named 'IMDB Dataset.csv'
# or 'Dataset.csv' depending on the source.
df = pd.read_csv("IMDB Dataset.csv")

print("Dataframe shape:", df.shape)
df.head()

Dataframe shape: (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


**Step 2: Data Processing**

In [24]:
# Encode sentiment
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Rename columns for clarity (optional, but helps):
df = df.rename(columns={'review': 'review', 'sentiment': 'label'})

# Retain only the 'review' and 'label' columns
df = df[['review', 'label']]

print("Dataframe with relevant columns:")
df.head()

Dataframe with relevant columns:


Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [25]:
# 80% Training, 20% holdout (to be further split into val/test)
train_df, temp_df = train_test_split(df,
                                     test_size=0.2,
                                     random_state=42,
                                     stratify=df['label'])

# Now split the holdout (20%) equally into validation and test sets => 10% each
val_df, test_df = train_test_split(temp_df,
                                   test_size=0.5,
                                   random_state=42,
                                   stratify=temp_df['label'])

print("Train size:", train_df.shape[0])
print("Validation size:", val_df.shape[0])
print("Test size:", test_df.shape[0])

Train size: 40000
Validation size: 5000
Test size: 5000


**Step3: INSTALLING AND IMPORTING HUGGING FACE LIBRARIES**

In [11]:

# INSTALLING AND IMPORTING HUGGING FACE LIBRARIES

!pip install transformers datasets huggingface_hub evaluate  # install these only if necessary

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8

In [12]:
# We select a pre-trained Hugging Face Transformer model.
# Let's choose 'distilbert-base-uncased' as an example.
# Then we tokenize the dataset with truncation, padding,
# and a maximum sequence length of 256.

MODEL_NAME = "distilbert-base-uncased"

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Define a function to tokenize a batch of data
def tokenize_function(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        padding="max_length",     # or "longest" if you prefer dynamic padding
        max_length=256
    )

# We'll prepare our train, validation, and test data for the HuggingFace Trainer.
# Let's first convert the pandas DataFrames to the Hugging Face Dataset objects.
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Map the tokenization function to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset   = val_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Hugging Face datasets typically need the label column named "labels".
# If your label column is 'label', rename it to 'labels' if necessary.
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")
test_dataset  = test_dataset.rename_column("label", "labels")

# We also need to remove the "review" column after tokenization to avoid confusion.
train_dataset = train_dataset.remove_columns(["review", "__index_level_0__"])
val_dataset   = val_dataset.remove_columns(["review", "__index_level_0__"])
test_dataset  = test_dataset.remove_columns(["review", "__index_level_0__"])

# Verify tokenized data
print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'labels': 1, 'input_ids': [101, 1045, 3236, 2023, 2210, 17070, 6135, 2011, 4926, 2067, 1999, 3150, 2030, 1005, 6282, 1012, 1045, 2001, 2012, 1037, 6308, 3004, 2000, 2156, 2048, 2214, 10021, 16596, 1011, 10882, 5691, 1012, 1996, 3004, 2001, 8966, 2440, 1998, 1006, 2007, 2053, 5432, 1007, 2027, 3662, 1037, 9129, 1997, 16596, 1011, 10882, 2460, 11867, 21511, 2015, 1006, 2000, 2131, 2149, 1999, 1996, 6888, 1007, 1012, 2087, 2020, 5399, 19142, 2021, 2023, 2234, 2006, 1998, 1010, 2306, 3823, 1010, 1996, 4378, 2001, 1999, 1044, 27268, 22420, 2015, 999, 1996, 5221, 4756, 2234, 2043, 2027, 3662, 1000, 4615, 21110, 2050, 1000, 2383, 4121, 21229, 21122, 2015, 2612, 1997, 2606, 2006, 2014, 2132, 1012, 2016, 3504, 2012, 1996, 4950, 1010, 3957, 1037, 11844, 2868, 1998, 11232, 1012, 2008, 2081, 2009, 2130, 4569, 14862, 999, 2017, 10657, 2156, 1000, 21271, 19736, 16665, 1000, 2209, 2011, 2054, 3504, 2066, 1037, 14163, 29519, 999, 2009, 2001, 5186, 10021, 1998, 5236, 1012, 1012, 1012, 2021, 1045, 2481

In [13]:
# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Define our evaluation metrics using the 'evaluate' library
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=predictions, references=labels)
    prec = precision_metric.compute(predictions=predictions, references=labels, average="binary")
    rec = recall_metric.compute(predictions=predictions, references=labels, average="binary")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")

    return {
        "accuracy": acc["accuracy"],
        "precision": prec["precision"],
        "recall": rec["recall"],
        "f1": f1["f1"]
    }

# Define training arguments
# Comment: We'll log evaluation metrics, save checkpoints, etc.
training_args = TrainingArguments(
    output_dir="distilbert-imdb-checkpoints",
    evaluation_strategy="epoch",     # Evaluate after each epoch
    learning_rate=5e-5,              # Default or as specified
    per_device_train_batch_size=16,  # or 32 as per requirement
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=100,
    logging_dir="logs",
    save_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmikza[0m ([33mmikza-tampere-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2553,0.231618,0.9158,0.908769,0.9244,0.916518
2,0.1428,0.270429,0.9192,0.913249,0.9264,0.919778


TrainOutput(global_step=5000, training_loss=0.20933591289520265, metrics={'train_runtime': 1903.3787, 'train_samples_per_second': 42.031, 'train_steps_per_second': 2.627, 'total_flos': 5298695946240000.0, 'train_loss': 0.20933591289520265, 'epoch': 2.0})

In [14]:
# Comment: We'll use the .save_pretrained() method on both the model and tokenizer
# to save all the necessary files locally. We specify a local directory name.

SAVE_DIRECTORY = "./distilbert-imdb-model"

# Save the model
model.save_pretrained(SAVE_DIRECTORY)

# Save the tokenizer
tokenizer.save_pretrained(SAVE_DIRECTORY)

print(f"Model and tokenizer saved to: {SAVE_DIRECTORY}")


Model and tokenizer saved to: ./distilbert-imdb-model


In [15]:
# Comment: We'll use the huggingface_hub notebook_login utility to log in with your token.
# Note: If you haven't installed huggingface_hub, install it via pip first (pip install huggingface_hub).

from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# Comment: We'll import the push_to_hub utility from the transformers library.
# This will create a new repository under your Hugging Face account with the model files.

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the model from the local directory (just to confirm everything is correct)
model_reloaded = AutoModelForSequenceClassification.from_pretrained(SAVE_DIRECTORY)
tokenizer_reloaded = AutoTokenizer.from_pretrained(SAVE_DIRECTORY)

# Push to hub
model_reloaded.push_to_hub("mursuturpa/distilbert-imdb-model")
tokenizer_reloaded.push_to_hub("mursuturpa/distilbert-imdb-model")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mursuturpa/distilbert-imdb-model/commit/bda25109fec7494a66344975cd8d5fb8433a8a22', commit_message='Upload tokenizer', commit_description='', oid='bda25109fec7494a66344975cd8d5fb8433a8a22', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mursuturpa/distilbert-imdb-model', endpoint='https://huggingface.co', repo_type='model', repo_id='mursuturpa/distilbert-imdb-model'), pr_revision=None, pr_num=None)

In [17]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="mursuturpa/distilbert-imdb-model")

# Test the pipeline
prediction = classifier("This movie was fantastic! I absolutely loved it.")
print(prediction)


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9974502921104431}]


**Part 2: API Development and Testing (5 points)**

In [6]:
!pip install fastapi uvicorn --quiet
!pip install pydantic --quiet
!pip install colabcode --quiet
!pip install groq --quiet
!pip install pyngrok --upgrade --quiet

In [18]:
import os
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

# For the Hugging Face pipeline
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# For calling Groq's Llama
from groq import Groq

# ========== DATA MODELS FOR FASTAPI ==========
class SentimentRequest(BaseModel):
    text: str
    model: str  # "custom" or "llama"

class SentimentResponse(BaseModel):
    sentiment: str
    confidence: float


# ========== FASTAPI APP INITIALIZATION ==========
app = FastAPI(title="Sentiment Analysis API with Groq", version="1.0")

# ========== LOAD HUGGING FACE MODEL (CUSTOM) LAZILY ==========
hf_pipeline = None

def load_hf_model():
    """
    Loads your fine-tuned Hugging Face model for sentiment analysis
    (DistilBERT, BERT, etc.).
    """
    global hf_pipeline
    if hf_pipeline is None:
        print("Loading custom fine-tuned model from Hugging Face...")
        model_path = "mursuturpa/distilbert-imdb-model"  # or local path, e.g. "./distilbert-imdb-model"
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)

        hf_pipeline = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
    return hf_pipeline


# ========== GROQ LLAMA 3 MODEL CALL ==========
def call_llama3_model(text: str):
    """
    Calls Llama 3.3-70b-versatile on Groq for sentiment analysis.
    We'll use a system prompt to instruct the model how to respond.
    """
    # Initialize Groq client with your API key from env
    client = Groq(
        api_key=os.environ.get("token123")
    )

    # Use a structured system prompt telling the model to output sentiment & confidence
    # Example system prompt. You can refine to suit your needs.
    system_prompt = {
        "role": "system",
        "content": (
            "You are a sentiment analysis assistant. "
            "Given the user's text, respond in JSON with two fields: "
            "'sentiment' (positive or negative) and 'confidence' (a floating-point number). "
            "For example: {\"sentiment\": \"positive\", \"confidence\": 0.87}"
        )
    }

    # The user prompt is the text to analyze
    user_prompt = {
        "role": "user",
        "content": text
    }

    # Call the Groq chat completion endpoint
    chat_completion = client.chat.completions.create(
        messages=[system_prompt, user_prompt],
        model="llama-3.3-70b-versatile",  # The specific Llama model on Groq
    )

    # Extract the content from the first choice
    content = chat_completion.choices[0].message.content

    # Now parse the JSON that we asked the model to return
    import json
    try:
        parsed = json.loads(content.strip())
        sentiment_label = parsed.get("sentiment", "unknown").lower()
        confidence_score = float(parsed.get("confidence", 0.0))
    except Exception as e:
        # If parsing fails, handle gracefully
        sentiment_label = "unknown"
        confidence_score = 0.0
        print("Error parsing Llama 3 output:", e)

    return sentiment_label, confidence_score


# ========== FASTAPI ROUTE: /analyze/ ==========
@app.post("/analyze/", response_model=SentimentResponse)
def analyze_sentiment(request: SentimentRequest):
    """
    POST endpoint that:
    - Accepts 'text' (string) and 'model' ('custom' or 'llama')
    - Returns sentiment ('positive' or 'negative') and confidence score
    """
    text = request.text
    model_choice = request.model.lower().strip()

    if model_choice == "custom":
        # Use the HF fine-tuned DistilBERT (or another model)
        pipeline_instance = load_hf_model()
        result = pipeline_instance(text)[0]  # returns something like: {'label': 'POSITIVE', 'score': 0.98}
        sentiment_label = result["label"].lower()
        confidence_score = float(result["score"])

    elif model_choice == "llama":
        # Use the Groq Llama 3 model
        sentiment_label, confidence_score = call_llama3_model(text)
    else:
        sentiment_label, confidence_score = "unknown", 0.0

    return {"sentiment": sentiment_label, "confidence": confidence_score}


In [31]:
!ngrok authtoken token123
from pyngrok import ngrok

public_url = ngrok.connect(addr=8000)
print("Public URL:", public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


ERROR:pyngrok.process.ngrok:t=2025-02-04T19:39:10+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok tunnel authtoken.\nYour authtoken: cr_2saZLdurVP7Hunjowg6qOPtiOPY\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"
ERROR:pyngrok.process.ngrok:t=2025-02-04T19:39:10+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: The authtoken you specified does not look like a proper ngrok tunnel authtoken.\nYour authtoken: cr_2saZLdurVP7Hunjowg6qOPtiOPY\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n"
ERROR:pyngrok.process.ngrok:t=2025-02-04T19:39:10+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: The authtoken you specified does 

PyngrokNgrokError: The ngrok process errored on start: authentication failed: The authtoken you specified does not look like a proper ngrok tunnel authtoken.\nYour authtoken: cr_2saZLdurVP7Hunjowg6qOPtiOPY\nInstructions to install your authtoken are on your ngrok dashboard:\nhttps://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_105\r\n.

SyntaxError: unterminated string literal (detected at line 79) (<ipython-input-32-603593a76b40>, line 79)