## Part 1: Dataset Preparation and Fine-Tuning

### Step 1: Download the IMDB Dataset

In [5]:
import shutil
import os

# Install the uploaded Kaggle API token where the Kaggle CLI looks for it.
kaggle_dir = '/root/.kaggle'
uploaded_token = '/content/kaggle.json'
kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')

os.makedirs(kaggle_dir, exist_ok=True)

# Move the kaggle.json file to the correct directory.
# On a re-run the upload has already been moved, so only move it when it is
# still present and fail with a clear message when no token exists at all.
if os.path.exists(uploaded_token):
    shutil.move(uploaded_token, kaggle_token)
elif not os.path.exists(kaggle_token):
    raise FileNotFoundError(
        f"Kaggle API token not found at {uploaded_token} or {kaggle_token}"
    )

# Set the correct permissions for the file.
# The mode must be an octal literal: decimal 600 equals 0o1130, which is not
# owner read/write.
os.chmod(kaggle_token, 0o600)


# Install Kaggle API
!pip install kaggle

# Use the Kaggle API to download the dataset
zip_file = 'imdb-movie-ratings-sentiment-analysis.zip'

# Check if the zip file already exists
if not os.path.exists(zip_file):
    print("Downloading the dataset...")
    # Use the Kaggle API to download the dataset
    !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
else:
    print("Dataset already downloaded.")


import zipfile

!ls
!ls /content/

# Unzip the downloaded dataset
with zipfile.ZipFile('imdb-dataset-of-50k-movie-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')


import pandas as pd



# Location of the CSV extracted from the Kaggle archive
file_path = "/content/IMDB Dataset.csv"
df = pd.read_csv(file_path)

# Display basic information
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() adds a stray "None" line to the output.
df.info()  # Check dataset structure
print(df.head())  # Display first few rows

Downloading the dataset...
Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   sample_data
'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   sample_data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family 

### Step 2: Data Preprocessing

In [8]:
import pandas as pd

# Load dataset (re-read from disk so this cell gives the same result on re-run)
file_path = "/content/IMDB Dataset.csv"
df = pd.read_csv(file_path)

# Encode sentiment column
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map turns any value missing from the mapping into NaN, which would
# silently produce float labels; fail fast instead.
unmapped = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected sentiment values: {list(unmapped)}")

# Retain only the review and label columns
df = df[['review', 'label']]

# Display the cleaned dataset
print(df.head(10))
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
df.info()


                                              review  label
0  One of the other reviewers has mentioned that ...      1
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      1
3  Basically there's a family where a little boy ...      0
4  Petter Mattei's "Love in the Time of Money" is...      1
5  Probably my all-time favorite movie, a story o...      1
6  I sure would like to see a resurrection of a u...      1
7  This show was an amazing, fresh & innovative i...      0
8  Encouraged by the positive comments about this...      0
9  If you like original gut wrenching laughter yo...      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB
None


In [9]:
from sklearn.model_selection import train_test_split

# Carve off 20% of the rows as a hold-out pool, keeping the label balance
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Halve the hold-out pool into validation and test parts, again stratified
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['label'],
)

# Report how many rows ended up in each part
for split_name, split_frame in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Testing", test_data),
):
    print(f"{split_name} set: {len(split_frame)}")


Training set: 40000
Validation set: 5000
Testing set: 5000


### Step 3: Model Selection and Tokenization

In [10]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [24]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# DistilBERT checkpoint shared by the tokenizer and the classifier
model_name = "distilbert-base-uncased"

# Build both objects from the same checkpoint; the sequence-classification
# model is created with two output labels.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2),
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:


# Apply tokenization
import pandas as pd
from datasets import Dataset


# Convert the pandas DataFrame `df` into a Hugging Face Dataset so it can be
# processed with `Dataset.map`.
# NOTE(review): this converts the entire frame, not a pre-made
# train/validation/test split — confirm that splitting afterwards is intended.
dataset = Dataset.from_pandas(df)





def tokenize_function(examples):
    """Tokenize the "review" field of `examples` with the global `tokenizer`.

    Every sequence is truncated and padded to exactly 256 tokens. The
    tokenizer's output is returned unchanged.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["review"], **encode_options)

# Tokenize in batches, rename "label" to "labels" (the column name used for
# training), and drop the raw review text, which is no longer needed.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
    .remove_columns(["review"])
)

# Show the first tokenized example
print(tokenized_dataset[0])


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'labels': 1, 'input_ids': [101, 2028, 1997, 1996, 2060, 15814, 2038, 3855, 2008, 2044, 3666, 2074, 1015, 11472, 2792, 2017, 1005, 2222, 2022, 13322, 1012, 2027, 2024, 2157, 1010, 2004, 2023, 2003, 3599, 2054, 3047, 2007, 2033, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2034, 2518, 2008, 4930, 2033, 2055, 11472, 2001, 2049, 24083, 1998, 4895, 10258, 2378, 8450, 5019, 1997, 4808, 1010, 2029, 2275, 1999, 2157, 2013, 1996, 2773, 2175, 1012, 3404, 2033, 1010, 2023, 2003, 2025, 1037, 2265, 2005, 1996, 8143, 18627, 2030, 5199, 3593, 1012, 2023, 2265, 8005, 2053, 17957, 2007, 12362, 2000, 5850, 1010, 3348, 2030, 4808, 1012, 2049, 2003, 13076, 1010, 1999, 1996, 4438, 2224, 1997, 1996, 2773, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 2003, 2170, 11472, 2004, 2008, 2003, 1996, 8367, 2445, 2000, 1996, 17411, 4555, 3036, 2110, 7279, 4221, 12380, 2854, 1012, 2009, 7679, 3701, 2006, 14110, 2103, 1010, 2019, 6388, 2930, 1997, 1996, 3827, 2073, 2035, 1996, 4442, 2031,

### Step 4: Fine-Tune the Model

In [30]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Split the tokenized data 80/10/10 into train/validation/test.
# A fixed seed makes the shuffle, and therefore the split and every reported
# metric, reproducible across kernel restarts.
train_testvalid = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid["train"]
valid_dataset = test_valid["train"]
test_dataset = test_valid["test"]


print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(valid_dataset)}")
print(f"Testing samples: {len(test_dataset)}")


Training samples: 40000
Validation samples: 5000
Testing samples: 5000


In [15]:
from transformers import TrainingArguments

# Hyperparameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Kept equal to the evaluation strategy, which load_best_model_at_end needs
    save_strategy="epoch",  # Save model at the end of each epoch
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)




In [18]:
!pip install evaluate

import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments

# Load the four metrics, in the same order as the names on the left
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    `eval_pred` is unpacked into `(logits, labels)`. The predicted class is the
    argmax over the last axis of the logits. Precision, recall and F1 are
    computed with `average="binary"`.

    Returns a dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    shared = {"predictions": predictions, "references": labels}

    scores = {"accuracy": accuracy_metric.compute(**shared)["accuracy"]}
    binary_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in binary_metrics:
        scores[key] = metric.compute(average="binary", **shared)[key]
    return scores


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [31]:
# Wire the model, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2657,0.214495,0.919,0.930833,0.903356,0.916889
2,0.1393,0.271384,0.921,0.919628,0.920744,0.920186


TrainOutput(global_step=5000, training_loss=0.20842278260588645, metrics={'train_runtime': 1856.6118, 'train_samples_per_second': 43.089, 'train_steps_per_second': 2.693, 'total_flos': 5298695946240000.0, 'train_loss': 0.20842278260588645, 'epoch': 2.0})

In [None]:
# Evaluate the trainer's model on the test split and print the resulting metrics
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)


### Step 5: Save and Upload the Model to Hugging Face

In [32]:
# Write the fine-tuned model and the tokenizer into the same folder.
# The tokenizer call stays last so its return value is the cell's output.
save_dir = "./fine_tuned_imdb"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./fine_tuned_imdb/tokenizer_config.json',
 './fine_tuned_imdb/special_tokens_map.json',
 './fine_tuned_imdb/vocab.txt',
 './fine_tuned_imdb/added_tokens.json',
 './fine_tuned_imdb/tokenizer.json')

In [33]:
from huggingface_hub import notebook_login

# Log in to Hugging Face (run this cell and follow instructions).
# NOTE(review): presumably the token entered here needs write access for the
# `push_to_hub` calls that follow — confirm.
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:

# Name of the Hub repository that receives both the model and the tokenizer
repo_name = "ft-imdb-distilbert"

# Push the model to Hugging Face
model.push_to_hub(repo_name)
# Push the tokenizer to the same repository; as the cell's last expression,
# its return value is what the notebook displays.
tokenizer.push_to_hub(repo_name)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/niklassuvitie/ft-imdb-distilbert/commit/026e687eb72c4c3f9aefe0a0d989e6a7f531a09b', commit_message='Upload tokenizer', commit_description='', oid='026e687eb72c4c3f9aefe0a0d989e6a7f531a09b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/niklassuvitie/ft-imdb-distilbert', endpoint='https://huggingface.co', repo_type='model', repo_id='niklassuvitie/ft-imdb-distilbert'), pr_revision=None, pr_num=None)

Published model: https://huggingface.co/niklassuvitie/ft-imdb-distilbert

## Part 2: API Development and Testing

Repository: https://github.com/niqdevgit/LLM-api-example

## Part 3: UI Design and Explanation

Repository: https://github.com/niqdevgit/LLM-api-example

Video: https://youtu.be/ZQ1q64cwxlU