In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# import os
# os.chdir('/content/drive/My Drive/TikTok Hackathon/data gzip')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


User guide:

To see how the model is trained, and how is it's performance, please go to **Model Training**.

Note that the model parameters for DistilBERT is **too big to upload to Github (>100MB)**, therefore, the trained model could not be uploaded. If you wish to test out model performance, please run the model training on your own selected dataset. After obtaining the model parameters, you could further test out the model performance. 

## Fine-Tuning
This model is a DistilBERT-based sequence classification model fine-tuned to categorize online reviews into four classes:


*   ads_spam – promotional or spam content
*   irrelevant – off-topic or unrelated content
*   no first-hand experience – reviews without direct experience
*   relevant review – genuine, actionable user feedback

Architecture & Training Details:

*   Base Model: distilbert-base-uncased, lightweight and fast, chosen for experimentation
*   Tokenizer: Hugging Face AutoTokenizer for text preprocessing
*   Input: Review text
*   Output: Predicted class label
*   Loss Function: Cross-entropy
*   Metrics: Accuracy, precision, recall, F1-score












### Model Training

First, load the test dataset. This is a very small test set with only ~100 reviews.

In [3]:
import pandas as pd
df = pd.read_csv("df_test.csv")
df = df.drop_duplicates()

In [21]:
df.value_counts("category")

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
irrelevant,33
ads_spam,30
relevant review,20
no first-hand experience,12


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Load CSV
df = pd.read_csv("df_test.csv")

# Map text labels to integers (keep column name 'category')
label2id = {"ads_spam":0, "irrelevant":1, "no first-hand experience":2, "relevant review":3}
id2label = {v:k for k,v in label2id.items()}
df["category"] = df["category"].map(label2id)

# Stratified train-test split
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["category"], random_state=42
)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load tokenizer and model
model_name = "distilbert-base-uncased"
num_labels = 4
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Tokenization
def tokenize(batch):
    tokens = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=64)
    tokens["labels"] = batch["category"]  # map category to labels internally
    return tokens

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./demo_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=2,
    save_strategy="no",
    report_to="tensorboard"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Final evaluation
results = trainer.evaluate(test_dataset)
print("Final Test Results:", results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]



Step,Training Loss
2,1.3428
4,1.3518
6,1.3812
8,1.1033
10,1.1444
12,1.2916
14,1.5009
16,1.0577
18,1.1002
20,1.2866


Final Test Results: {'eval_loss': 0.49300333857536316, 'eval_accuracy': 0.8421052631578947, 'eval_precision': 0.7644736842105263, 'eval_recall': 0.8421052631578947, 'eval_f1': 0.7999999999999999, 'eval_runtime': 1.6553, 'eval_samples_per_second': 11.478, 'eval_steps_per_second': 1.812, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Final Test Results:

1.   'eval_loss': 0.49300333857536316,
2.   'eval_accuracy': 0.8421052631578947,
1.    'eval_precision': 0.7644736842105263,
2.   'eval_recall': 0.8421052631578947,
1.   'eval_f1': 0.7999999999999999,
2.   'eval_runtime': 1.6553,
1.   'eval_samples_per_second': 11.478
2.   'eval_steps_per_second': 1.812,
1.   'epoch': 3.0


The warning messages shows that some of the classes were not predictied at all in the test set. This might be due to the small size of the dataset.

### Saving the model and components

In [8]:
model.save_pretrained("./demo_model")
tokenizer.save_pretrained("./demo_model")


('./demo_model/tokenizer_config.json',
 './demo_model/special_tokens_map.json',
 './demo_model/vocab.txt',
 './demo_model/added_tokens.json',
 './demo_model/tokenizer.json')

## Deploying the Model

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

def classify_reviews(input_csv: str, output_csv: str = None, model_path: str = "./demo_model"):
    """
    Load a CSV file, run the fine-tuned model on the 'text' column,
    and save the results with predictions.

    Parameters:
    - input_csv: Path to input CSV file containing a 'text' column.
    - output_csv: Path to save output CSV with predictions. Defaults to 'input_csv_with_predictions.csv'.
    - model_path: Path to the fine-tuned model directory.

    Returns:
    - DataFrame with original data plus 'predicted_category' and 'predicted_score' columns.
    """

    # Load raw data
    df = pd.read_csv(input_csv)

    # Load fine-tuned model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Create pipeline
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer
    )

    # Run predictions
    predictions = classifier(df["text"].tolist(), batch_size=16)

    # Map predicted labels to numbers
    label2id = {"ads_spam":0, "irrelevant":1, "no first-hand experience":2, "relevant review":3}
    df["predicted_category"] = [label2id[pred["label"]] for pred in predictions]
    df["predicted_score"] = [pred["score"] for pred in predictions]

    # Save output
    if output_csv is None:
        output_csv = input_csv.replace(".csv", "_with_predictions.csv")
    df.to_csv(output_csv, index=False)

    print(f"Predictions added. Saved to {output_csv}")
    return df


### To deploy, simply call the function
**Note:** it is assumed that the dataset has a column named `text`, which contains the review contents.

In [None]:
# Example:
# df_result = classify_reviews("new_reviews.csv")
