In [1]:
import pandas as pd
import re


from sklearn.metrics import accuracy_score
import joblib

In [2]:
columns = ['target', 'id', 'date', 'query', 'user', 'text']

df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=columns
)

df.head()

Unnamed: 0,target,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
len(df)

1600000

In [4]:
df_scaled = pd.read_csv('/kaggle/input/sentiment-scaled-dataset/cleaned_sentiment_data.csv')
df_scaled.head()

Unnamed: 0,target,id,date,flag,user,text,clean_text
0,4,1835324555,Mon May 18 05:43:27 PDT 2009,NO_QUERY,666TheBeast666,mood swings mood swings =( mood swings &gt; m...,mood swings mood swings =( mood swings &gt; mo...
1,4,1998937009,Mon Jun 01 19:22:14 PDT 2009,NO_QUERY,cephalopod_gal,@doubtme Indeed! I'll add you when this episod...,indeed! add episode star trek
2,0,2218830692,Wed Jun 17 23:13:59 PDT 2009,NO_QUERY,emilyisfinee_09,aww anna i'm sorry if u need me i'm here.,aww anna sorry u need here.
3,0,2300933313,Tue Jun 23 14:46:54 PDT 2009,NO_QUERY,rhonda416,Battery is dying-no charger-another hour befor...,battery dying-no charger-another hour leaving
4,4,2063505695,Sun Jun 07 02:56:09 PDT 2009,NO_QUERY,AyeBloodyRight,@MrsJames_Waters how've you been mrs?,how've mrs?


In [5]:
len(df_scaled)

50000

In [6]:
def clean_tweet(text):
  text = text.lower()
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'@\S+', '', text)

  return text.strip()

In [7]:
df['clean_text'] = df['text'].apply(clean_tweet)
df.head()

Unnamed: 0,target,id,date,query,user,text,clean_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got davi..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ..."


In [8]:
df.isna().sum()

target        0
id            0
date          0
query         0
user          0
text          0
clean_text    0
dtype: int64

In [9]:
df_scaled.isna().sum()

target          0
id              0
date            0
flag            0
user            0
text            0
clean_text    135
dtype: int64

In [10]:
df_scaled.dropna(inplace = True)

In [11]:
# Original Dataset
from sklearn.model_selection import train_test_split

def split(df):
    TRAIN_SIZE = 0.70
    TEST_SIZE = 0.15
    VAL_SIZE = 0.15
    
    train_val, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=42, stratify = df['target'])
    
    adjusted_val_size = VAL_SIZE/(TRAIN_SIZE+VAL_SIZE)
    
    train_df, val_df = train_test_split(train_val, test_size=adjusted_val_size, random_state=42, stratify = train_val['target'])
    
    print(f"  Total samples: {len(df):,}")
    print(f"  Training:   {len(train_df):,} ({len(train_df)/len(df)*100:.1f}%)")
    print(f"  Validation: {len(val_df):,} ({len(val_df)/len(df)*100:.1f}%)")
    print(f"  Test:       {len(test_df):,} ({len(test_df)/len(df)*100:.1f}%)")

    return train_df, test_df, val_df

In [12]:
print('Splitting Original Dataset')
train_df, test_df, val_df = split(df)
print('\nSplitting Scaled Dataset')
train_df_scaled, test_df_scaled, val_df_scaled = split(df_scaled)

Splitting Original Dataset
  Total samples: 1,600,000
  Training:   1,119,999 (70.0%)
  Validation: 240,001 (15.0%)
  Test:       240,000 (15.0%)

Splitting Scaled Dataset
  Total samples: 49,865
  Training:   34,905 (70.0%)
  Validation: 7,480 (15.0%)
  Test:       7,480 (15.0%)


In [13]:
X_train = train_df['clean_text']
y_train = train_df['target']

X_val = val_df['clean_text']
y_val = val_df['target']

X_train_scaled = train_df_scaled['clean_text']
y_train_scaled = train_df_scaled['target']

X_val_scaled = val_df_scaled['clean_text']
y_val_scaled = val_df_scaled['target']

## Classical ML

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "Linear SVM": LinearSVC() 
}

vectorizer = TfidfVectorizer(max_features=100000, ngram_range=(1,2))
print("Vectorizing Oringal Dataset")
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

print("Vectorizing Scaled Dataset")
X_train_scaled = vectorizer.fit_transform(X_train_scaled)
X_val_scaled = vectorizer.transform(X_val_scaled)

Vectorizing Oringal Dataset
Vectorizing Scaled Dataset


In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_metrics(y_val, y_pred, model_name="Model"):
    print(f"\nEvaluation for {model_name}")
    
    acc = accuracy_score(y_val, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='weighted')
    
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    
    # 2. Detailed Report
    print(classification_report(y_val, y_pred))
    cm = confusion_matrix(y_val, y_pred)
    
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall, "cm": cm}

In [17]:
def finetune(models, X_train, y_train, X_val, y_val):
    results = {}
    
    for name, model in models.items():
        print(f"----Training {name}----")
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        metrics = evaluate_metrics(y_val, preds, model_name=name)
        results[name] = metrics
        
    return results

In [18]:
print('---Fine-Tuning On Original Dataset---')
results = finetune(models, X_train, y_train, X_val, y_val)
print('---Fine-Tuning On Scaled Dataset---')
results_scaled = finetune(models, X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled)

---Fine-Tuning On Original Dataset---
----Training Naive Bayes----

Evaluation for Naive Bayes
Accuracy:  0.7994
Precision: 0.7995
Recall:    0.7994
F1 Score:  0.7994
              precision    recall  f1-score   support

           0       0.80      0.80      0.80    120001
           4       0.80      0.79      0.80    120000

    accuracy                           0.80    240001
   macro avg       0.80      0.80      0.80    240001
weighted avg       0.80      0.80      0.80    240001

----Training Logistic Regression----

Evaluation for Logistic Regression
Accuracy:  0.8195
Precision: 0.8196
Recall:    0.8195
F1 Score:  0.8194
              precision    recall  f1-score   support

           0       0.83      0.81      0.82    120001
           4       0.81      0.83      0.82    120000

    accuracy                           0.82    240001
   macro avg       0.82      0.82      0.82    240001
weighted avg       0.82      0.82      0.82    240001

----Training Linear SVM----

Evalu

## DistilBert

In [19]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip install optimum[onnxruntime]

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Collecting optimum[onnxruntime]
  Downloading optimum-2.1.0-py3-none-any.whl.metadata (14 kB)
Collecting optimum-onnx[onnxruntime] (from optimum[onnxruntime])
  Downloading optimum_onnx-0.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting onnxruntime>=1.18.0 (from optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime])
  Downloading onnxruntime-1.24.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Downloading optimum-2.1.0-py3-none-any.whl (161 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m161.2/161.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.24.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [20]:
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

2026-02-06 09:56:19.342146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770371779.538602      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770371779.592979      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770371780.121837      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770371780.121871      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770371780.121874      24 computation_placer.cc:177] computation placer alr

In [21]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 64
max_length = 64

In [22]:
def clean_tweet_for_bert(text):
    text = str(text).lower()                   
    text = re.sub(r'@[A-Za-z0-9_]+', '', text) 
    text = re.sub(r'http\S+', '', text)        
    text = " ".join(text.split())
    return text.strip()

df['clean_text_bert'] = df['text'].apply(clean_tweet_for_bert)

In [23]:
from datasets import Dataset

def create_hf_dataset(df, text_col='clean_text', target_col='target'):
    target_map = {0: 0, 4: 1}

    df_processed = df.copy()
    df_processed['label'] = df_processed[target_col].map(target_map)
    
    df_processed = df_processed[[text_col, 'label']]
    
    return Dataset.from_pandas(df_processed)

train_dataset = create_hf_dataset(train_df)
val_dataset   = create_hf_dataset(val_df)

train_dataset_scaled = create_hf_dataset(train_df_scaled)
val_dataset_scaled   = create_hf_dataset(val_df_scaled)

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    return tokenizer(examples["clean_text"], truncation=True, padding="max_length", max_length=max_length)

print("Tokenizing Original Data...")
encoded_train = train_dataset.map(preprocess_function, batched=True, desc="Tokenizing Train")
encoded_val = val_dataset.map(preprocess_function, batched=True, desc="Tokenizing Val")

print("Tokenizing Scaled Data...")
encoded_train_scaled = train_dataset_scaled.map(preprocess_function, batched=True, desc="Tokenizing Train")
encoded_val_scaled = val_dataset_scaled.map(preprocess_function, batched=True, desc="Tokenizing Val")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing Original Data...


Tokenizing Train:   0%|          | 0/1119999 [00:00<?, ? examples/s]

Tokenizing Val:   0%|          | 0/240001 [00:00<?, ? examples/s]

Tokenizing Scaled Data...


Tokenizing Train:   0%|          | 0/34905 [00:00<?, ? examples/s]

Tokenizing Val:   0%|          | 0/7480 [00:00<?, ? examples/s]

In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [26]:
tqdm.pandas() 

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=50, 
    disable_tqdm=False,
    report_to="none" 
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
import os
from optimum.onnxruntime import ORTModelForSequenceClassification
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTQuantizer

def quantize_to_onnx(input_model_path, output_dir):
    print(f"üöÄ Starting conversion for: {input_model_path}")
    
    model_onnx = ORTModelForSequenceClassification.from_pretrained(
        input_model_path,
        export=True
    )

    qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=True)
    
    quantizer = ORTQuantizer.from_pretrained(model_onnx)
    
    quantizer.quantize(
        save_dir=output_dir,
        quantization_config=qconfig
    )
    
    final_file = os.path.join(output_dir, "model_quantized.onnx")
    size_mb = os.path.getsize(final_file) / (1024 * 1024)
    print(f"Model saved to: {final_file}")
    print(f"Final Size: {size_mb:.2f} MB")

Multiple distributions found for package optimum. Picked distribution: optimum-onnx


In [28]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Training on Original Dataset...")
trainer.train()
trainer.save_model("./distilbert_original")
quantize_to_onnx("./distilbert_original", "./onnx_quantized_original")

  trainer = Trainer(


Training on Original Dataset...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3261,0.322312,0.860321,0.860306,0.860478,0.860321


`torch_dtype` is deprecated! Use `dtype` instead!


üöÄ Starting conversion for: ./distilbert_original


  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


Model saved to: ./onnx_quantized_original/model_quantized.onnx
Final Size: 64.45 MB


In [29]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Training on Scaled Dataset...")
trainer.train()
trainer.save_model("./distilbert_scaled")
quantize_to_onnx("./distilbert_scaled", "./onnx_quantized_scaled")

Training on Scaled Dataset...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3233,0.316078,0.863792,0.863778,0.863949,0.863792


üöÄ Starting conversion for: ./distilbert_scaled


  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


Model saved to: ./onnx_quantized_scaled/model_quantized.onnx
Final Size: 64.45 MB
