In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import pickle
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from scipy.sparse import hstack, save_npz, load_npz, csr_matrix
from sklearn.svm import LinearSVC
print("Libraries loaded successfully!")

Libraries loaded successfully!


In [2]:
# Preprocessing: Load data, clean text, create splits
def clean_text(s):
    s = (s or "").lower()
    s = re.sub(r'http\\S+|<[^>]+>', ' ', s)
    s = re.sub(r"[^a-z0-9\\s']", ' ', s)
    return re.sub(r"\\s+", ' ', s).strip()


p = Path('test.ft.txt')
if p.exists():
    text = p.read_text(encoding='utf-8', errors='replace')
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    rows = []
    for l in lines:
        if ' ' in l:
            label_token, review = l.split(' ', 1)
        else:
            label_token, review = l, ''
        m = re.match(r"__label__([0-9]+)", label_token)
        label = int(m.group(1)) if m else label_token
        rows.append({'label': label, 'text': review})
    df = pd.DataFrame(rows)
    df['text_clean'] = df['text'].apply(clean_text)
    df['length'] = df['text_clean'].str.len()
    df['tokens'] = df['text_clean'].str.split()
    
  
    from sklearn.model_selection import train_test_split
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
    Path('splits').mkdir(exist_ok=True)
    train_df.to_csv('splits/train.csv', index=False)
    val_df.to_csv('splits/validation.csv', index=False)
    print(f"Preprocessing complete. Train: {train_df.shape}, Val: {val_df.shape}")
else:
    print("test.ft.txt not found. Please place the file in the notebook directory.")

Preprocessing complete. Train: (320000, 5), Val: (80000, 5)


In [3]:
# Feature Engineering: TF-IDF + Lexical features
feature_dir = Path('features')
feature_dir.mkdir(exist_ok=True)


def extract_lexical_features(df):
    features = pd.DataFrame()
    features['char_count'] = df['text_clean'].str.len()
    features['word_count'] = df['text_clean'].str.split().str.len()
    features['avg_word_length'] = features['char_count'] / (features['word_count'] + 1)
    features['exclamation_count'] = df['text_clean'].str.count('!')
    features['question_count'] = df['text_clean'].str.count('\\?')
    return features

train_lexical = extract_lexical_features(train_df)
val_lexical = extract_lexical_features(val_df)


vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.95, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(train_df['text_clean'])
X_val_tfidf = vectorizer.transform(val_df['text_clean'])


train_lexical_sparse = csr_matrix(train_lexical.values)
val_lexical_sparse = csr_matrix(val_lexical.values)
X_train = hstack([X_train_tfidf, train_lexical_sparse])
X_val = hstack([X_val_tfidf, val_lexical_sparse])


save_npz(feature_dir / 'tfidf_lex_combined_train.npz', X_train)
save_npz(feature_dir / 'tfidf_lex_combined_val.npz', X_val)
np.save(feature_dir / 'train_labels.npy', train_df['label'].values)
np.save(feature_dir / 'val_labels.npy', val_df['label'].values)
with open(feature_dir / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print(f"Features saved. Train shape: {X_train.shape}, Val shape: {X_val.shape}")

Features saved. Train shape: (320000, 1222610), Val shape: (80000, 1222610)


In [4]:
# Model 1: Decision Tree
# Toggle tuning: set to True to run GridSearchCV, False to train default estimator
RUN_DT_TUNING = False

if RUN_DT_TUNING:
    dt_params = {
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10]
    }

    dt_grid = GridSearchCV(
        DecisionTreeClassifier(criterion="gini", random_state=42),
        dt_params,
        cv=3, 
        scoring="accuracy"
    )

    dt_grid.fit(X_train, train_df['label'])
    print("Best Params:", dt_grid.best_params_)
    best_dt = dt_grid.best_estimator_
else:
    best_dt = DecisionTreeClassifier(criterion="gini", random_state=42)
    best_dt.fit(X_train, train_df['label'])

# Evaluation
y_pred_dt = best_dt.predict(X_val)
print("Decision Tree Results:")
print(f"Accuracy: {accuracy_score(val_df['label'], y_pred_dt):.4f}")
print(classification_report(val_df['label'], y_pred_dt))

Decision Tree Results:
Accuracy: 0.7706
              precision    recall  f1-score   support

           1       0.77      0.77      0.77     40000
           2       0.77      0.78      0.77     40000

    accuracy                           0.77     80000
   macro avg       0.77      0.77      0.77     80000
weighted avg       0.77      0.77      0.77     80000



In [5]:
# Model 2: Logistic Regression
# Toggle tuning: set to True to run GridSearchCV, False to train default estimator
RUN_LR_TUNING = False

if RUN_LR_TUNING:
    lr_params = {
        "C": [0.01, 0.1, 1, 10]
    }

    lr_grid = GridSearchCV(
        LogisticRegression(max_iter=1000),
        lr_params,
        cv=3,  
        scoring="accuracy"
    )

    lr_grid.fit(X_train, train_df['label'])
    print("Best Params:", lr_grid.best_params_)
    best_lr = lr_grid.best_estimator_
else:
    best_lr = LogisticRegression(max_iter=1000)
    best_lr.fit(X_train, train_df['label'])

# Evaluation
y_pred_lr = best_lr.predict(X_val)
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(val_df['label'], y_pred_lr):.4f}")
print(classification_report(val_df['label'], y_pred_lr))

Logistic Regression Results:
Accuracy: 0.8960
              precision    recall  f1-score   support

           1       0.90      0.89      0.90     40000
           2       0.89      0.90      0.90     40000

    accuracy                           0.90     80000
   macro avg       0.90      0.90      0.90     80000
weighted avg       0.90      0.90      0.90     80000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Model 3: SVM
# Toggle tuning: set to True to run GridSearchCV, False to train a fast linear SVM
RUN_SVM_TUNING = False

if RUN_SVM_TUNING:
    svm_params = {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"]
    }

    svm_grid = GridSearchCV(
        SVC(),
        svm_params,
        cv=3,  
        scoring="accuracy"
    )

    svm_grid.fit(X_train, train_df['label'])
    print("Best Params:", svm_grid.best_params_)
    best_svm = svm_grid.best_estimator_
else:
    from sklearn.svm import LinearSVC
    best_svm = LinearSVC(max_iter=2000)
    best_svm.fit(X_train, train_df['label'])

# Evaluation
y_pred_svm = best_svm.predict(X_val)
print("SVM Results:")
print(f"Accuracy: {accuracy_score(val_df['label'], y_pred_svm):.4f}")
print(classification_report(val_df['label'], y_pred_svm))

SVM Results:
Accuracy: 0.8640
              precision    recall  f1-score   support

           1       0.92      0.80      0.85     40000
           2       0.82      0.93      0.87     40000

    accuracy                           0.86     80000
   macro avg       0.87      0.86      0.86     80000
weighted avg       0.87      0.86      0.86     80000





In [7]:
# Model 4: AdaBoost
# Toggle tuning: set to True to run GridSearchCV, False to train default estimator
RUN_ADA_TUNING = False

if RUN_ADA_TUNING:
    ada_params = {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 1]
    }

    ada_grid = GridSearchCV(
        AdaBoostClassifier(random_state=42),
        ada_params,
        cv=3,  
        scoring="accuracy"
    )

    ada_grid.fit(X_train, train_df['label'])
    print("Best Params:", ada_grid.best_params_)
    best_ada = ada_grid.best_estimator_
else:
    best_ada = AdaBoostClassifier(random_state=42)
    best_ada.fit(X_train, train_df['label'])

# Evaluation
y_pred_ada = best_ada.predict(X_val)
print("AdaBoost Results:")
print(f"Accuracy: {accuracy_score(val_df['label'], y_pred_ada):.4f}")
print(classification_report(val_df['label'], y_pred_ada))

AdaBoost Results:
Accuracy: 0.7279
              precision    recall  f1-score   support

           1       0.69      0.83      0.75     40000
           2       0.79      0.63      0.70     40000

    accuracy                           0.73     80000
   macro avg       0.74      0.73      0.73     80000
weighted avg       0.74      0.73      0.73     80000



In [8]:
# Prediction Pipeline: Predict sentiment on new text using the trained models

new_texts = [
    "This product is amazing! I love it so much.",
    "Terrible quality, I regret buying this.",
    "It's okay, nothing special."
]


new_df = pd.DataFrame({'text': new_texts})
new_df['text_clean'] = new_df['text'].apply(clean_text)
new_df['length'] = new_df['text_clean'].str.len()
new_df['tokens'] = new_df['text_clean'].str.split()


new_lexical = extract_lexical_features(new_df)
new_tfidf = vectorizer.transform(new_df['text_clean'])
new_lexical_sparse = csr_matrix(new_lexical.values)
X_new = hstack([new_tfidf, new_lexical_sparse])


models = {
    "Decision Tree": best_dt,
    "Logistic Regression": best_lr,
    "SVM": best_svm,
    "AdaBoost": best_ada
}

for name, model in models.items():
    predictions = model.predict(X_new)
    print(f"\n{name} Predictions:")
    for text, pred in zip(new_texts, predictions):
        sentiment = "Positive" if pred == 2 else "Negative"  
        print(f"  '{text}' -> {sentiment} (label: {pred})")


Decision Tree Predictions:
  'This product is amazing! I love it so much.' -> Positive (label: 2)
  'Terrible quality, I regret buying this.' -> Negative (label: 1)
  'It's okay, nothing special.' -> Negative (label: 1)

Logistic Regression Predictions:
  'This product is amazing! I love it so much.' -> Positive (label: 2)
  'Terrible quality, I regret buying this.' -> Negative (label: 1)
  'It's okay, nothing special.' -> Negative (label: 1)

SVM Predictions:
  'This product is amazing! I love it so much.' -> Positive (label: 2)
  'Terrible quality, I regret buying this.' -> Negative (label: 1)
  'It's okay, nothing special.' -> Negative (label: 1)

AdaBoost Predictions:
  'This product is amazing! I love it so much.' -> Positive (label: 2)
  'Terrible quality, I regret buying this.' -> Negative (label: 1)
  'It's okay, nothing special.' -> Negative (label: 1)
