In [1]:
"""
 Machine Learning Solution for Sentiment Analysis

This version includes TWO feature setups:
    - Setup A: Word-level TF-IDF (unigrams + bigrams)
    - Setup B: Character-level TF-IDF (3–5 character n-grams)

We compare their performance using the same train/validation/test splits.
"""

import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import joblib


# ----------------------------------------------------------------------
# 1. CONFIGURATION
# ----------------------------------------------------------------------

DATA_ROOT = Path(r"C:\Users\DELL\Documents\NLP\2017_English_final\GOLD\Subtask_A")

# Use the files that actually contain TEXT
TRAIN_FILES = [
    "twitter-2013train-A.txt",
]

# For simplicity, we also evaluate on the SMS set as "test"
TEST_FILE = "twitter-2013test-A.txt"

DEFAULT_SEP = "\t"

LABEL_NORMALISATION = {
    "positive": "positive",
    "negative": "negative",
    "neutral": "neutral",
    "objective-or-neutral": "neutral",   # merged into neutral
}


# ----------------------------------------------------------------------
# 2. DATA LOADING & PREPROCESSING
# ----------------------------------------------------------------------

def load_single_file(path: Path, sep: str = "\t") -> pd.DataFrame:
    """
    Load one TSV/CSV file and return a DataFrame with three columns:
        ID, Sentiment, Text

    Handles:
    - 3 columns: [id, label, text]
    - 4+ columns: [id, something, label, text, ...]
    - 2 columns (id + label) are ignored (no text -> unusable for ML)
    """
    if not path.exists():
        print(f"[WARN] File not found: {path}. Skipping.")
        return pd.DataFrame(columns=["ID", "Sentiment", "Text"])

    df_raw = pd.read_csv(
        path,
        sep=sep,
        header=None,
        encoding="utf-8",
        quoting=3,   # QUOTE_NONE
        engine="python"
    )

    n_cols = df_raw.shape[1]

    if n_cols == 2:
        print(f"[INFO] {path.name}: appears to be only ID + label (no text). Skipping.")
        return pd.DataFrame(columns=["ID", "Sentiment", "Text"])

    elif n_cols == 3:
        # [id, label, text]
        df = df_raw.rename(columns={0: "ID", 1: "Sentiment", 2: "Text"})

    elif n_cols >= 4:
        # [id, something, label, text, maybe extra...]
        df = df_raw.rename(columns={0: "ID", 2: "Sentiment", 3: "Text"})

    else:
        print(f"[WARN] {path.name}: unexpected number of columns = {n_cols}. Skipping.")
        return pd.DataFrame(columns=["ID", "Sentiment", "Text"])

    df = df[["ID", "Sentiment", "Text"]]
    return df


def load_data(file_names, data_root: Path = DATA_ROOT, sep: str = DEFAULT_SEP) -> pd.DataFrame:
    """
    Load and concatenate multiple files into a single DataFrame.
    """
    data_frames = []

    for fname in file_names:
        path = data_root / fname
        df = load_single_file(path, sep=sep)
        if df.empty:
            continue
        data_frames.append(df)

    if not data_frames:
        raise ValueError("No usable data loaded. Check file names and formats.")

    df_all = pd.concat(data_frames, ignore_index=True)

    df_all["Text"] = df_all["Text"].fillna("")

    df_all["Sentiment"] = df_all["Sentiment"].astype(str).str.strip().str.lower()
    df_all["Sentiment"] = df_all["Sentiment"].map(LABEL_NORMALISATION)

    before = len(df_all)
    df_all = df_all.dropna(subset=["Sentiment"])
    after = len(df_all)
    if after < before:
        print(f"[INFO] Dropped {before - after} rows with unknown labels.")

    return df_all


def preprocess_text(text: str) -> str:
    """
    Simple preprocessing:
        - Replace URLs with 'url'
        - Replace @mentions with 'at_user'
        - Lowercase
        - Collapse whitespace
    """
    text = str(text)
    text = re.sub(r"http\S+|www\S+|https\S+", "url", text, flags=re.MULTILINE)
    text = re.sub(r"@\w+", "at_user", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ----------------------------------------------------------------------
# 3. FEATURE EXTRACTION (WORD & CHAR)
# ----------------------------------------------------------------------

def build_word_vectorizer():
    """
    Word-level TF-IDF (unigrams + bigrams).
    """
    return TfidfVectorizer(
        lowercase=False,        # already lowercased
        ngram_range=(1, 2),
        max_features=20000,
        min_df=2,
        stop_words="english"
    )


def build_char_vectorizer():
    """
    Character-level TF-IDF (3–5 character n-grams).
    """
    return TfidfVectorizer(
        analyzer="char",
        ngram_range=(3, 5),
        max_features=30000,
        min_df=2
    )


# ----------------------------------------------------------------------
# 4. BASELINES & MODEL TRAINING
# ----------------------------------------------------------------------

def majority_class_baseline(y_train, y_true, name="Majority class baseline"):
    """
    Always predict the majority class from y_train.
    """
    values, counts = np.unique(y_train, return_counts=True)
    majority_label = values[np.argmax(counts)]

    y_pred = np.full_like(y_true, fill_value=majority_label, dtype=object)

    print(f"\n=== {name} ===")
    print(f"Majority label: {majority_label}")
    print(classification_report(y_true, y_pred, digits=3))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Macro F1:", f1_score(y_true, y_pred, average="macro"))


def train_and_evaluate_classifier(
    name: str,
    clf,
    X_train,
    y_train,
    X_val,
    y_val
):
    """
    Fit classifier on training data and evaluate on validation set.
    """
    print(f"\n=== {name} ===")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    print(classification_report(y_val, y_pred, digits=3))
    acc = accuracy_score(y_val, y_pred)
    macro_f1 = f1_score(y_val, y_pred, average="macro")

    print(f"Accuracy: {acc:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")

    return acc, macro_f1, clf


# ----------------------------------------------------------------------
# 5. MAIN EXECUTION
# ----------------------------------------------------------------------

if __name__ == "__main__":

    print("--- 7120CEM CW1: Sentiment Analysis (Two Feature Setups) ---")

    # 5.1 Load training and test data
    print("\n[STEP] Loading training data...")
    train_df = load_data(TRAIN_FILES, data_root=DATA_ROOT, sep=DEFAULT_SEP)

    print("[STEP] Loading test data...")
    test_df = load_data([TEST_FILE], data_root=DATA_ROOT, sep=DEFAULT_SEP)

    # 5.2 Preprocess text
    print("[STEP] Preprocessing text...")
    train_df["Clean_Text"] = train_df["Text"].apply(preprocess_text)
    test_df["Clean_Text"] = test_df["Text"].apply(preprocess_text)

    X_all = train_df["Clean_Text"]
    y_all = train_df["Sentiment"]

    X_test_raw = test_df["Clean_Text"]
    y_test = test_df["Sentiment"]

    # 5.3 Train/validation split (same as the run that worked)
    print("[STEP] Creating train/validation split...")
    X_train, X_val, y_train, y_val = train_test_split(
        X_all,
        y_all,
        test_size=0.2,
        random_state=42,
        stratify=y_all,
    )

    print(f"Training size:   {len(X_train)}")
    print(f"Validation size: {len(X_val)}")
    print(f"Test size:       {len(X_test_raw)}")

    print("\nSample preprocessed texts:")
    print(X_train.head(5))

    # ------------------------------------------------------------------
    # EXPERIMENT A: WORD TF-IDF
    # ------------------------------------------------------------------
    print("\n[STEP] Building WORD TF-IDF features...")
    word_vectorizer = build_word_vectorizer()

    X_train_word = word_vectorizer.fit_transform(X_train)
    X_val_word = word_vectorizer.transform(X_val)
    X_test_word = word_vectorizer.transform(X_test_raw)

    print("Word feature space dimension:", X_train_word.shape[1])

    # Baseline on labels (same for all feature sets)
    majority_class_baseline(y_train, y_val, name="Majority baseline on validation set")

    # Logistic Regression (word)
    logreg_word = LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    )

    acc_log_w, f1_log_w, _ = train_and_evaluate_classifier(
        "Logistic Regression (WORD TF-IDF, balanced)",
        logreg_word,
        X_train_word,
        y_train,
        X_val_word,
        y_val
    )

    # Linear SVM (word)
    svm_word = LinearSVC(
        class_weight="balanced",
        C=1.0
    )

    acc_svm_w, f1_svm_w, svm_word = train_and_evaluate_classifier(
        "Linear SVM (WORD TF-IDF, balanced)",
        svm_word,
        X_train_word,
        y_train,
        X_val_word,
        y_val
    )

    # ------------------------------------------------------------------
    # EXPERIMENT B: CHAR TF-IDF
    # ------------------------------------------------------------------
    print("\n[STEP] Building CHAR TF-IDF features...")
    char_vectorizer = build_char_vectorizer()

    X_train_char = char_vectorizer.fit_transform(X_train)
    X_val_char = char_vectorizer.transform(X_val)
    X_test_char = char_vectorizer.transform(X_test_raw)

    print("Char feature space dimension:", X_train_char.shape[1])

    # Linear SVM (char)
    svm_char = LinearSVC(
        class_weight="balanced",
        C=1.0
    )

    acc_svm_c, f1_svm_c, svm_char = train_and_evaluate_classifier(
        "Linear SVM (CHAR TF-IDF, balanced)",
        svm_char,
        X_train_char,
        y_train,
        X_val_char,
        y_val
    )

    # ------------------------------------------------------------------
    # COMPARE FEATURE SETUPS
    # ------------------------------------------------------------------
    print("\n========== SUMMARY OF FEATURE SETS (Validation) ==========")
    print(f"WORD TF-IDF  (SVM):  Acc={acc_svm_w:.4f}, Macro F1={f1_svm_w:.4f}")
    print(f"CHAR TF-IDF  (SVM):  Acc={acc_svm_c:.4f}, Macro F1={f1_svm_c:.4f}")

    # Choose best feature setup based on validation macro F1
    if f1_svm_c > f1_svm_w:
        best_feature_type = "char"
        best_vectorizer = char_vectorizer
        best_model = svm_char
        X_test_best = X_test_char
        print("\n[INFO] Best feature setup on validation: CHAR TF-IDF")
    else:
        best_feature_type = "word"
        best_vectorizer = word_vectorizer
        best_model = svm_word
        X_test_best = X_test_word
        print("\n[INFO] Best feature setup on validation: WORD TF-IDF")

    # ------------------------------------------------------------------
    # FINAL TEST EVALUATION USING BEST FEATURE SETUP
    # ------------------------------------------------------------------
    print("\n[STEP] Training best model on FULL training data and evaluating on test set...")

    # Refit vectorizer on all training data (X_all)
    if best_feature_type == "word":
        best_vectorizer = build_word_vectorizer()
        X_full = best_vectorizer.fit_transform(X_all)
        X_test_full = best_vectorizer.transform(X_test_raw)
    else:
        best_vectorizer = build_char_vectorizer()
        X_full = best_vectorizer.fit_transform(X_all)
        X_test_full = best_vectorizer.transform(X_test_raw)

    best_model = LinearSVC(class_weight="balanced", C=1.0)
    best_model.fit(X_full, y_all)
    y_test_pred = best_model.predict(X_test_full)

    print("\n=== Final Test Set Performance (Best Feature Setup: "
          f"{best_feature_type.upper()} TF-IDF) ===")
    print(classification_report(y_test, y_test_pred, digits=3))
    print("Test accuracy:", accuracy_score(y_test, y_test_pred))
    print("Test macro F1:", f1_score(y_test, y_test_pred, average="macro"))

    # ------------------------------------------------------------------
    # SAVE BEST MODEL + VECTORIZER
    # ------------------------------------------------------------------
    print("\n[STEP] Saving best model and vectorizer...")
    joblib.dump(best_model, f"cw1_best_svm_{best_feature_type}_tfidf.joblib")
    joblib.dump(best_vectorizer, f"cw1_best_vectorizer_{best_feature_type}_tfidf.joblib")
    print(f"Saved as 'cw1_best_svm_{best_feature_type}_tfidf.joblib' and "
          f"'cw1_best_vectorizer_{best_feature_type}_tfidf.joblib'.")


--- 7120CEM CW1: Sentiment Analysis (Two Feature Setups) ---

[STEP] Loading training data...
[STEP] Loading test data...
[STEP] Preprocessing text...
[STEP] Creating train/validation split...
Training size:   7747
Validation size: 1937
Test size:       3547

Sample preprocessed texts:
1811    at_user my birthday on february. my friends th...
4099    who\u2019s next then? the togo team of emmanue...
1353    hope kevin frandsen is taking 3rd base for the...
6229    rt at_user: it's kerry's birthday tonight on b...
7322    "rt at_user: red sox, phillies flying high as ...
Name: Clean_Text, dtype: object

[STEP] Building WORD TF-IDF features...
Word feature space dimension: 13906

=== Majority baseline on validation set ===
Majority label: neutral
              precision    recall  f1-score   support

    negative      0.000     0.000     0.000       292
     neutral      0.473     1.000     0.643       917
    positive      0.000     0.000     0.000       728

    accuracy               

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative      0.450     0.514     0.480       292
     neutral      0.683     0.697     0.690       917
    positive      0.694     0.637     0.664       728

    accuracy                          0.647      1937
   macro avg      0.609     0.616     0.611      1937
weighted avg      0.652     0.647     0.649      1937

Accuracy: 0.6469
Macro F1: 0.6114

=== Linear SVM (WORD TF-IDF, balanced) ===
              precision    recall  f1-score   support

    negative      0.502     0.397     0.444       292
     neutral      0.659     0.721     0.689       917
    positive      0.669     0.646     0.657       728

    accuracy                          0.644      1937
   macro avg      0.610     0.588     0.596      1937
weighted avg      0.639     0.644     0.640      1937

Accuracy: 0.6438
Macro F1: 0.5963

[STEP] Building CHAR TF-IDF features...
Char feature space dimension: 30000

=== Linear SVM (CHAR TF-IDF, balanced) ===
     