In [None]:
import numpy as np
import os
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score

def load_split_data(split_dir):
    """Load a labelled text split stored as one file per document.

    ``split_dir`` must contain two sub-directories, ``pos`` and ``neg``.
    Every regular file inside them is read as UTF-8 text; files under
    ``pos`` get label 1 and files under ``neg`` get label 0.

    Args:
        split_dir: Path of the split directory (e.g. ``'data/train'``).

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists. All ``pos``
        documents come first, then all ``neg`` documents; within each class
        the documents are ordered by file name.

    Raises:
        FileNotFoundError: If ``split_dir/pos`` or ``split_dir/neg`` is missing.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []

    for label_name, label_val in [('pos', 1), ('neg', 0)]:
        dir_path = os.path.join(split_dir, label_name)
        # os.listdir() yields entries in arbitrary, platform-dependent order;
        # sorting makes the document order reproducible across runs.
        for filename in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, filename)
            # Ignore anything that is not a regular file (for example a
            # '.ipynb_checkpoints' directory), which open() cannot read.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label_val)

    return texts, labels

# Read the three corpus splits in one pass: training, validation, and the
# held-out set used for the final report.
# NOTE(review): the test split is loaded from 'data/dev' - confirm that this
# is the intended held-out directory.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_split_data(split_path)
    for split_path in ('data/train', 'data/val', 'data/dev')
)

# Candidate values for the LinearSVC regularisation parameter C:
# 0.32, 0.34, ..., 0.50 (step 0.02). Built from integers so that every entry
# is exactly the float written as the two-decimal literal.
# Previously listed grids, kept for reference:
#   [0.01, 0.1, 1, 10, 100]
#   [0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
C_values = [hundredths / 100 for hundredths in range(32, 52, 2)]

# Running record of the best validation result so far: the winning C, its
# macro-F1, and the fitted pipeline that produced it. The -1 start value is
# below any valid F1 score, so the first scored candidate replaces it.
best_C, best_f1, best_model = None, -1, None

# Sweep the grid: fit on the training split only, score on the validation split.
for C in C_values:
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('svm', LinearSVC(C=C, dual=False)),
    ])
    pipeline.fit(X_train, y_train)

    y_val_pred = pipeline.predict(X_val)
    f1 = f1_score(y_val, y_val_pred, average='macro')  # or 'weighted'
    print(f"C={C}, F1 on val: {f1}")

    # Strict '>' means the earliest C wins when two candidates tie.
    if f1 > best_f1:
        best_C, best_f1, best_model = C, f1, pipeline

print("\n")
print("Best C based on validation set:", best_C)

# Refit from scratch with the selected C on the training and validation
# splits combined (training documents first, then validation documents).
X_full_train = [*X_train, *X_val]
y_full_train = [*y_train, *y_val]

# Same TF-IDF + linear SVM architecture as in the sweep; fit() returns the
# pipeline itself, so the fitted model is bound directly.
final_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC(C=best_C, dual=False)),
]).fit(X_full_train, y_full_train)

# Score the refitted model once on the held-out split and print the
# per-class precision / recall / F1 report.
y_test_pred = final_pipeline.predict(X_test)
print("\n")
print("Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred))

C=0.32, F1 on val: 0.9139944956477215
C=0.34, F1 on val: 0.9139951622278754
C=0.36, F1 on val: 0.9157472143922759
C=0.38, F1 on val: 0.9164979124478112
C=0.4, F1 on val: 0.9157480990664852
C=0.42, F1 on val: 0.9152484691754744
C=0.44, F1 on val: 0.9152484691754744
C=0.46, F1 on val: 0.9154978874471861
C=0.48, F1 on val: 0.9152476640137392
C=0.5, F1 on val: 0.9154978874471861

✅ Best C based on validation set: 0.38

📊 Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92      2000
           1       0.92      0.91      0.91      2000

    accuracy                           0.92      4000
   macro avg       0.92      0.92      0.91      4000
weighted avg       0.92      0.92      0.91      4000

