## Libraries

In [1]:
import pandas as pd
import os
import joblib
import numpy as np

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

## Load Data

In [3]:
# === Load Data ===
# NOTE: everything below in this cell is commented out and never executes.
# It is an earlier loader that read three semicolon-separated text files
# (train/val/test) and concatenated train + val into one frame. The data
# actually used by this notebook is read from CSV files in the following cell.
# def load_data(file_path):
#     return pd.read_csv(file_path, sep=';', names=['text', 'label'])

# train = load_data('data/train.txt')
# val = load_data('data/val.txt')
# test = load_data('data/test.txt')

# trainval = pd.concat([train, val], ignore_index=True)

In [4]:
# Read the train and test splits. Column names are supplied explicitly, so
# pandas treats the first row of each file as data rather than as a header.
train, test = (
    pd.read_csv(csv_path, names=['text', 'label'])
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# === Label Encoding ===
# The encoder learns the label vocabulary from the training split only; the
# same fitted mapping is then applied to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train['label'])
train['label_enc'] = label_encoder.transform(train['label'])
test['label_enc'] = label_encoder.transform(test['label'])

## Define Models

In [5]:
# === Define Models ===
# Candidate classifiers as (display name, estimator) pairs; the dict keeps
# them in the order listed here.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    # ("Random Forest", RandomForestClassifier(n_estimators=50, random_state=42)),
    ("XGBoost", XGBClassifier(eval_metric='mlogloss', random_state=42)),
])

## Train and Evaluate

In [7]:
# Train every candidate model inside a TF-IDF pipeline, score it on the test
# split, and persist the fitted pipeline plus its confusion matrix under
# model/<name>/. A summary row per model is collected in `results`.
os.makedirs('model', exist_ok=True)

# Encoded ids of every class the encoder knows (LabelEncoder maps classes to
# 0..n_classes-1). Passing them explicitly keeps the classification report and
# the confusion matrix aligned with label_encoder.classes_ even when a class
# is missing from both the test labels and the predictions; without this the
# matrix would be smaller than the class list and the DataFrame below would
# fail with a shape mismatch.
class_ids = np.arange(len(label_encoder.classes_))

results = []

for name, clf in models.items():
    print(f"\n🔄 Training: {name}")
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', clf)
    ])

    # Fit on raw training text; the pipeline vectorizes before classifying.
    pipeline.fit(train['text'], train['label_enc'])
    y_pred = pipeline.predict(test['text'])
    y_true = test['label_enc']

    accuracy = accuracy_score(y_true, y_pred)
    # Support-weighted averages across classes; the fourth value (support) is
    # None when an average is requested, so it is discarded.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

    print(f"✅ {name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
    print("📊 Classification Report:")
    print(classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
    ))

    # Rows are true classes, columns are predicted classes, both in encoder order.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    print("🧩 Confusion Matrix:")
    print(conf_matrix)

    # ✅ Save model and confusion matrix
    model_dir = f'model/{name}'
    os.makedirs(model_dir, exist_ok=True)

    # Save model (the whole fitted pipeline, vectorizer included)
    model_path = os.path.join(model_dir, f'{name}_model.pkl')
    joblib.dump(pipeline, model_path)

    # Save confusion matrix with human-readable class names on both axes
    cm_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
    cm_df.to_csv(os.path.join(model_dir, 'confusion_matrix.csv'))

    results.append({
        "model": name,
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "path": model_path
    })



🔄 Training: Logistic Regression
✅ Logistic Regression - Accuracy: 0.8955, F1: 0.8948
📊 Classification Report:
              precision    recall  f1-score   support

       anger       0.89      0.90      0.90     11463
        fear       0.84      0.84      0.84      9542
         joy       0.91      0.93      0.92     28214
        love       0.80      0.75      0.78      6911
     sadness       0.94      0.94      0.94     24238
    surprise       0.77      0.69      0.73      2994

    accuracy                           0.90     83362
   macro avg       0.86      0.84      0.85     83362
weighted avg       0.89      0.90      0.89     83362

🧩 Confusion Matrix:
[[10299   357   209    39   548    11]
 [  390  8022   201    33   484   412]
 [  153   151 26299  1134   337   140]
 [   44    24  1545  5196    89    13]
 [  610   413   327    70 22763    55]
 [   17   577   236    14    76  2074]]

🔄 Training: XGBoost
✅ XGBoost - Accuracy: 0.8960, F1: 0.8975
📊 Classification Report:
    

In [8]:
# Persist the fitted label encoder so integer predictions can be mapped back
# to their original string labels later. Relies on the 'model' directory
# already existing.
joblib.dump(value=label_encoder, filename='model/label_encoder.pkl')

['model/label_encoder.pkl']

In [9]:
# Collect the per-model metric records into one table and write it to disk
# without the positional index column.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf='model/model_results.csv', index=False)