In [1]:
# Colab-only setup: mount Google Drive at /content/drive. The dataset and
# output paths configured later in this notebook all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create the outputs folder on Drive (-p: no error if it already exists).
# This must be the same folder the *_SAVE_PATH / CONFUSION_MATRIX_PATH constants point into.
!mkdir -p '/content/drive/MyDrive/Colab Notebooks/svm_classification/outputs'

In [3]:
"""
SVM Classification with Trigrams
=================================
Train an SVM classifier on your labeled samples using trigram features.

Difference from previous model:
- Previous: TF-IDF with unigrams + bigrams (1,2)
- This: TF-IDF with trigrams (3,3) or combined (1,3)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, f1_score, precision_score, recall_score)
import re
import pickle
import warnings
warnings.filterwarnings('ignore')

# Project folder on Google Drive. Update this ONE line to relocate the dataset
# and every generated artifact; all paths below are derived from it.
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/svm_classification'

# Folder that receives the pickled model/vectorizer and the confusion-matrix image.
# It is not created here; it must exist before anything is written to it.
OUTPUT_DIR = f'{BASE_DIR}/outputs'

# Labeled dataset (CSV) read by the training function.
DATA_PATH = f'{BASE_DIR}/manual_label_batch1_updated.csv'

# Artifacts written by training and read back by the prediction smoke test.
MODEL_SAVE_PATH = f'{OUTPUT_DIR}/svm_trigram_model.pkl'
VECTORIZER_SAVE_PATH = f'{OUTPUT_DIR}/svm_trigram_vectorizer.pkl'
CONFUSION_MATRIX_PATH = f'{OUTPUT_DIR}/svm_trigram_confusion_matrix.png'

# Local (non-Colab) alternative. NOTE: the dataset below is an .xlsx file, while
# the training function loads DATA_PATH with pd.read_csv, so switching to it
# also requires changing the loader.
# DATA_PATH = 'manual_label_batch1.xlsx'
# MODEL_SAVE_PATH = 'outputs/svm_trigram_model.pkl'
# VECTORIZER_SAVE_PATH = 'outputs/svm_trigram_vectorizer.pkl'
# CONFUSION_MATRIX_PATH = 'outputs/svm_trigram_confusion_matrix.png'


In [4]:
# N-gram configuration for the TF-IDF vectorizer:
# 'trigrams_only': (3, 3) - only 3-word sequences
# 'unigrams_to_trigrams': (1, 3) - words, 2-word, and 3-word sequences
# NOTE: train_svm_trigram() treats ANY value other than 'trigrams_only' as (1, 3),
# so a typo here silently selects the combined range.
NGRAM_TYPE = 'unigrams_to_trigrams'  # Change to 'trigrams_only' if you want only trigrams

# Data split sizes, expressed as fractions of the FULL dataset
# (the training function rescales VAL_SIZE after the test set is carved off).
VAL_SIZE = 0.15    # 15% for validation
TEST_SIZE = 0.15   # 15% for test
# This means: 70% train, 15% validation, 15% test
# Seed passed to train_test_split and SVC so that runs are repeatable.
RANDOM_STATE = 42

In [5]:
# preprocessing
def preprocess_text(text):
    """Normalize a raw text value before vectorization.

    Lowercases the input, removes URLs (any token starting with ``http`` or
    with a literal ``www.``) and collapses every run of whitespace into a
    single space.

    Args:
        text: Value to clean. It is coerced with ``str()``, so non-string
            values are converted to their string form first.

    Returns:
        str: The cleaned text with leading/trailing whitespace stripped.
    """
    text = str(text).lower()
    # The dot must be escaped: an unescaped "www." matches "www" followed by
    # ANY character, which deleted ordinary words such as "awwwesome".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def handle_multilabels(df):
    """Collapse comma-separated labels to their first entry and cast to int.

    The ``label`` column of ``df`` is modified in place: when any label
    contains a comma, every label is replaced by the text before its first
    comma; the column is then converted to ``int``.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        The same DataFrame object, with ``label`` as integers.
    """
    print("\nHandling multi-label samples...")

    labels_as_text = df['label'].astype(str)
    n_multi = labels_as_text.str.contains(',').sum()

    if n_multi > 0:
        print(f"  Found {n_multi} multi-label samples")
        # Keep only the part before the first comma of each label
        df['label'] = labels_as_text.map(lambda raw: raw.partition(',')[0])

    df['label'] = df['label'].astype(int)
    return df

In [10]:
# MAIN TRAINING FUNCTION
# ============================================================================

def train_svm_trigram():
    """Train, select, evaluate and persist an SVM text classifier on TF-IDF n-grams.

    Steps:
      1. Read the CSV at ``DATA_PATH`` (its ``text`` and ``label`` columns are used).
      2. Clean the text with ``preprocess_text`` and reduce labels with
         ``handle_multilabels``.
      3. Make a stratified train/validation/test split sized by ``VAL_SIZE``
         and ``TEST_SIZE`` (both fractions of the full dataset).
      4. Fit a ``TfidfVectorizer`` on the training split only and transform
         all three splits with it.
      5. Train one ``SVC`` per kernel (linear, rbf) and score each on the
         validation split.
      6. Keep the kernel with the higher weighted validation F1 (a tie goes
         to linear) and report its validation metrics.
      7. Score the chosen model on the test split and save both confusion
         matrices as one image at ``CONFUSION_MATRIX_PATH``.
      8. Pickle the chosen model and the vectorizer to ``MODEL_SAVE_PATH``
         and ``VECTORIZER_SAVE_PATH``.

    Progress and metrics are printed along the way.

    Returns:
        tuple: ``(best_model, vectorizer, results)``. ``results`` is a dict
        with one entry per kernel (``'linear'``, ``'rbf'``), each holding
        ``model``, ``val_accuracy``, ``val_f1_weighted``, ``val_f1_macro``
        and ``val_pred``, plus the top-level keys ``best_kernel``,
        ``test_accuracy``, ``test_f1_weighted``, ``test_f1_macro``,
        ``y_val``, ``y_val_pred``, ``y_test`` and ``y_test_pred``.
    """

    print("\n" + "="*80)
    print("SVM CLASSIFICATION WITH TRIGRAMS")
    print("="*80)

    # -------------------------------------------------------------------------
    # 1. LOAD DATA
    # -------------------------------------------------------------------------
    print("\n[1/8] Loading data...")
    df = pd.read_csv(DATA_PATH)
    print(f"  Loaded {len(df)} samples")
    print(f"  Columns: {df.columns.tolist()}")

    # -------------------------------------------------------------------------
    # 2. PREPROCESS
    # -------------------------------------------------------------------------
    print("\n[2/8] Preprocessing text...")
    df['text_processed'] = df['text'].apply(preprocess_text)
    df = handle_multilabels(df)

    print(f"\n  Label distribution:")
    print(df['label'].value_counts().sort_index())

    # -------------------------------------------------------------------------
    # 3. TRAIN-VALIDATION-TEST SPLIT
    # -------------------------------------------------------------------------
    print(f"\n[3/8] Splitting data (val: {VAL_SIZE}, test: {TEST_SIZE})...")
    X = df['text_processed'].values
    y = df['label'].values

    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Second split: separate validation from remaining.
    # VAL_SIZE is a fraction of the full dataset, so rescale it to the
    # fraction of what is left after the test set was removed.
    val_size_adjusted = VAL_SIZE / (1 - TEST_SIZE)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_size_adjusted,
        random_state=RANDOM_STATE, stratify=y_temp
    )

    print(f"  Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
    print(f"  Validation set: {len(X_val)} samples ({len(X_val)/len(X)*100:.1f}%)")
    print(f"  Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")

    # -------------------------------------------------------------------------
    # 4. FEATURE EXTRACTION WITH TRIGRAMS
    # -------------------------------------------------------------------------
    print(f"\n[4/8] Extracting features...")
    print(f"  N-gram type: {NGRAM_TYPE}")

    if NGRAM_TYPE == 'trigrams_only':
        ngram_range = (3, 3)
        print(f"  Using: trigrams only (3-word sequences)")
    else:  # unigrams_to_trigrams (also the fallback for any other value)
        ngram_range = (1, 3)
        print(f"  Using: unigrams + bigrams + trigrams")

    vectorizer = TfidfVectorizer(
        max_features=1000,
        min_df=2,
        max_df=0.8,
        ngram_range=ngram_range,
        stop_words='english'
    )

    # Fit on training data only so validation/test vocabulary cannot leak in
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    # Looked up once and reused for both the count and the trigram sample
    feature_names = vectorizer.get_feature_names_out()

    print(f"   Feature matrix shape (train): {X_train_tfidf.shape}")
    print(f"   Feature matrix shape (validation): {X_val_tfidf.shape}")
    print(f"   Feature matrix shape (test): {X_test_tfidf.shape}")
    print(f"   Number of features: {len(feature_names)}")

    # Show sample trigrams
    trigram_features = [f for f in feature_names if len(f.split()) == 3]
    if trigram_features:
        print(f"\n  Sample trigrams extracted:")
        for i, tg in enumerate(trigram_features[:5], 1):
            print(f"    {i}. '{tg}'")

    # -------------------------------------------------------------------------
    # 5. TRAIN SVM MODEL
    # -------------------------------------------------------------------------
    print(f"\n[5/8] Training SVM classifier...")

    results = {}

    # Same training/evaluation recipe for every candidate kernel
    for kernel in ('linear', 'rbf'):
        print(f"\n  Training SVM with {kernel} kernel...")
        svm = SVC(kernel=kernel, random_state=RANDOM_STATE,
                  class_weight='balanced', probability=True)
        svm.fit(X_train_tfidf, y_train)

        # Evaluate on validation set
        y_val_pred = svm.predict(X_val_tfidf)
        val_acc = accuracy_score(y_val, y_val_pred)
        val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')
        val_f1_macro = f1_score(y_val, y_val_pred, average='macro')

        results[kernel] = {
            'model': svm,
            'val_accuracy': val_acc,
            'val_f1_weighted': val_f1_weighted,
            'val_f1_macro': val_f1_macro,
            'val_pred': y_val_pred
        }

        print(f"    Validation Accuracy: {val_acc:.4f}")
        print(f"    Validation F1 (Weighted): {val_f1_weighted:.4f}")
        print(f"    Validation F1 (Macro): {val_f1_macro:.4f}")

    # Select best model based on validation F1 (linear wins a tie)
    if results['linear']['val_f1_weighted'] >= results['rbf']['val_f1_weighted']:
        best_kernel = 'linear'
    else:
        best_kernel = 'rbf'
    best_model = results[best_kernel]['model']
    best_val_pred = results[best_kernel]['val_pred']

    print(f"\n   Best kernel (based on validation): {best_kernel}")
    print(f"   Best validation F1 score: {results[best_kernel]['val_f1_weighted']:.4f}")

    # -------------------------------------------------------------------------
    # 6. EVALUATE ON VALIDATION SET
    # -------------------------------------------------------------------------
    print(f"\n[6/8] Validation Set Performance...")
    print("\n  Classification Report (Validation):")
    print(classification_report(y_val, best_val_pred))

    print("\n  Confusion Matrix (Validation):")
    cm_val = confusion_matrix(y_val, best_val_pred)
    print(cm_val)

    # -------------------------------------------------------------------------
    # 7. FINAL EVALUATION ON TEST SET
    # -------------------------------------------------------------------------
    print(f"\n[7/8] Test Set Performance (Final Evaluation)...")
    y_test_pred = best_model.predict(X_test_tfidf)

    test_acc = accuracy_score(y_test, y_test_pred)
    test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
    test_f1_macro = f1_score(y_test, y_test_pred, average='macro')

    print(f"\n  Test Accuracy: {test_acc:.4f}")
    print(f"  Test F1 (Weighted): {test_f1_weighted:.4f}")
    print(f"  Test F1 (Macro): {test_f1_macro:.4f}")

    print("\n  Classification Report (Test):")
    print(classification_report(y_test, y_test_pred))

    print("\n  Confusion Matrix (Test):")
    cm_test = confusion_matrix(y_test, y_test_pred)
    print(cm_test)

    # Plot confusion matrices side by side
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    sns.heatmap(cm_val, annot=True, fmt='d', cmap='Blues', ax=axes[0])
    axes[0].set_title('Validation Set Confusion Matrix', fontweight='bold')
    axes[0].set_ylabel('True Label')
    axes[0].set_xlabel('Predicted Label')

    sns.heatmap(cm_test, annot=True, fmt='d', cmap='Greens', ax=axes[1])
    axes[1].set_title('Test Set Confusion Matrix (Final)', fontweight='bold')
    axes[1].set_ylabel('True Label')
    axes[1].set_xlabel('Predicted Label')

    fig.tight_layout()
    fig.savefig(CONFUSION_MATRIX_PATH, dpi=300, bbox_inches='tight')
    # Close this specific figure so it is neither displayed inline nor kept in memory
    plt.close(fig)

    print(f"\n   Confusion matrices saved: {CONFUSION_MATRIX_PATH}")

    # -------------------------------------------------------------------------
    # 8. SAVE MODEL
    # -------------------------------------------------------------------------
    print(f"\n[8/8] Saving model...")
    with open(MODEL_SAVE_PATH, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"   Model saved: {MODEL_SAVE_PATH}")

    with open(VECTORIZER_SAVE_PATH, 'wb') as f:
        pickle.dump(vectorizer, f)
    print(f"  Vectorizer saved: {VECTORIZER_SAVE_PATH}")

    # -------------------------------------------------------------------------
    # SUMMARY
    # -------------------------------------------------------------------------
    print("\n" + "="*80)
    print("TRAINING COMPLETE!")
    print("="*80)

    print("\n Results Summary:")
    print(f"  Model: SVM ({best_kernel} kernel)")
    print(f"  Features: {NGRAM_TYPE}")
    print(f"  Data split: {len(X_train)} train / {len(X_val)} val / {len(X_test)} test")
    print(f"\n  Validation Performance:")
    print(f"    Accuracy: {results[best_kernel]['val_accuracy']*100:.2f}%")
    print(f"    F1 (Weighted): {results[best_kernel]['val_f1_weighted']:.4f}")
    print(f"    F1 (Macro): {results[best_kernel]['val_f1_macro']:.4f}")
    print(f"\n  Test Performance (Final):")
    print(f"    Accuracy: {test_acc*100:.2f}%")
    print(f"    F1 (Weighted): {test_f1_weighted:.4f}")
    print(f"    F1 (Macro): {test_f1_macro:.4f}")

    print("\n Files Created:")
    print(f"  1. {MODEL_SAVE_PATH}")
    print(f"  2. {VECTORIZER_SAVE_PATH}")
    print(f"  3. {CONFUSION_MATRIX_PATH}")

    # Store results for return (added alongside the per-kernel entries)
    results['best_kernel'] = best_kernel
    results['test_accuracy'] = test_acc
    results['test_f1_weighted'] = test_f1_weighted
    results['test_f1_macro'] = test_f1_macro
    results['y_val'] = y_val
    results['y_val_pred'] = best_val_pred
    results['y_test'] = y_test
    results['y_test_pred'] = y_test_pred

    return best_model, vectorizer, results


In [11]:
#Prediction Function
def test_predictions():
    """Smoke-test the saved model by printing predictions for a few sample texts.

    Loads the pickled classifier from ``MODEL_SAVE_PATH`` and the pickled
    vectorizer from ``VECTORIZER_SAVE_PATH``, cleans each hard-coded sample
    with ``preprocess_text``, and prints the predicted label together with
    the model's probability for that same label.

    Returns:
        None. Results are printed only.
    """

    print("\n" + "="*80)
    print("TESTING PREDICTIONS")
    print("="*80)

    # Load model
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file. Only load artifacts that this notebook wrote itself.
    with open(MODEL_SAVE_PATH, 'rb') as f:
        model = pickle.load(f)
    with open(VECTORIZER_SAVE_PATH, 'rb') as f:
        vectorizer = pickle.load(f)

    # Sample texts
    sample_texts = [
        "I'm experiencing severe burnout in my cybersecurity role",
        "Looking for career advice in IT security",
        "Just got promoted to senior security analyst",
        "Dealing with stress and anxiety at work"
    ]

    # Preprocess, vectorize and score all samples in one batch
    features = vectorizer.transform([preprocess_text(text) for text in sample_texts])
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)

    # predict_proba columns are ordered like model.classes_. Map each label to
    # its column so the reported confidence belongs to the predicted label:
    # the largest probability may fall on a different class than predict()
    # returns when probabilities come from SVC(probability=True).
    class_column = {label: col for col, label in enumerate(model.classes_)}

    print("\nSample Predictions:")
    print("-" * 80)

    for i, (text, prediction) in enumerate(zip(sample_texts, predictions), 1):
        confidence = probabilities[i - 1, class_column[prediction]]

        print(f"\n{i}. Text: {text[:60]}...")
        print(f"   Predicted Label: {prediction}")
        print(f"   Confidence: {confidence:.2%}")

In [12]:
if __name__ == "__main__":

    # Run the full pipeline; the returned objects stay available in the
    # notebook namespace for later inspection.
    model, vectorizer, results = train_svm_trigram()

    # Spacer, then reload the saved artifacts and print sample predictions
    print("\n")
    test_predictions()

    # Optional hyperparameter tuning (disabled).
    # NOTE: tune_svm_hyperparameters is not defined in the cells shown in this
    # notebook and must be provided before enabling this. DATA_PATH points at
    # a CSV file, so the data is loaded with pd.read_csv here.
    # print("\n")
    # print("Would you like to run hyperparameter tuning? (takes 5-10 minutes)")
    # response = input("Run tuning? (y/n): ")
    # if response.lower() == 'y':
    #     # Load data again
    #     df = pd.read_csv(DATA_PATH)
    #     df['text_processed'] = df['text'].apply(preprocess_text)
    #     df = handle_multilabels(df)
    #     X = df['text_processed'].values
    #     y = df['label'].values
    #     X_train, X_test, y_train, y_test = train_test_split(
    #         X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    #     )
    #     # Vectorize
    #     vectorizer = TfidfVectorizer(
    #         max_features=1000,
    #         min_df=2,
    #         max_df=0.8,
    #         ngram_range=(1, 3),
    #         stop_words='english'
    #     )
    #     X_train_tfidf = vectorizer.fit_transform(X_train)
    #     # Tune
    #     tuned_model = tune_svm_hyperparameters(X_train_tfidf, y_train)

    # Closing banner: blank line, rule, message, rule
    print("\n" + "=" * 80, "ALL DONE! ", "=" * 80, sep="\n")


SVM CLASSIFICATION WITH TRIGRAMS

[1/7] Loading data...
  Loaded 188 samples
  Columns: ['id', 'text', 'similarity_score', 'label']

[2/7] Preprocessing text...

Handling multi-label samples...

  Label distribution:
label
0    139
1     49
Name: count, dtype: int64

[3/7] Splitting data (test size: 0.2)...
  Training set: 150 samples
  Test set: 38 samples

[4/7] Extracting features...
  N-gram type: unigrams_to_trigrams
  Using: unigrams + bigrams + trigrams
   Feature matrix shape (train): (150, 1000)
   Feature matrix shape (test): (38, 1000)
   Number of features: 1000

  Sample trigrams extracted:
    1. 'feel like im'
    2. 'just don know'
    3. 'learning new things'
    4. 'like cybersecurity job'
    5. 'looks like cybersecurity'

[5/7] Training SVM classifier...

  Training SVM with linear kernel...
    Accuracy: 0.8947
    F1 (Weighted): 0.8976
    F1 (Macro): 0.8721

  Training SVM with rbf kernel...
    Accuracy: 0.8684
    F1 (Weighted): 0.8519
    F1 (Macro): 0.7923

