In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the dataset
# and output paths used by this notebook (all under /content/drive/MyDrive)
# can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create the outputs folder on the mounted Drive (the saved model, vectorizer
# and confusion-matrix image are written there). `-p` also creates missing
# parent folders and makes the cell safe to re-run.
!mkdir -p '/content/drive/MyDrive/Colab Notebooks/svm_classification/outputs'

In [3]:
"""
SVM Classification with Trigrams
=================================
Train an SVM classifier on your labeled samples using trigram features.

Difference from previous model:
- Previous: TF-IDF with unigrams + bigrams (1,2)
- This: TF-IDF with trigrams (3,3) or combined (1,3)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, f1_score, precision_score, recall_score)
import re
import pickle
import warnings
warnings.filterwarnings('ignore')

# Everything lives in one project folder on the mounted Google Drive.
# Edit _PROJECT_DIR to relocate the dataset and all outputs at once.
_PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/svm_classification'
_OUTPUT_DIR = f'{_PROJECT_DIR}/outputs'

# Labelled samples, loaded with pd.read_csv by the training function
DATA_PATH = f'{_PROJECT_DIR}/manual_label_batch1_updated.csv'

# Artefacts written by the training run
MODEL_SAVE_PATH = f'{_OUTPUT_DIR}/svm_trigram_model.pkl'
VECTORIZER_SAVE_PATH = f'{_OUTPUT_DIR}/svm_trigram_vectorizer.pkl'
CONFUSION_MATRIX_PATH = f'{_OUTPUT_DIR}/svm_trigram_confusion_matrix.png'

# Local (non-Colab) alternative, kept for reference.
# NOTE(review): the training function reads DATA_PATH with pd.read_csv, so the
# .xlsx file below would need a different loader - confirm before enabling.
# DATA_PATH = 'manual_label_batch1.xlsx'
# MODEL_SAVE_PATH = 'outputs/svm_trigram_model.pkl'
# VECTORIZER_SAVE_PATH = 'outputs/svm_trigram_vectorizer.pkl'
# CONFUSION_MATRIX_PATH = 'outputs/svm_trigram_confusion_matrix.png'


In [4]:
# N-gram setting for the TF-IDF vectorizer. Documented options:
# 'trigrams_only': (3, 3) - only 3-word sequences
# 'unigrams_to_trigrams': (1, 3) - words, 2-word, and 3-word sequences
NGRAM_TYPE = 'unigrams_to_trigrams'  # Change to 'trigrams_only' if you want only trigrams

# Fraction of the samples held out as the test set (passed to train_test_split).
TEST_SIZE = 0.2
# Seed passed as random_state to both train_test_split and SVC.
RANDOM_STATE = 42

In [5]:
# preprocessing
def preprocess_text(text):
    """Normalise one raw text sample before vectorisation.

    The text is lower-cased, URLs (``http...`` and ``www.`` forms) are
    removed, and every run of whitespace is collapsed to a single space with
    leading/trailing whitespace trimmed.

    Args:
        text: The raw sample. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN, which is what pandas
            yields for an empty cell) are treated as empty text so they do
            not turn into the literal tokens ``'none'`` / ``'nan'``.

    Returns:
        str: The cleaned, lower-cased text (possibly empty).
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # The dot after "www" must be escaped: an unescaped "." matches ANY
    # character, so "awww so cute" used to lose "www so" as if it were a URL.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def handle_multilabels(df):
    """Reduce multi-label rows to their first label and cast labels to int.

    A ``label`` value whose string form contains a comma (e.g. ``"2,5"``) is
    treated as multi-label; only the part before the first comma is kept.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame whose ``label`` column has integer dtype. The frame
        passed in is left unmodified (the original implementation overwrote
        the caller's ``label`` column in place).

    Raises:
        Whatever the final ``astype(int)`` raises when a label cannot be
        converted to an integer (for example a missing label).
    """
    print("\nHandling multi-label samples...")

    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never mutated behind its back.
    df = df.copy()

    labels_as_text = df['label'].astype(str)
    # regex=False: the comma is a plain separator, not a pattern.
    multi_label_count = int(labels_as_text.str.contains(',', regex=False).sum())

    if multi_label_count > 0:
        print(f"  Found {multi_label_count} multi-label samples")
        df['label'] = labels_as_text.str.split(',').str[0]

    df['label'] = df['label'].astype(int)
    return df

In [10]:
# MAIN TRAINING FUNCTION
# ============================================================================

def train_svm_trigram():
    """Train, evaluate and save a TF-IDF + SVM text classifier.

    All settings come from the module-level configuration (``DATA_PATH``,
    ``NGRAM_TYPE``, ``TEST_SIZE``, ``RANDOM_STATE`` and the three output
    paths). Steps:

      1. Read ``DATA_PATH`` as CSV; the ``text`` and ``label`` columns are used.
      2. Clean the text with ``preprocess_text`` and normalise the labels
         with ``handle_multilabels``.
      3. Make a stratified train/test split.
      4. Fit a ``TfidfVectorizer`` on the training texts only.
      5. Fit one ``SVC`` per kernel (linear, rbf). The kernel is chosen by
         cross-validated weighted F1 on the *training* split, so the test set
         is not used for model selection and its metrics stay unbiased.
      6. Print test-set metrics and save the confusion-matrix image.
      7. Pickle the chosen model and the fitted vectorizer.

    Returns:
        tuple: ``(best_model, vectorizer, results)``. ``results`` maps each
        kernel name to a dict with the keys ``'model'``, ``'accuracy'``,
        ``'f1_weighted'``, ``'f1_macro'``, ``'predictions'`` (all measured on
        the test split) and ``'cv_f1_weighted'`` (mean cross-validated score
        on the training split, or ``None`` when cross-validation was skipped).

    Raises:
        ValueError: If ``NGRAM_TYPE`` is not one of the supported names.
    """
    import os  # function-scope import: only needed to create the output folders

    # Validate the configuration before doing any work, so a mistyped
    # NGRAM_TYPE fails loudly instead of silently training on (1, 3).
    ngram_range, ngram_description = _resolve_ngram_range(NGRAM_TYPE)

    # Ensure every output folder exists up front; otherwise the run would only
    # fail at the very end, after training, when the files are written.
    for output_path in (MODEL_SAVE_PATH, VECTORIZER_SAVE_PATH, CONFUSION_MATRIX_PATH):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    print("\n" + "="*80)
    print("SVM CLASSIFICATION WITH TRIGRAMS")
    print("="*80)

    # -------------------------------------------------------------------------
    # 1. LOAD DATA
    # -------------------------------------------------------------------------
    print("\n[1/7] Loading data...")
    df = pd.read_csv(DATA_PATH)
    print(f"  Loaded {len(df)} samples")
    print(f"  Columns: {df.columns.tolist()}")

    # -------------------------------------------------------------------------
    # 2. PREPROCESS
    # -------------------------------------------------------------------------
    print("\n[2/7] Preprocessing text...")
    df['text_processed'] = df['text'].apply(preprocess_text)
    df = handle_multilabels(df)

    print(f"\n  Label distribution:")
    print(df['label'].value_counts().sort_index())

    # -------------------------------------------------------------------------
    # 3. TRAIN-TEST SPLIT
    # -------------------------------------------------------------------------
    print(f"\n[3/7] Splitting data (test size: {TEST_SIZE})...")
    X = df['text_processed'].values
    y = df['label'].values

    # stratify=y keeps the class proportions equal in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    print(f"  Training set: {len(X_train)} samples")
    print(f"  Test set: {len(X_test)} samples")

    # -------------------------------------------------------------------------
    # 4. FEATURE EXTRACTION WITH TRIGRAMS
    # -------------------------------------------------------------------------
    print(f"\n[4/7] Extracting features...")
    print(f"  N-gram type: {NGRAM_TYPE}")
    print(f"  Using: {ngram_description}")

    vectorizer = TfidfVectorizer(
        max_features=1000,  # Increased for trigrams
        min_df=2,
        max_df=0.8,
        ngram_range=ngram_range,
        stop_words='english'
    )

    # Vocabulary and IDF weights are learned from the training texts only;
    # the test texts are merely projected onto that vocabulary.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    feature_names = vectorizer.get_feature_names_out()
    print(f"   Feature matrix shape (train): {X_train_tfidf.shape}")
    print(f"   Feature matrix shape (test): {X_test_tfidf.shape}")
    print(f"   Number of features: {len(feature_names)}")

    # Show sample trigrams
    trigram_features = [f for f in feature_names if len(f.split()) == 3]
    if trigram_features:
        print(f"\n  Sample trigrams extracted:")
        for i, tg in enumerate(trigram_features[:5], 1):
            print(f"    {i}. '{tg}'")

    # -------------------------------------------------------------------------
    # 5. TRAIN SVM MODEL
    # -------------------------------------------------------------------------
    print(f"\n[5/7] Training SVM classifier...")

    kernels = ['linear', 'rbf']
    results = {}

    # Use at most 5 stratified folds, but never more folds than the rarest
    # training class has samples, so every fold can contain every class.
    _, train_class_counts = np.unique(y_train, return_counts=True)
    n_folds = min(5, int(train_class_counts.min()))
    use_cv = n_folds >= 2
    if not use_cv:
        print("  Note: a class has a single training sample, so cross-validation "
              "is not possible; kernel selection falls back to test-set F1.")

    for kernel in kernels:
        print(f"\n  Training SVM with {kernel} kernel...")

        svc_params = dict(
            kernel=kernel,
            C=1.0,
            random_state=RANDOM_STATE,
            class_weight='balanced'
        )

        # Model-selection score, computed on the training split only.
        # Probability calibration is not needed for scoring, so it is left
        # off here to avoid its extra internal cross-validation.
        cv_f1_weighted = None
        if use_cv:
            cv_scores = cross_val_score(
                SVC(**svc_params), X_train_tfidf, y_train,
                cv=n_folds, scoring='f1_weighted'
            )
            cv_f1_weighted = float(cv_scores.mean())

        # Final model for this kernel; probability=True enables predict_proba.
        model = SVC(probability=True, **svc_params)
        model.fit(X_train_tfidf, y_train)

        # Test-set metrics are for reporting only.
        y_pred = model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        f1_macro = f1_score(y_test, y_pred, average='macro')

        results[kernel] = {
            'model': model,
            'accuracy': accuracy,
            'f1_weighted': f1_weighted,
            'f1_macro': f1_macro,
            'predictions': y_pred,
            'cv_f1_weighted': cv_f1_weighted
        }

        if use_cv:
            print(f"    CV F1 (Weighted, {n_folds}-fold on training set): {cv_f1_weighted:.4f}")
        print(f"    Accuracy: {accuracy:.4f}")
        print(f"    F1 (Weighted): {f1_weighted:.4f}")
        print(f"    F1 (Macro): {f1_macro:.4f}")

    # Pick the kernel WITHOUT looking at the test set whenever possible:
    # choosing on test F1 would make the reported test metrics optimistic.
    selection_metric = 'cv_f1_weighted' if use_cv else 'f1_weighted'
    best_kernel = max(results, key=lambda k: results[k][selection_metric])
    best_model = results[best_kernel]['model']
    best_predictions = results[best_kernel]['predictions']

    print(f"\n   Best kernel: {best_kernel} (selected by {selection_metric})")
    print(f"   Test F1 score (weighted): {results[best_kernel]['f1_weighted']:.4f}")

    # -------------------------------------------------------------------------
    # 6. DETAILED EVALUATION
    # -------------------------------------------------------------------------
    print(f"\n[6/7] Evaluating best model...")

    print(f"\n  Classification Report:")
    print(classification_report(y_test, best_predictions))

    # Fix the row/column order to the model's classes so the matrix and the
    # tick labels below always refer to the actual label values.
    class_labels = list(best_model.classes_)
    cm = confusion_matrix(y_test, best_predictions, labels=class_labels)
    print(f"\n  Confusion Matrix:")
    print(cm)

    # Plot confusion matrix (explicit figure/axes; closed after saving)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(f'Confusion Matrix - SVM with Trigrams ({best_kernel} kernel)')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(CONFUSION_MATRIX_PATH, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"\n   Confusion matrix saved: {CONFUSION_MATRIX_PATH}")

    # -------------------------------------------------------------------------
    # 7. SAVE MODEL
    # -------------------------------------------------------------------------
    print(f"\n[7/7] Saving model...")

    with open(MODEL_SAVE_PATH, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"   Model saved: {MODEL_SAVE_PATH}")

    with open(VECTORIZER_SAVE_PATH, 'wb') as f:
        pickle.dump(vectorizer, f)
    print(f"  Vectorizer saved: {VECTORIZER_SAVE_PATH}")

    # -------------------------------------------------------------------------
    # FINAL SUMMARY
    # -------------------------------------------------------------------------
    print("\n" + "="*80)
    print("TRAINING COMPLETE!")
    print("="*80)

    current_accuracy = results[best_kernel]['accuracy']
    # Accuracy reported for the earlier model this run is compared against.
    previous_accuracy = 0.6667

    print(f"\n Results Summary:")
    print(f"  Model: SVM ({best_kernel} kernel)")
    print(f"  Features: {NGRAM_TYPE}")
    print(f"  Accuracy: {current_accuracy:.2%}")
    print(f"  F1 Score (Weighted): {results[best_kernel]['f1_weighted']:.4f}")
    print(f"  F1 Score (Macro): {results[best_kernel]['f1_macro']:.4f}")

    print(f"\n Files Created:")
    print(f"  1. {MODEL_SAVE_PATH}")
    print(f"  2. {VECTORIZER_SAVE_PATH}")
    print(f"  3. {CONFUSION_MATRIX_PATH}")

    print(f"\n Comparison with Previous Model:")
    print(f"  Previous: Logistic Regression with unigrams+bigrams → {previous_accuracy:.2%} accuracy")
    print(f"  Current:  SVM with {NGRAM_TYPE} → {current_accuracy:.2%} accuracy")
    print(_accuracy_verdict(current_accuracy, previous_accuracy))

    return best_model, vectorizer, results


def _resolve_ngram_range(ngram_type):
    """Map an NGRAM_TYPE name to ``(ngram_range, printable description)``."""
    options = {
        'trigrams_only': ((3, 3), "trigrams only (3-word sequences)"),
        'unigrams_to_trigrams': ((1, 3), "unigrams + bigrams + trigrams"),
    }
    if ngram_type not in options:
        raise ValueError(
            f"Unknown NGRAM_TYPE {ngram_type!r}; expected one of {sorted(options)}"
        )
    return options[ngram_type]


def _accuracy_verdict(current_accuracy, previous_accuracy):
    """Return the summary line comparing two accuracies at 0.01% precision.

    Both values are rounded to four decimals (the precision they are printed
    with), so two runs that display the same percentage count as similar. An
    exact float comparison made the "similar" outcome practically unreachable.
    """
    current = round(float(current_accuracy), 4)
    previous = round(float(previous_accuracy), 4)
    if current > previous:
        return "   Improvement with trigrams!"
    if current < previous:
        return "   Lower accuracy - unigrams+bigrams may work better"
    return "   Similar performance"


In [11]:
#Prediction Function
def test_predictions():
    """Load the saved model and vectorizer and print predictions for sample texts.

    The artefacts are read from ``MODEL_SAVE_PATH`` and
    ``VECTORIZER_SAVE_PATH``. Each sample is cleaned with ``preprocess_text``,
    vectorised and classified; the predicted label is printed together with
    the model's probability estimate for that same label.

    Returns:
        None. Results are printed only.
    """

    print("\n" + "="*80)
    print("TESTING PREDICTIONS")
    print("="*80)

    # Load model
    # NOTE: unpickling runs arbitrary code from the file - only load
    # artefacts that were produced by this notebook / a trusted source.
    with open(MODEL_SAVE_PATH, 'rb') as f:
        model = pickle.load(f)
    with open(VECTORIZER_SAVE_PATH, 'rb') as f:
        vectorizer = pickle.load(f)

    # Sample texts
    sample_texts = [
        "I'm experiencing severe burnout in my cybersecurity role",
        "Looking for career advice in IT security",
        "Just got promoted to senior security analyst",
        "Dealing with stress and anxiety at work"
    ]

    print("\nSample Predictions:")
    print("-" * 80)

    # Columns of predict_proba follow the order of model.classes_.
    class_labels = list(model.classes_)

    # Vectorise and score all samples in one batch.
    vectorized = vectorizer.transform([preprocess_text(text) for text in sample_texts])
    predictions = model.predict(vectorized)
    probabilities = model.predict_proba(vectorized)

    for i, (text, prediction, class_probabilities) in enumerate(
            zip(sample_texts, predictions, probabilities), 1):
        # Report the probability OF THE PREDICTED LABEL. For an SVC with
        # probability=True the largest predict_proba entry can belong to a
        # different class than predict() returns, so taking max() could show
        # a confidence that does not correspond to the printed label.
        confidence = class_probabilities[class_labels.index(prediction)]

        print(f"\n{i}. Text: {text[:60]}...")
        print(f"   Predicted Label: {prediction}")
        print(f"   Confidence: {confidence:.2%}")

In [12]:
if __name__ == "__main__":

    # Fit, evaluate and save the classifier. The three returned objects stay
    # bound at module level for any later cells.
    model, vectorizer, results = train_svm_trigram()

    # Two newlines as a spacer, then run the saved model on the sample texts.
    print(end="\n\n")
    test_predictions()

    # Optional hyperparameter tuning (disabled).
    # NOTE(review): this sketch calls tune_svm_hyperparameters, which is not
    # defined in the cells shown here, and the original sketch loaded
    # DATA_PATH with pd.read_excel although training reads it with
    # pd.read_csv - read_csv is used below. Confirm both before enabling.
    #
    # print("\n")
    # print("Would you like to run hyperparameter tuning? (takes 5-10 minutes)")
    # response = input("Run tuning? (y/n): ")
    # if response.lower() == 'y':
    #     df = pd.read_csv(DATA_PATH)
    #     df['text_processed'] = df['text'].apply(preprocess_text)
    #     df = handle_multilabels(df)
    #     X = df['text_processed'].values
    #     y = df['label'].values
    #     X_train, X_test, y_train, y_test = train_test_split(
    #         X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    #     )
    #     vectorizer = TfidfVectorizer(
    #         max_features=1000,
    #         min_df=2,
    #         max_df=0.8,
    #         ngram_range=(1, 3),
    #         stop_words='english'
    #     )
    #     X_train_tfidf = vectorizer.fit_transform(X_train)
    #     tuned_model = tune_svm_hyperparameters(X_train_tfidf, y_train)

    # Closing banner: blank line, rule, message, rule.
    print("\n" + "=" * 80, "ALL DONE! ", "=" * 80, sep="\n")


SVM CLASSIFICATION WITH TRIGRAMS

[1/7] Loading data...
  Loaded 188 samples
  Columns: ['id', 'text', 'similarity_score', 'label']

[2/7] Preprocessing text...

Handling multi-label samples...

  Label distribution:
label
0    139
1     49
Name: count, dtype: int64

[3/7] Splitting data (test size: 0.2)...
  Training set: 150 samples
  Test set: 38 samples

[4/7] Extracting features...
  N-gram type: unigrams_to_trigrams
  Using: unigrams + bigrams + trigrams
   Feature matrix shape (train): (150, 1000)
   Feature matrix shape (test): (38, 1000)
   Number of features: 1000

  Sample trigrams extracted:
    1. 'feel like im'
    2. 'just don know'
    3. 'learning new things'
    4. 'like cybersecurity job'
    5. 'looks like cybersecurity'

[5/7] Training SVM classifier...

  Training SVM with linear kernel...
    Accuracy: 0.8947
    F1 (Weighted): 0.8976
    F1 (Macro): 0.8721

  Training SVM with rbf kernel...
    Accuracy: 0.8684
    F1 (Weighted): 0.8519
    F1 (Macro): 0.7923

