In [None]:
'''
Text Classification Model Training - Simple SVM with Trigrams
Steps:
1. Data Loading & Exploration
2. Data Preprocessing
3. Handling Multi-labels
4. Train-Test Split
5. Feature Extraction (TF-IDF with Trigrams)
6. Model Training (SVM)
7. Model Evaluation
8. Model Saving
9. Prediction on New Data

Required libraries (install before running):
pip install scikit-learn pandas numpy matplotlib seaborn openpyxl xlrd --break-system-packages
'''


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, f1_score)
from sklearn.svm import SVC
import re
import warnings
warnings.filterwarnings('ignore')

print("All necessary libraries are imported successfully.")

All necessary libraries are imported successfully.


In [2]:
# STEP 1: DATA LOADING & EXPLORATION

def load_and_explore_data(file_path):
    """Load the labelled dataset and print an exploratory summary.

    The reader is picked from the file name: a path ending in ``.csv`` is
    parsed with ``pd.read_csv``; anything else goes through ``pd.read_excel``
    exactly as before. If a ``.csv`` path cannot be parsed as text (for
    example an Excel workbook saved under a ``.csv`` name) the function falls
    back to ``pd.read_excel`` so such files keep loading.

    Args:
        file_path: Path of the data file. The file must provide ``text`` and
            ``label`` columns, which are read for the printed summary.

    Returns:
        The loaded DataFrame with an extra ``text_length`` column holding
        ``len(str(text))`` for every row.
    """
    print("=" * 80)
    print("STEP 1: DATA LOADING & EXPLORATION")
    print("=" * 80)

    # Load data with the reader that matches the file type
    if str(file_path).lower().endswith('.csv'):
        try:
            df = pd.read_csv(file_path)
        except (UnicodeDecodeError, pd.errors.ParserError):
            # Not plain text after all: keep the original Excel behavior
            df = pd.read_excel(file_path)
    else:
        df = pd.read_excel(file_path)

    print(f"\nDataset shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    print("\nFirst few rows:")
    print(df.head())

    print("\nLabel distribution:")
    print(df['label'].value_counts())

    print("\nMissing values:")
    print(df.isnull().sum())

    print("\nText length statistics:")
    # str() keeps this working for non-string cells (numbers, NaN)
    df['text_length'] = df['text'].apply(lambda x: len(str(x)))
    print(df['text_length'].describe())

    return df


In [3]:
# STEP 2: DATA PREPROCESSING

def preprocess_text(text):
    """Clean a single text value for vectorization.

    Steps: lowercase, strip URLs (``http...`` and ``www....`` tokens), then
    collapse every run of whitespace into a single space and trim the ends.

    Args:
        text: The raw value. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` for a missing value.
    """
    # Missing values would otherwise turn into the literal tokens "none"/"nan"
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs. The dot after "www" is escaped so that ordinary words that
    # merely contain "www" are not swallowed.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
# STEP 3: HANDLING MULTI-LABELS

def handle_multilabels(df, strategy='first'):
    """
    Resolve comma-separated multi-label entries and cast labels to int.

    Strategies:
    - 'first': Take the first label
    - 'remove': Remove multi-label samples
    - 'separate': Create separate samples for each label (single-label rows
      come first, followed by the expanded rows; the index is reset)

    Args:
        df: DataFrame with a 'label' column. It is not modified; the result
            is always a new DataFrame.
        strategy: One of 'first', 'remove', 'separate'.

    Returns:
        A DataFrame whose 'label' column has been converted with astype(int).

    Raises:
        ValueError: If ``strategy`` is not recognised, or a remaining label
            cannot be converted to an integer.
    """
    valid_strategies = ('first', 'remove', 'separate')
    if strategy not in valid_strategies:
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {valid_strategies}"
        )

    print("\n" + "=" * 80)
    print("STEP 3: HANDLING MULTI-LABELS")
    print("=" * 80)

    # Work on a copy so the caller's DataFrame is never changed in place
    df = df.copy()

    # Identify multi-label samples
    label_text = df['label'].astype(str)
    multi_label_mask = label_text.str.contains(',', regex=False)
    multi_label_count = multi_label_mask.sum()

    print(f"\nMulti-label samples found: {multi_label_count}")
    print(f"Multi-label samples: {df.loc[multi_label_mask, 'label'].unique()}")

    if strategy == 'first':
        print("\nStrategy: Taking first label from multi-label samples")
        df['label'] = label_text.str.split(',').str[0].str.strip()

    elif strategy == 'remove':
        print("\nStrategy: Removing multi-label samples")
        df = df[~multi_label_mask].copy()

    else:  # 'separate'
        print("\nStrategy: Creating separate samples for each label")
        single_rows = df[~multi_label_mask]
        if multi_label_count:
            # One output row per label; explode keeps the labels of each
            # original row together and in their written order
            expanded = (
                df[multi_label_mask]
                .assign(label=label_text[multi_label_mask].str.split(','))
                .explode('label')
            )
            expanded['label'] = expanded['label'].str.strip()
            df = pd.concat([single_rows, expanded], ignore_index=True)
        else:
            df = single_rows.reset_index(drop=True)

    # Convert labels to integers
    df['label'] = df['label'].astype(int)

    print("\nFinal label distribution:")
    print(df['label'].value_counts().sort_index())

    return df

In [5]:
# STEP 4: TRAIN-TEST SPLIT

def split_data(df, test_size=0.2, random_state=42, stratify=True):
    """Partition the 'text' and 'label' columns into train and test arrays.

    Args:
        df: DataFrame providing 'text' and 'label' columns.
        test_size: Forwarded to train_test_split.
        random_state: Forwarded to train_test_split.
        stratify: When truthy, the label array is passed as the ``stratify``
            argument; otherwise no ``stratify`` argument is passed.

    Returns:
        (X_train, X_test, y_train, y_test) as returned by train_test_split.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("STEP 4: TRAIN-TEST SPLIT")
    print(rule)

    texts = df['text'].values
    labels = df['label'].values

    # Build the keyword set once; the stratify key is only present on request
    split_options = {'test_size': test_size, 'random_state': random_state}
    if stratify:
        split_options['stratify'] = labels

    X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)

    print(f"\nTraining set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    distribution_reports = (
        ("\nTraining set label distribution:", y_train),
        ("\nTest set label distribution:", y_test),
    )
    for heading, label_values in distribution_reports:
        print(heading)
        print(pd.Series(label_values).value_counts().sort_index())

    return X_train, X_test, y_train, y_test


In [6]:
# STEP 5: FEATURE EXTRACTION (TF-IDF with Trigrams)

def extract_features_tfidf_trigrams(X_train, X_test, max_features=1000):
    """Vectorize train and test texts with a TF-IDF model over 1- to 3-grams.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``.

    Args:
        X_train: Training documents.
        X_test: Test documents.
        max_features: Passed to TfidfVectorizer as ``max_features``.

    Returns:
        (X_train_tfidf, X_test_tfidf, vectorizer)
    """
    rule = "=" * 80
    print("\n" + rule)
    print("STEP 5: FEATURE EXTRACTION (TF-IDF with Trigrams)")
    print(rule)

    tfidf_options = dict(
        max_features=max_features,
        min_df=2,             # skip terms found in fewer than 2 documents
        max_df=0.8,           # skip terms found in more than 80% of documents
        ngram_range=(1, 3),   # unigrams, bigrams and trigrams
        stop_words='english',
    )
    vectorizer = TfidfVectorizer(**tfidf_options)

    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    features = vectorizer.get_feature_names_out()

    print(f"\nTF-IDF matrix shape (train): {X_train_tfidf.shape}")
    print(f"TF-IDF matrix shape (test): {X_test_tfidf.shape}")
    print(f"Number of features: {len(features)}")
    print(f"\nSample trigram features:")

    # A feature made of exactly three whitespace-separated tokens is a trigram
    trigrams = []
    for feature_name in features:
        if len(feature_name.split()) == 3:
            trigrams.append(feature_name)
    print(f"Example trigrams: {trigrams[:10]}")

    return X_train_tfidf, X_test_tfidf, vectorizer


In [7]:
# STEP 6: MODEL TRAINING (SVM only)

def train_svm_model(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel SVC and report its scores on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels, used only for scoring.

    Returns:
        (model, y_pred): the fitted classifier and its predictions for
        ``X_test``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("STEP 6: MODEL TRAINING (SVM)")
    print(rule)

    print(f"\nTraining SVM with linear kernel...")

    # Fit the classifier, then score it on the held-out rows
    model = SVC(kernel='linear', random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1_by_average = {
        averaging: f1_score(y_test, y_pred, average=averaging)
        for averaging in ('weighted', 'macro')
    }

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"F1 Score (Weighted): {f1_by_average['weighted']:.4f}")
    print(f"F1 Score (Macro): {f1_by_average['macro']:.4f}")

    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return model, y_pred


In [8]:
# STEP 7: MODEL EVALUATION

def evaluate_model(y_test, y_pred, output_path='confusion_matrix_svm_trigram.png'):
    """Print the confusion matrix and save it as a labelled heatmap image.

    The class order is fixed explicitly (sorted union of the values found in
    ``y_test`` and ``y_pred``) and reused for the heatmap tick labels, so the
    axes show the real label values even when the labels are not the
    contiguous range 0..n-1.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        output_path: Where the heatmap image is written. Defaults to the
            file name the pipeline has always used.

    Returns:
        None. The matrix is printed and the figure is written to
        ``output_path``.
    """
    print("\n" + "=" * 80)
    print("STEP 7: MODEL EVALUATION")
    print("=" * 80)

    # Sorted union of observed classes: rows/columns of the matrix and the
    # plot ticks all follow this order
    class_labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    print("\nConfusion Matrix:")
    print(cm)

    # Plot confusion matrix on an explicit figure so it can be closed reliably
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title('Confusion Matrix - SVM with Trigrams')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    print(f"\nConfusion matrix saved to: {output_path}")


In [9]:
# STEP 8: MODEL SAVING

def save_model(model, vectorizer, filepath_model, filepath_vectorizer):
    """Pickle the model and the vectorizer to their own files.

    Args:
        model: Object written to ``filepath_model``.
        vectorizer: Object written to ``filepath_vectorizer``.
        filepath_model: Destination path for the pickled model.
        filepath_vectorizer: Destination path for the pickled vectorizer.

    Returns:
        None. Existing files at either path are overwritten.
    """
    import pickle

    rule = "=" * 80
    print("\n" + rule)
    print("STEP 8: SAVING MODEL")
    print(rule)

    def _write_pickle(obj, destination):
        # Binary mode is required for pickle output
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)

    _write_pickle(model, filepath_model)
    print(f"\nModel saved to: {filepath_model}")

    _write_pickle(vectorizer, filepath_vectorizer)
    print(f"Vectorizer saved to: {filepath_vectorizer}")


def load_model(filepath_model, filepath_vectorizer):
    """Read a pickled model and a pickled vectorizer back from disk.

    NOTE: unpickling can execute arbitrary code. Only pass files that were
    produced by a trusted source (for example by ``save_model`` in this
    notebook).

    Args:
        filepath_model: Path of the pickled model.
        filepath_vectorizer: Path of the pickled vectorizer.

    Returns:
        (model, vectorizer) in that order.
    """
    import pickle

    def _read_pickle(source):
        with open(source, 'rb') as handle:
            return pickle.load(handle)

    # Model first, vectorizer second, matching the returned tuple order
    model = _read_pickle(filepath_model)
    vectorizer = _read_pickle(filepath_vectorizer)

    return model, vectorizer


In [10]:
# STEP 9: PREDICTION ON NEW DATA

def predict_new_data(model, vectorizer, new_texts):
    """Predict labels for new text data and print a short preview of each.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        new_texts: An iterable of texts, or a single string (treated as one
            document rather than a sequence of characters). Items that are
            not strings are accepted; they are converted with ``str()`` for
            the printed preview.

    Returns:
        The predictions returned by ``model.predict``, one per input text.
        For empty input an empty NumPy array is returned without calling the
        model.
    """
    print("\n" + "=" * 80)
    print("STEP 9: PREDICTION ON NEW DATA")
    print("=" * 80)

    # A bare string would otherwise be iterated character by character
    if isinstance(new_texts, str):
        new_texts = [new_texts]
    else:
        # Materialize once so generators can be iterated twice below
        new_texts = list(new_texts)

    if not new_texts:
        print("\nNo texts provided; nothing to predict.")
        return np.array([])

    # Preprocess
    new_texts_processed = [preprocess_text(text) for text in new_texts]

    # Vectorize
    new_texts_tfidf = vectorizer.transform(new_texts_processed)

    # Predict
    predictions = model.predict(new_texts_tfidf)

    for position, (text, pred) in enumerate(zip(new_texts, predictions), start=1):
        # str() keeps the preview working for non-string inputs
        print(f"\nText {position}: {str(text)[:100]}...")
        print(f"Predicted Label: {pred}")

    return predictions

In [None]:
# MAIN PIPELINE

def main(data_path='test_data_with_labels_BINARY.csv'):
    """Run the complete classification pipeline.

    Loads the dataset, cleans the text, resolves multi-label rows with the
    'first' strategy, splits 80/20, builds TF-IDF 1-3 gram features, trains
    and evaluates the SVM, saves the artifacts and finally predicts three
    sample sentences.

    Args:
        data_path: Path of the labelled dataset passed to
            load_and_explore_data. Defaults to the file name this notebook
            was written for, so calling ``main()`` behaves as before.

    Side effects:
        Writes 'confusion_matrix_svm_trigram.png', 'svm_trigram_model.pkl'
        and 'vectorizer_trigram.pkl' to the working directory and prints
        progress for every step.
    """

    print("\n" + "=" * 80)
    print("TEXT CLASSIFICATION: SVM WITH TRIGRAMS")
    print("=" * 80)

    # Step 1: Load data
    df = load_and_explore_data(data_path)

    # Step 2: Preprocess text
    print("\n" + "=" * 80)
    print("STEP 2: TEXT PREPROCESSING")
    print("=" * 80)
    df['text_processed'] = df['text'].apply(preprocess_text)
    print("\nSample preprocessed text:")
    print(df['text_processed'].iloc[0])

    # Step 3: Handle multi-labels
    df = handle_multilabels(df, strategy='first')

    # Step 4: Split data
    X_train, X_test, y_train, y_test = split_data(df, test_size=0.2)

    # split_data hands back the raw 'text' column, so the cleaning is applied
    # to each partition here before vectorizing
    X_train_processed = [preprocess_text(text) for text in X_train]
    X_test_processed = [preprocess_text(text) for text in X_test]

    # Step 5: Feature extraction with trigrams
    X_train_tfidf, X_test_tfidf, vectorizer = extract_features_tfidf_trigrams(
        X_train_processed, X_test_processed, max_features=1000
    )

    # Step 6: Train SVM model
    model, y_pred = train_svm_model(X_train_tfidf, y_train, X_test_tfidf, y_test)

    # Step 7: Evaluate model
    evaluate_model(y_test, y_pred)

    # Step 8: Save model
    save_model(
        model,
        vectorizer,
        'svm_trigram_model.pkl',
        'vectorizer_trigram.pkl'
    )

    # Step 9: Test prediction on new data
    sample_texts = [
        "I'm experiencing severe burnout in my cybersecurity role",
        "Looking for recommendations on IT career development",
        "Just got a new security analyst position, very excited!"
    ]

    # The predictions are printed by predict_new_data; nothing else uses them
    predict_new_data(model, vectorizer, sample_texts)

    print("\n" + "=" * 80)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("=" * 80)
    print("\nSummary:")
    print("- Model: SVM with linear kernel")
    print("- Features: TF-IDF with trigrams (1-3 grams)")
    print("- No hyperparameter tuning (kept simple)")
    print("\nFiles created:")
    print("1. confusion_matrix_svm_trigram.png")
    print("2. svm_trigram_model.pkl")
    print("3. vectorizer_trigram.pkl")


# Entry point: run the whole pipeline when this code is executed directly
# (as a script, or as a notebook cell where __name__ is "__main__"), but not
# when the file is imported as a module.
if __name__ == "__main__":
    main()



TEXT CLASSIFICATION: SVM WITH TRIGRAMS
STEP 1: DATA LOADING & EXPLORATION

Dataset shape: (177, 4)

Columns: ['id', 'text', 'similarity_score', 'label']

First few rows:
        id                                               text  \
0  1g4a7ot  Burn out among Cybersecurity leaders at a frus...   
1  1dqiog2  Invitation to Participate in Research Study on...   
2  1g49xt4  Dealing with feeling stuck in the security fie...   
3  1fqxn7a  How are you doing guys?\nIs this cybersecurity...   
4  1fbdhwo  Hey folks, for those of you working right now,...   

   similarity_score label  
0          0.437536     1  
1          0.400815     0  
2          0.281245     0  
3          0.316323     1  
4          0.377892     2  

Label distribution:
label
0      107
1       38
2        8
6        6
3        3
7        3
9        3
5        2
7,8      2
4        2
4,8      1
2,9      1
1,4      1
Name: count, dtype: int64

Missing values:
id                  0
text                0
similarity_sc