In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


### Load Cleaned Data

In [2]:
# Read the three pre-cleaned splits (train / validation / test) in one pass.
train_df, val_df, test_df = map(
    pd.read_csv,
    ('train_data_cleaned.csv', 'val_data_cleaned.csv', 'test_data_cleaned.csv'),
)

# Quick sanity check on the (rows, columns) of each split.
print(f"Train: {train_df.shape}, Validation: {val_df.shape}, Test: {test_df.shape}")

Train: (1972, 7), Validation: (493, 6), Test: (1057, 6)


### Load the Hugging Face Embedding Model

In [3]:
# Model identifier, named once so the log line and the load cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

print(f"\nLoading model: {MODEL_NAME} ...")
model = SentenceTransformer(MODEL_NAME)


Loading model: sentence-transformers/all-mpnet-base-v2 ...


### Generate Embeddings

In [4]:
print("\nEncoding text into embeddings (this may take a few minutes)...")


def _encode_split(split_df):
    """Embed the ``clean_text`` column of one data split with ``model``.

    Missing values are replaced with empty strings before encoding: an empty
    cleaned text written to CSV is read back by pandas as NaN, and a float NaN
    is not valid text input for the encoder. Every row is still passed through,
    so the result is intended to stay row-aligned with the split's label columns.

    Parameters
    ----------
    split_df : pandas.DataFrame
        A split that contains a ``clean_text`` column.

    Returns
    -------
    The value returned by ``model.encode`` for the split's texts.
    """
    texts = split_df['clean_text'].fillna('').astype(str).tolist()
    return model.encode(texts, show_progress_bar=True, batch_size=32)


train_embeddings = _encode_split(train_df)
val_embeddings = _encode_split(val_df)
test_embeddings = _encode_split(test_df)


Encoding text into embeddings (this may take a few minutes)...


Batches: 100%|██████████████████████████████████| 62/62 [06:42<00:00,  6.48s/it]
Batches: 100%|██████████████████████████████████| 16/16 [01:41<00:00,  6.33s/it]
Batches: 100%|██████████████████████████████████| 34/34 [03:27<00:00,  6.11s/it]


#### Save embeddings for reuse

In [5]:
# Write each embedding matrix to its own .npy file so later runs can reuse it.
for npy_path, embedding_matrix in (
    ('train_embeddings_mpnet.npy', train_embeddings),
    ('val_embeddings_mpnet.npy', val_embeddings),
    ('test_embeddings_mpnet.npy', test_embeddings),
):
    np.save(npy_path, embedding_matrix)

print("\n✅ Embeddings generated and saved as .npy files!")


✅ Embeddings generated and saved as .npy files!


### Train a Simple Classifier (SVM for the 'belong' label)

In [6]:
print("\nTraining SVM classifier for 'belong' label...")

# Features are the sentence embeddings; targets are the 'belong' column.
X_train, X_val = train_embeddings, val_embeddings
y_train, y_val = train_df['belong'], val_df['belong']

# Linear-kernel SVM with probability estimates enabled; seed fixed for repeatability.
svm_model = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)

# Score the fitted model on the validation split.
val_preds = svm_model.predict(X_val)
belong_accuracy = accuracy_score(y_val, val_preds)
belong_report = classification_report(y_val, val_preds)
print("Validation Accuracy (belong):", belong_accuracy)
print("\nClassification Report:\n", belong_report)


Training SVM classifier for 'belong' label...
Validation Accuracy (belong): 0.7789046653144016

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74       214
           1       0.79      0.82      0.81       279

    accuracy                           0.78       493
   macro avg       0.78      0.77      0.77       493
weighted avg       0.78      0.78      0.78       493



### Repeat for "burden"

In [7]:
print("\nTraining SVM classifier for 'burden' label...")

# Same embedding features as before; only the target column changes to 'burden'.
X_train, X_val = train_embeddings, val_embeddings
y_train, y_val = train_df['burden'], val_df['burden']

# Linear-kernel SVM with probability estimates enabled; seed fixed for repeatability.
svm_model_burden = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)

# Score the fitted model on the validation split.
val_preds_burden = svm_model_burden.predict(X_val)
burden_accuracy = accuracy_score(y_val, val_preds_burden)
burden_report = classification_report(y_val, val_preds_burden)
print("Validation Accuracy (burden):", burden_accuracy)
print("\nClassification Report:\n", burden_report)


Training SVM classifier for 'burden' label...
Validation Accuracy (burden): 0.8296146044624746

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87       307
           1       0.84      0.68      0.75       186

    accuracy                           0.83       493
   macro avg       0.83      0.80      0.81       493
weighted avg       0.83      0.83      0.83       493



### Save Models

In [8]:
import joblib

# Persist each fitted classifier to its own file on disk.
for fitted_model, pkl_path in (
    (svm_model, "svm_belong_mpnet.pkl"),
    (svm_model_burden, "svm_burden_mpnet.pkl"),
):
    joblib.dump(fitted_model, pkl_path)

print("\n✅ Models saved as 'svm_belong_mpnet.pkl' and 'svm_burden_mpnet.pkl'")


✅ Models saved as 'svm_belong_mpnet.pkl' and 'svm_burden_mpnet.pkl'
