In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from io import StringIO
# Running results table: later cells append one row per model/split to it.
metric_columns = ['task', 'precision', 'recall', 'f1', 'rocauc']
metrics = pd.DataFrame(columns=metric_columns)

In [None]:
# Load the prepared dataset, remove the columns that are not used as model
# inputs, and produce a stratified 80/20 train/test split.
drop_cols = ['isna', 'student_group','student_name', 'person_id', 'project_id', 'target_multiclass', 'rate_total', 'project_head']
cat_features = ['stepen', 'course']

df = pd.read_csv("./notna_df.csv", sep=';', index_col=0).drop(columns=drop_cols)

# Store the categorical columns as strings.
df[cat_features] = df[cat_features].astype(str)

# The label is the 'target' column; every other column is a feature.
# Missing feature values are replaced with 0.
y = df['target']
X = df.drop(columns=['target']).fillna(0)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Baseline RBF-kernel SVM: encode categoricals, standardize, fit, evaluate.

# Work on copies so the column assignments below never write into a view of X.
X_train, X_test = X_train.copy(), X_test.copy()

# Integer-encode every object-dtype column using categories learned from the
# training split only. Categories are sorted, so the codes are the same ones
# LabelEncoder would assign; a value that occurs only in the test split is
# encoded as -1 instead of raising ValueError.
cat_features = X_train.select_dtypes(include='object').columns.tolist()
for col in cat_features:
    train_values = pd.Categorical(X_train[col].astype(str))
    X_train[col] = train_values.codes
    X_test[col] = pd.Categorical(
        X_test[col].astype(str), categories=train_values.categories
    ).codes

# Standardize with statistics from the training split. From here on X_train
# and X_test are NumPy arrays; the cells below rely on them being scaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The inputs are already standardized above, so the pipeline holds only the
# classifier (a second StandardScaler here would be a redundant re-scaling).
model = Pipeline([
    ('svm', SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42))
])

model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print('TRAIN')
print(classification_report(y_train, y_pred_train))
print('TEST')
print(classification_report(y_test, y_pred))

# Scores for the default positive label (1); reused by the metrics-table cell.
pr_train = precision_score(y_train, y_pred_train)
pr_test = precision_score(y_test, y_pred)

rec_train = recall_score(y_train, y_pred_train)
rec_test = recall_score(y_test, y_pred)

f_train = f1_score(y_train, y_pred_train)
f_test = f1_score(y_test, y_pred)

# ROC-AUC is computed from the predicted probability of class 1.
roc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
roc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])


print('ROC-AUC train', roc_train)
print('ROC-AUC test', roc_test)


TRAIN
              precision    recall  f1-score   support

           0       0.18      0.89      0.29        80
           1       0.98      0.63      0.77       907

    accuracy                           0.65       987
   macro avg       0.58      0.76      0.53       987
weighted avg       0.92      0.65      0.73       987

TEST
              precision    recall  f1-score   support

           0       0.15      0.75      0.25        20
           1       0.97      0.63      0.77       227

    accuracy                           0.64       247
   macro avg       0.56      0.69      0.51       247
weighted avg       0.90      0.64      0.72       247

ROC-AUC train 0.8507855567805954
ROC-AUC test 0.7651982378854626


In [None]:
# Append the baseline SVM's train and test scores to the running `metrics`
# table and persist it.
task = 'svm'

for split_suffix, y_true, y_hat, pr, rec, f1, roc in (
    ('_train', y_train, y_pred_train, pr_train, rec_train, f_train, roc_train),
    ('_test', y_test, y_pred, pr_test, rec_test, f_test, roc_test),
):
    new_row = pd.DataFrame({
        "task": [task + split_suffix],
        "precision": [pr],
        "recall": [rec],
        "f1": [f1],
        "rocauc": [roc],
        # Class-0 scores are computed directly rather than parsed back out of
        # the printed classification_report text, which only carries two
        # decimals and depends on the report's layout.
        "precision_class=0": [precision_score(y_true, y_hat, pos_label=0)],
        "recall_class=0": [recall_score(y_true, y_hat, pos_label=0)],
        "f1-score_class=0": [f1_score(y_true, y_hat, pos_label=0)],
    })
    # Concatenating onto an empty frame is deprecated in pandas, so the first
    # row simply becomes the table.
    metrics = new_row if metrics.empty else pd.concat([metrics, new_row], ignore_index=True)

metrics.to_csv('./metrics.csv')
metrics

  metrics = pd.concat([metrics, new_row], ignore_index=True)


Unnamed: 0,task,precision,recall,f1,rocauc,precision_class=0,recall_class=0,f1-score_class=0
0,svm_train,0.984563,0.632856,0.77047,0.850786,0.18,0.89,0.29
1,svm_test,0.966443,0.634361,0.765957,0.765198,0.15,0.75,0.25


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Hyper-parameter search for the SVM: 3 x 3 x 2 x 2 = 36 candidates, each
# scored by ROC-AUC with 5-fold cross-validation on the training split.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42)),
])

param_grid = dict(
    svc__C=[0.1, 1, 10],
    svc__kernel=['linear', 'rbf', 'poly'],
    svc__gamma=['scale', 'auto'],
    svc__class_weight=[None, 'balanced'],
)

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,  # use all available cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score:", grid_search.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'svc__C': 10, 'svc__class_weight': 'balanced', 'svc__gamma': 'scale', 'svc__kernel': 'poly'}
Best ROC-AUC score: 0.7504538279400158


In [None]:
# Fit an SVM with fixed hyper-parameters (polynomial kernel, C=10,
# gamma='scale', balanced class weights) and evaluate it on both splits.
model = SVC(
    C=10,
    kernel='poly',
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=42,
)
model.fit(X_train, y_train)

y_pred_train, y_pred = model.predict(X_train), model.predict(X_test)

for split_label, y_true, y_hat in (
    ('TRAIN', y_train, y_pred_train),
    ('TEST', y_test, y_pred),
):
    print(split_label)
    print(classification_report(y_true, y_hat))

# Precision / recall / F1 for the default positive label, per split.
score_functions = (precision_score, recall_score, f1_score)
pr_train, rec_train, f_train = (score(y_train, y_pred_train) for score in score_functions)
pr_test, rec_test, f_test = (score(y_test, y_pred) for score in score_functions)

# ROC-AUC is computed from the predicted probability of class 1.
proba_train = model.predict_proba(X_train)[:, 1]
proba_test = model.predict_proba(X_test)[:, 1]
roc_train = roc_auc_score(y_train, proba_train)
roc_test = roc_auc_score(y_test, proba_test)


print('ROC-AUC train', roc_train)
print('ROC-AUC test', roc_test)

TRAIN
              precision    recall  f1-score   support

           0       0.20      0.94      0.33        80
           1       0.99      0.67      0.80       907

    accuracy                           0.69       987
   macro avg       0.60      0.80      0.56       987
weighted avg       0.93      0.69      0.76       987

TEST
              precision    recall  f1-score   support

           0       0.16      0.80      0.27        20
           1       0.97      0.63      0.77       227

    accuracy                           0.65       247
   macro avg       0.57      0.72      0.52       247
weighted avg       0.91      0.65      0.73       247

ROC-AUC train 0.8910281146637266
ROC-AUC test 0.7702643171806168


In [None]:
# Append the tuned SVM's train and test scores to the running `metrics`
# table and persist it.
task = 'svm_tuned'

for split_suffix, y_true, y_hat, pr, rec, f1, roc in (
    ('_train', y_train, y_pred_train, pr_train, rec_train, f_train, roc_train),
    ('_test', y_test, y_pred, pr_test, rec_test, f_test, roc_test),
):
    new_row = pd.DataFrame({
        "task": [task + split_suffix],
        "precision": [pr],
        "recall": [rec],
        "f1": [f1],
        "rocauc": [roc],
        # Class-0 scores are computed directly rather than parsed back out of
        # the printed classification_report text, which only carries two
        # decimals and depends on the report's layout.
        "precision_class=0": [precision_score(y_true, y_hat, pos_label=0)],
        "recall_class=0": [recall_score(y_true, y_hat, pos_label=0)],
        "f1-score_class=0": [f1_score(y_true, y_hat, pos_label=0)],
    })
    # Concatenating onto an empty frame is deprecated in pandas, so a row
    # arriving at an empty table simply becomes the table.
    metrics = new_row if metrics.empty else pd.concat([metrics, new_row], ignore_index=True)

metrics.to_csv('./metrics.csv')
metrics

Unnamed: 0,task,precision,recall,f1,rocauc,precision_class=0,recall_class=0,f1-score_class=0
0,svm_train,0.984563,0.632856,0.77047,0.850786,0.18,0.89,0.29
1,svm_test,0.966443,0.634361,0.765957,0.765198,0.15,0.75,0.25
2,svm_tuned_train,0.99183,0.669239,0.79921,0.891028,0.2,0.94,0.33
3,svm_tuned_test,0.972973,0.634361,0.768,0.770264,0.16,0.8,0.27


In [None]:
    from sklearn.inspection import permutation_importance
    import numpy as np

    # Permutation importance of the current `model` on the training split.
    # X_train is a NumPy array at this point (output of StandardScaler), so
    # the column labels are taken from the feature frame X, whose column
    # order the encoding and scaling steps preserve.
    feature_names = X.columns.tolist()

    result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)

    # Mean score change over the 10 shuffles of each feature.
    importances = result.importances_mean
    assert len(feature_names) == len(importances), "feature names do not match the model inputs"

    df_importance = pd.DataFrame(importances, index=feature_names, columns=['importance'])
    # Rank by magnitude so strongly negative importances are not buried at the bottom.
    df_importance['abs_importance'] = df_importance['importance'].abs()
    df_importance = df_importance.sort_values('abs_importance', ascending=False)

    print("Feature importances (permutation importance):")
    print(df_importance)

Feature importances (permutation importance):
                                        importance  abs_importance
completed_count_wekan                 4.620061e-02    4.620061e-02
card_created_total_wekan              3.698075e-02    3.698075e-02
created_count_wekan                   2.492401e-02    2.492401e-02
course                                2.228977e-02    2.228977e-02
hours_credited_wekan                  1.752786e-02    1.752786e-02
stepen                                1.722391e-02    1.722391e-02
card_completed_total_wekan            1.458967e-02    1.458967e-02
student_commits_contribution_git      9.017224e-03    9.017224e-03
max_hours_user_taiga                  6.585613e-03    6.585613e-03
hours_credited_taiga                  6.281662e-03    6.281662e-03
avg_activity_per_day_zulip            4.863222e-03    4.863222e-03
avg_operations_commit_git            -4.052685e-03    4.052685e-03
avg_dt_between_commits_git            3.951368e-03    3.951368e-03
median_hours_use

In [None]:
# Display the permutation-importance table (sorted by absolute importance,
# largest first) as this cell's rich output.
df_importance

Unnamed: 0,importance,abs_importance
completed_count_wekan,0.04620061,0.04620061
card_created_total_wekan,0.03698075,0.03698075
created_count_wekan,0.02492401,0.02492401
course,0.02228977,0.02228977
hours_credited_wekan,0.01752786,0.01752786
stepen,0.01722391,0.01722391
card_completed_total_wekan,0.01458967,0.01458967
student_commits_contribution_git,0.009017224,0.009017224
max_hours_user_taiga,0.006585613,0.006585613
hours_credited_taiga,0.006281662,0.006281662


In [None]:
# Write the collected metrics, rounded to two decimals, as a ';'-separated CSV.
metrics_rounded = metrics.round(2)
metrics_rounded.to_csv('metrics_svm.csv', sep=';')