In [29]:
import pandas as pd
import json
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer

In [None]:
# Read the extracted-features CSV into a DataFrame and preview its first rows.
# Later cells read its 'response' and 'Hire status' columns.
FEATURES_CSV = "../../data/extracted_features/features/x5_llm_features_interns.csv"
raw_data = pd.read_csv(FEATURES_CSV)
raw_data.head()

In [31]:
def _parse_features_json(response):
    """Decode one raw ``response`` cell with ``json.loads``.

    Returns the decoded JSON value. Returns ``None`` instead of raising when
    the cell is not text (``json.loads`` raises ``TypeError``, e.g. for a
    missing value read as NaN) or is not valid JSON (``JSONDecodeError``),
    so one bad row does not abort parsing of the whole column.
    """
    try:
        return json.loads(response)
    except (TypeError, json.JSONDecodeError):
        return None


# Work on a copy so raw_data stays untouched and this cell can be re-run.
df = raw_data.copy()
# Rows that could not be parsed hold None in 'features_json'.
df["features_json"] = df['response'].apply(_parse_features_json)

In [32]:
# Keep only rows whose parsed payload is non-null, and renumber them from 0 so
# the index lines up with the normalised frame built below.
has_features = df['features_json'].notna()
df = df.loc[has_features].reset_index(drop=True)

# Flatten each JSON payload into columns, then append 'Hire status' as the
# last column.
df_expanded = pd.json_normalize(df['features_json'])
df_final = pd.concat([df_expanded, df['Hire status']], axis='columns')
df_final

Unnamed: 0,gpa,cplus_plus_knowledge,csharp_knowledge,sql_knowledge,python_knowledge,javascript_knowledge,java_knowledge,cloud_computing_skills,teamwork_skills,english_knowledge,...,education_type,university,experience_as_analyst,experience_as_data_scientist,experience_as_machine_learning_engineer,experience_as_backend_developer,experience_as_frontend_developer,conference_participation,hackathon_participation,Hire status
0,0.0,False,False,True,True,False,False,False,3,3,...,1,1,False,False,False,False,False,False,False,0
1,4.0,False,False,True,True,False,False,False,5,3,...,1,2,False,False,False,True,False,False,True,0
2,4.5,False,False,True,True,False,False,False,5,4,...,1,1,True,True,False,False,False,False,False,0
3,0.0,False,False,True,True,False,False,False,0,0,...,0,0,True,True,True,False,False,False,False,1
4,4.0,False,False,True,True,False,False,False,3,3,...,1,1,False,False,False,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,4.5,False,False,True,True,False,False,False,3,2,...,1,1,False,True,False,False,False,False,False,1
307,4.0,False,False,False,True,False,False,False,4,4,...,4,4,False,False,True,False,False,False,True,0
308,4.5,False,False,True,True,False,False,False,5,3,...,1,1,False,True,True,False,False,False,False,0
309,4.5,False,False,True,True,False,False,False,5,5,...,1,1,False,True,False,False,False,False,False,1


In [33]:
# Positional split: the last column of df_final becomes the target y, and
# every column before it becomes a predictor in X.
n_predictors = df_final.shape[1] - 1
X = df_final.iloc[:, :n_predictors]
y = df_final.iloc[:, n_predictors]

In [34]:
def _columns_with_dtype(frame, dtype):
    """Return the column labels of ``frame`` whose dtype matches ``dtype``."""
    return frame.select_dtypes(include=[dtype]).columns


# Group predictors purely by pandas dtype: bool -> binary, int64 -> categorical,
# float64 -> numerical. A column with any other dtype lands in none of the groups.
binary_features = _columns_with_dtype(X, 'bool')
categorical_features = _columns_with_dtype(X, 'int64')
numerical_features = _columns_with_dtype(X, 'float64')

In [35]:
# Per-group preprocessing:
#   binary      -> passed through unchanged
#   categorical -> one-hot encoded; categories unseen at fit time are ignored
#   numerical   -> standardised with StandardScaler
feature_transformers = [
    ('binary', 'passthrough', binary_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features),
]

# Columns not named in any group are also passed through untouched.
preprocessor = ColumnTransformer(feature_transformers, remainder='passthrough')

In [36]:
def _build_pipeline(classifier):
    """Return a Pipeline that applies ``preprocessor`` and then ``classifier``.

    The step names ('preprocessor', 'classifier') are the same for every
    model, so the candidates differ only in the final estimator.
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])


logistic_pipeline = _build_pipeline(LogisticRegression(max_iter=1000))

# Fixed seed so repeated runs of the notebook report the same score for this
# model.
gradient_boosting_pipeline = _build_pipeline(
    GradientBoostingClassifier(random_state=42)
)

svm_pipeline = _build_pipeline(SVC())

# (display name, pipeline) pairs consumed by the evaluation loop.
pipelines = [
    ('Logistic Regression', logistic_pipeline),
    ('Gradient Boosting', gradient_boosting_pipeline),
    ('SVM', svm_pipeline)
]

In [40]:
# 5-fold cross-validated ROC-AUC for every candidate pipeline.
# Per-fold scores are kept for each model in cv_scores (keyed by display name);
# `scores` alone is overwritten on every iteration and ends up holding only the
# last model's folds.
cv_scores = {}
for name, pipeline in pipelines:
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
    cv_scores[name] = scores
    print(f"{name} ROC-AUC: {scores.mean():.2f}")

Logistic Regression ROC-AUC: 0.50
Gradient Boosting ROC-AUC: 0.50
SVM ROC-AUC: 0.52


In [38]:
# Show the per-fold ROC-AUC array left in `scores` by the most recent
# cross_val_score call, i.e. the last pipeline the evaluation loop ran.
# NOTE(review): this cell's execution count is lower than the loop cell's, so
# the saved output may come from an earlier run — re-run top to bottom to confirm.
scores

array([0.46975806, 0.47138398, 0.6024974 , 0.51404787, 0.54994797])