In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score
import numpy as np

In [2]:
df = pd.read_csv('..\\data\\employee_churn_dataset.csv')

In [3]:
for column in df.drop('Employee ID',axis=1).columns:
    try:
        df[column] + 1
    except:
        dummies = pd.get_dummies(df[column]).add_prefix(column+'_')
        df[dummies.columns] = dummies
        df.drop(column, axis=1, inplace=True)
features = df.drop(['Churn','Employee ID'], axis=1).columns

In [4]:
data_X = df[features]
data_y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=42, stratify=data_y)

In [5]:
from src.feature_engineering import FeatureEngineering
models = [LogisticRegression(), DummyClassifier(strategy='most_frequent')]
for model in range(len(models)):
    models[model] = make_pipeline(FeatureEngineering(), StandardScaler(), models[model])
    models[model].fit(X_train, y_train)

In [6]:
def precision_at_k(y_true, y_score, k):
    top_k_indices = np.argsort(y_score)[::-1][:k]
    relevant = y_true[top_k_indices].sum()
    return relevant / k

scores = []
for model in range(len(models)):
    y_true = y_test.values
    y_score_proba = models[model].predict_proba(X_test)[:,0]
    y_score = y_score_proba>=0.5
    auc_score = roc_auc_score(y_true, y_score)
    precision_at_k_score = precision_at_k(y_true, y_score_proba, 100)
    scores.append((models[model].steps, auc_score, precision_at_k_score))

In [7]:
scores.sort(key=lambda x:x[2], reverse=True)
scores

[([('featureengineering',
    <src.feature_engineering.FeatureEngineering at 0x1b09b756a70>),
   ('standardscaler', StandardScaler()),
   ('dummyclassifier', DummyClassifier(strategy='most_frequent'))],
  0.5,
  np.float64(0.21)),
 ([('featureengineering',
    <src.feature_engineering.FeatureEngineering at 0x1b09b757fd0>),
   ('standardscaler', StandardScaler()),
   ('logisticregression', LogisticRegression())],
  0.5,
  np.float64(0.19))]