In [1]:
# The star import stays first so that the explicit imports below take precedence
# over any same-named objects it exports.
from pysymbolic.algorithms.symbolic_metamodeling import *
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split




In [2]:
# Load the absenteeism records from a semicolon-separated file.
data = pd.read_csv("data/absenteeism.csv", sep=';')

# Predictor columns, in the order they appear in the design matrix.
# 'Work load Average/day ' is spelled with a trailing space on purpose; it is
# used verbatim as a column key.
feature_names = [
    'Transportation expense',
    'Distance from Residence to Work',
    'Service time',
    'Age',
    'Work load Average/day ',
    'Hit target',
    'Disciplinary failure',
    'Education',
    'Son',
    'Social drinker',
    'Social smoker',
    'Pet',
    'Weight',
    'Height',
    'Body mass index',
]

# Rescale every predictor to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# the test rows contribute to the per-column min/max. Fitting on the training
# rows only would avoid that, but could push test values outside [0, 1] --
# confirm what the downstream models tolerate before changing it.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(data[feature_names])
X = scaler.transform(data[feature_names])

# Binary target: 1 when the absence lasted more than 4 hours, else 0.
Y = data['Absenteeism time in hours'].gt(4) * 1

# Hold out 33% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)


In [3]:
# Black-box baseline: XGBoost classifier with the library's default settings.
model = XGBClassifier()
model.fit(X_train, Y_train)

# Column index 1 of predict_proba is used as the ranking score for ROC AUC
# (presumably the probability of class 1, i.e. an absence longer than 4 hours).
xgb_test_scores = model.predict_proba(X_test)[:, 1]
print("ROCAUC score of XGB Classifier is ", roc_auc_score(Y_test, xgb_test_scores))

ROCAUC score of XGB Classifier is  0.700890272148234


In [4]:
# Reference baseline: logistic regression with scikit-learn's default settings,
# trained and scored on the same split as the XGBoost model.
model_L = LogisticRegression()
model_L.fit(X_train, Y_train)

# Column index 1 of predict_proba is used as the ranking score for ROC AUC.
logreg_test_scores = model_L.predict_proba(X_test)[:, 1]
print("ROCAUC score of logistic regression is ", roc_auc_score(Y_test, logreg_test_scores))

ROCAUC score of logistic regression is  0.6709250144759699


In [5]:
# Build a symbolic metamodel around the fitted XGBoost classifier, using the
# training inputs.
metamodel = symbolic_metamodel(model, X_train)

# Full-batch optimisation: the batch size equals the number of training rows.
n_train_rows = X_train.shape[0]
metamodel.fit(num_iter=10, batch_size=n_train_rows, learning_rate=.01)

# The metamodel's outputs on the held-out rows are scored directly against the
# true labels.
Y_metamodel = metamodel.evaluate(X_test)
metamodel_auc = roc_auc_score(Y_test, Y_metamodel)

print("ROCAUC score of symbolic metamodel is ", metamodel_auc)

---- Tuning the basis functions ----


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


----  Optimizing the metamodel  ----


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


ROCAUC score of symbolic metamodel is  0.7467790966994788


In [6]:
# Baseline: ridge regression on the scaled features plus all pairwise
# interaction terms (degree 2, no bias column, no squared terms).
feature = PolynomialFeatures(2, include_bias=False, interaction_only=True)

# The expanded matrices get their own names instead of overwriting
# X_train / X_test. Rebinding the originals made this cell non-idempotent
# (a second run would expand the already-expanded features) and broke any
# re-run of the earlier model cells, which expect the original columns.
X_train_poly = feature.fit_transform(X_train)
X_test_poly = feature.transform(X_test)

# `normalize=True` is intentionally not passed: scikit-learn documents it as
# ignored when fit_intercept=False, and the parameter was removed in
# scikit-learn 1.2, where passing it raises a TypeError.
clf = Ridge(alpha=.1, fit_intercept=False)
clf.fit(X_train_poly, Y_train)

# Raw regression outputs, squashed through a logistic sigmoid into (0, 1).
# The sigmoid is monotonic, so it does not change the ranking used by ROC AUC.
Y_pred_r = clf.predict(X_test_poly)
Y_pred = 1 / (1 + np.exp(-1 * Y_pred_r))
print("ROCAUC score of linear regression on extended feature set is ",roc_auc_score(Y_test,Y_pred))

ROCAUC score of linear regression on extended feature set is  0.7325926462072958
