In [1]:
from cds_hw5_library.load.load_data import load_data
from cds_hw5_library.split.split_data import split_data
from cds_hw5_library.clean.missing import nan_destroyer, fill_nan
from cds_hw5_library.features.features import dummies, binary
from cds_hw5_library.trainpred.trainpred import train, pred
from cds_hw5_library.score.score import score

# Pipeline settings: the input file and the column groups each step operates on.
# Kept as tuples; every call below receives its own fresh list built from them.
DATA_FILE = 'sample_diabetes_mellitus_data.csv'
DROP_NAN_COLS = ('gender', 'ethnicity', 'age')
MEAN_FILL_COLS = ('height', 'weight')
DUMMY_COLS = ('ethnicity',)

# a) Load the dataset and keep working on a copy of what the loader returned.
df = load_data(DATA_FILE).copy()

# b) Split into train/test features and targets.
X_train, X_test, y_train, y_test = split_data(df)

# c) Remove rows with missing gender, ethnicity or age.
#    The targets are passed alongside the features and both are returned.
X_train, y_train = nan_destroyer(X_train, y_train, list(DROP_NAN_COLS))
X_test, y_test = nan_destroyer(X_test, y_test, list(DROP_NAN_COLS))

# d) Fill missing height and weight (with the mean, per the step description;
#    the fill_nan implementation is not visible from this notebook).
# NOTE(review): train and test are imputed in two independent calls -- confirm
# whether the test set is meant to use its own statistics.
X_train = fill_nan(X_train, list(MEAN_FILL_COLS))
X_test = fill_nan(X_test, list(MEAN_FILL_COLS))

# e) Dummy-encode ethnicity, separately for each split.
X_train = dummies(X_train, list(DUMMY_COLS))
X_test = dummies(X_test, list(DUMMY_COLS))

# f) Add a binary gender indicator column to each split.
X_train['gender_binary'] = binary(X_train['gender'])
X_test['gender_binary'] = binary(X_test['gender'])

# g) Train a model on the selected training columns.
# NOTE(review): neither 'gender_binary' nor any ethnicity dummy appears in this
# list, so the features built in steps e) and f) are not passed to train() --
# confirm this is intended.
cols = [
    'age',
    'height',
    'weight',
    'aids',
    'cirrhosis',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis',
]
train_features = X_train[cols]
X_train, y_train = train(train_features, y_train)

# h) Predict for both splits; pred() hands back the two frames together with
#    the train and test predictions.
X_test = X_test[cols]
X_train, X_test, predictiontrain, predictiontest = pred(X_train, X_test)

# i) Score the train and test predictions against their targets
#    (ROC AUC, per the step description).
score_train, score_test = score(predictiontrain, y_train, predictiontest, y_test)




In [2]:
# Display the training-set score returned by score() in the first cell.
score_train

0.6673548257281934

In [3]:
# Display the test-set score returned by score() in the first cell.
score_test

0.678587693147211