In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

**Models to try**
- Logistic regression
- Random forest
- XGBoost
- AdaBoost (adaptive boosting)

In [11]:
# Read the Kaggle training data and carve out a held-out validation set.
df_train = pd.read_csv("../data/kaggle_train.csv")

# 'DoctorInCharge' is excluded from the modelling frame.
df_train_xgb = df_train.drop(columns=['DoctorInCharge'])

# Target is the 'Diagnosis' column; the features are every remaining column.
X = df_train_xgb.drop(columns=['Diagnosis'])
y = df_train_xgb['Diagnosis']

# 80/20 train/validation split with a fixed seed so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [49]:
#models to explore
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report


# Per-model validation metrics, keyed by the short names used in `models`.
mse = {}
accuracy = {}
f1 = {}  # declared for F1 scores; the fitting loop currently leaves it empty

# Candidate classifiers to fit on the unscaled features.
# Every estimator gets the same explicit seed so that a fresh
# Restart & Run All reproduces identical scores (previously only the
# random forest was seeded).
models = {
    # 'lr' : LogisticRegression(max_iter = 200),  # disabled here; logistic regression is explored in its own cell further down
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgbc': XGBClassifier(random_state=42),
    'rfc': RandomForestClassifier(random_state=42),
    'adbc': AdaBoostClassifier(random_state=42),
}

In [53]:
# Fit every candidate on the training split and score it on the validation split.
# Both metrics are computed from the hard class predictions returned by predict().
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_val)
    accuracy[name] = accuracy_score(y_val, y_pred)
    mse[name] = mean_squared_error(y_val, y_pred)



In [54]:
# Show the validation metrics collected for each model on the unscaled features.
print(
    'mean squared errors:\n', mse,
    '\n\n accuracies\n:', accuracy,
)

mean squared errors:
 {'gbc': 0.0377906976744186, 'xgbc': 0.0436046511627907, 'rfc': 0.055232558139534885, 'adbc': 0.07267441860465117} 

 accuracies
: {'gbc': 0.9622093023255814, 'xgbc': 0.9563953488372093, 'rfc': 0.9447674418604651, 'adbc': 0.9273255813953488}


In [63]:
from sklearn.preprocessing import StandardScaler

# Standardise the features without leaking validation statistics.
#
# The split happens first; the scaler then learns its mean/variance from the
# training rows only and the same fitted transform is applied to the
# validation rows. Fitting on all of X (as before) lets validation data
# influence the preprocessing and makes validation scores optimistic.
#
# test_size and random_state match the earlier split, so with X and y
# unchanged this yields the same row partition as X_train / X_val.
# X itself is deliberately left untouched so earlier cells stay re-runnable.
X_tr_raw, X_val_raw, y_scale_tr, y_scale_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_scale_tr = scaler.fit_transform(X_tr_raw)   # fit on training rows only
X_scale_val = scaler.transform(X_val_raw)     # reuse the training statistics



In [64]:
# Per-model validation metrics for the standardised features,
# keyed by the short names used in `models`.
mse_sc = {}
accuracy_sc = {}
f1_sc = {}  # declared for F1 scores; the fitting loop currently leaves it empty

# Fresh, unfitted copies of the candidate classifiers for the scaled run.
# Every estimator gets the same explicit seed so that a fresh
# Restart & Run All reproduces identical scores (previously only the
# random forest was seeded).
models = {
    # 'lr' : LogisticRegression(max_iter = 200),  # disabled here; logistic regression is explored in its own cell further down
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgbc': XGBClassifier(random_state=42),
    'rfc': RandomForestClassifier(random_state=42),
    'adbc': AdaBoostClassifier(random_state=42),
}

In [65]:
# Fit every candidate on the standardised training split and score it on the
# standardised validation split, using the hard class predictions from predict().
for name in models:
    model = models[name]
    y_pred = model.fit(X_scale_tr, y_scale_tr).predict(X_scale_val)
    accuracy_sc[name] = accuracy_score(y_scale_val, y_pred)
    mse_sc[name] = mean_squared_error(y_scale_val, y_pred)



In [66]:
# Show the validation metrics collected for each model on the standardised features.
print(
    'mean squared errors:\n', mse_sc,
    '\n\n accuracies\n:', accuracy_sc,
)


mean squared errors:
 {'gbc': 0.0377906976744186, 'xgbc': 0.0436046511627907, 'rfc': 0.055232558139534885, 'adbc': 0.07267441860465117} 

 accuracies
: {'gbc': 0.9622093023255814, 'xgbc': 0.9563953488372093, 'rfc': 0.9447674418604651, 'adbc': 0.9273255813953488}


**I seem to be having problems with the logistic regression model, so I'll explore it separately below.**

In [70]:
# Logistic regression with default settings, trained and scored on the
# standardised split so it can be compared against the ensemble models.
lr_model = LogisticRegression().fit(X_scale_tr, y_scale_tr)
lr_pred = lr_model.predict(X_scale_val)

# Both metrics are computed from the hard class predictions.
lr_acc_sc = accuracy_score(y_scale_val, lr_pred)
lr_mse_sc = mean_squared_error(y_scale_val, lr_pred)
print('mse: ', lr_mse_sc, 'accuracy: ', lr_acc_sc)

mse:  0.13953488372093023 accuracy:  0.8604651162790697


**It seems the problem was that I had not scaled the data earlier. After scaling, the logistic regression model works well, but it has a lower accuracy (0.86) and a higher mean squared error (0.14) than the ensemble models above. We might end up using this as our baseline model.**