# Fast and Cheap (BreakoutRoom #3)

This team can utilize any model and any features but are limited to **only using ~35% of the training data**

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline

In [1]:
import pandas as pd
df = pd.read_csv('Diabetes_Data/diabetes_reduced_train.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
585,1,93,56,11,0,22.5,0.417,22,0
717,10,94,72,18,0,23.1,0.595,56,0
175,8,179,72,42,130,32.7,0.719,36,1
86,13,106,72,54,0,36.6,0.178,45,0
119,4,99,76,15,51,23.2,0.223,21,0


In [2]:
# Do your magic!

y = df['Outcome']
X = df.drop(columns=['Outcome'], axis=1)

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,test_size = .20, stratify=y)

In [27]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [28]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(class_weight = 'balanced')
model_log = logreg.fit(X_train_scaled, y_train)
model_log

LogisticRegression(class_weight='balanced')

In [29]:
y_hat_test = logreg.predict(X_test_scaled)
y_hat_train = logreg.predict(X_train_scaled)

In [30]:
# Your code here
import numpy as np
# We could subtract the two columns. If values or equal, difference will be zero. Then count number of zeros 
residuals = np.abs(y_train - y_hat_train)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

0    147
1     31
Name: Outcome, dtype: int64
0    0.825843
1    0.174157
Name: Outcome, dtype: float64


In [31]:
def eval_classification(model, model_name,
                        X_tr, X_te, y_tr, y_te,
                        to_print=False):
    '''
    Finds predictions for train and test sets, then
    prints metrics for classification nicely

    Inputs:
    model : already-fit sklearn model
    model_name : string, name for index for output df
    X_tr : training X (can be scaled, that's fine)
    X_te : testing X
    y_tr : training target
    y_te : testing target
    to_print : boolean, will print output nicely if True

    Outputs:
    metric_df - pandas Dataframe showing output
    '''
    
    metrics = {"Accuracy": accuracy_score,
               "Recall": recall_score,
               "Precision": precision_score,
               "F1-Score": f1_score}

    y_pred_tr = model.predict(X_tr)
    y_pred_te = model.predict(X_te)

    # Defining the column names based on the metric dict keys
    col_list = []  # Starting a list
    for name in metrics.keys():
        col_list.append(f"{name.lower()}_train")
        col_list.append(f"{name.lower()}_test")

    metric_df = pd.DataFrame(columns=col_list)

    for name, metric_function in metrics.items():
        tr_col = f"{name.lower()}_train"
        metric_df.at[model_name, tr_col] = metric_function(y_tr, y_pred_tr)
        te_col = f"{name.lower()}_test"
        metric_df.at[model_name, te_col] = metric_function(y_te, y_pred_te)
        
        # Adding to-print option to print the metrics nicely
        if to_print:
            print(f"{name}:"); print("="*len(name))
            print(f"TRAIN: {metric_function(y_tr, y_pred_tr):.4f}")
            print(f"TEST: {metric_function(y_te, y_pred_te):.4f}")
            print("*" * 15)
    
    return metric_df

In [32]:
logreg_scores = eval_classification(logreg, "logreg",
                                    X_train_scaled, X_test_scaled,
                                    y_train, y_test,
                                    to_print=True)

Accuracy:
TRAIN: 0.8258
TEST: 0.7333
***************
Recall:
TRAIN: 0.8154
TEST: 0.7059
***************
Precision:
TRAIN: 0.7361
TEST: 0.6316
***************
F1-Score:
TRAIN: 0.7737
TEST: 0.6667
***************


---------

## Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
clf = RandomForestClassifier(n_estimators=15,
                             criterion='gini',
                             max_features='auto',
                             oob_score=True)

In [44]:
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9887640449438202
0.7555555555555555


In [42]:
# Then use your model to predict the outcomes of the holdout_df
holdout_df = pd.read_csv('Diabetes_data/holdout_df.csv')
holdout_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
540,8,100,74,40,215,39.4,0.661,43
307,0,137,68,14,148,24.8,0.143,21
745,12,100,84,33,105,30.0,0.488,46
691,13,158,114,0,0,42.3,0.257,44
564,0,91,80,0,0,32.4,0.601,27


In [None]:
# And store those outcomes in the 'Outcome' column of this submission_df 
submission_df = pd.read_csv('Diabetes_Data/submission_df.csv')