Model Evaluation
---


In [1]:
# importing libraries
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# import classifier algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [71]:
# read the data into pandas dataframe
train_df = pd.read_csv("../data/train.csv", index_col=0)
train_df.head()

Unnamed: 0,id,menopaus,agegrp,density,race,hispanic,bmi,agefirst,nrelbc,brstproc,lastmamm,surgmeno,hrt,invasive,cancer,count
0,RE-TRAIN-0000001,premenopausal,35-39,almost entirely fat,white,no,10-24.99,age <30,zero,no,,,,no,no,2
1,RE-TRAIN-0000002,premenopausal,35-39,almost entirely fat,white,no,10-24.99,age <30,zero,no,,,,no,no,4
2,RE-TRAIN-0000003,premenopausal,35-39,almost entirely fat,white,no,10-24.99,age <30,one,no,negative,,,no,no,1
3,RE-TRAIN-0000004,premenopausal,35-39,almost entirely fat,white,no,10-24.99,age <30,one,no,,,,no,no,1
4,RE-TRAIN-0000005,premenopausal,35-39,almost entirely fat,white,no,10-24.99,age 30 or greater,zero,no,negative,,,no,no,2


In [72]:
# Output variable - distribution
train_df.cancer.value_counts()

no     189942
yes      6520
Name: cancer, dtype: int64

In [73]:
# Check missing values
train_df.isnull().sum()

id              0
menopaus    11043
agegrp          0
density     49730
race        40451
hispanic    59899
bmi         76505
agefirst    64061
nrelbc      27786
brstproc    25377
lastmamm    65061
surgmeno    84860
hrt         65734
invasive        0
cancer          0
count           0
dtype: int64

In [74]:
# Replace missing values in categorical data with model value of the variable.
for col in train_df.columns[train_df.isnull().any().tolist()]:
    train_df[col].fillna(value=train_df[col].mode()[0], inplace=True)

In [75]:
# Set the `id` as index column
train_df = train_df.set_index('id')

In [76]:
# split the training data into training, validation and testing dataset.
df_full, df_test = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True, stratify=train_df['cancer'])
df_train, df_valid = train_test_split(df_full, test_size=0.25, random_state=42, shuffle=True, stratify=df_full['cancer'])

In [77]:
# data preparation - split the data into X and y.
y_train = (df_train['cancer']=='yes').astype('int').values
y_val = (df_valid['cancer']=='yes').astype('int').values
y_test = (df_test['cancer']=='yes').astype('int').values

x_train = df_train.drop(['cancer'], axis=1)
x_valid = df_valid.drop(['cancer'], axis=1)
x_test = df_test.drop(['cancer'], axis=1)

In [79]:
# Encoding categorical data
# Converting data into dictionary format.
train_dict = x_train.to_dict(orient='records')
valid_dict = x_valid.to_dict(orient='records')
test_dict = x_test.to_dict(orient='records')

# Create object of dictionary vectorizer
dv = DictVectorizer(sparse=False)

# data transformation
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(valid_dict)
X_test = dv.transform(test_dict)

In [82]:
# Base model using ensemble - RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
predictions = rf_clf.predict_proba(X_val)[:, 1]

# computing score
fpr, tpr, threshold = metrics.roc_curve(y_val, predictions)
auc_score = metrics.auc(fpr, tpr)

print("Random Forest Classifier Score:: %.3f" % auc_score)

Random Forest Classifier Score:: 0.930


In [None]:
# Let's find the base model with highest auc score
models = {
    'lr_clf': LogisticRegression(max_iter=1000, random_state=42),
    'cart': DecisionTreeClassifier(random_state=42),
    'et_clf': ExtraTreeClassifier(random_state=42),
    'gb_clf': GradientBoostingClassifier(random_state=42),
    'rf_clf': RandomForestClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
    'svc': SVC(random_state=42)
    }

scores = []
for name, clf in models.items():
    CV = StratifiedKFold(n_splits=4, shuffle=True, random_state=42) 
    score = cross_val_score(clf, X_train, y_train, cv=CV, n_jobs=-1, scoring='roc_auc')
    scores.append((name, np.mean(score)))

    print("%s - roc score::%.3f" %(name, np.mean(score)))

lr_clf - roc score::0.968
cart - roc score::0.920
et_clf - roc score::0.914
gb_clf - roc score::0.968
rf_clf - roc score::0.923


Model optimization
---
Logistic Regression algorithm gives high score, Hence selected for further evaluation.

In [None]:
# to find best C vlaues
c_vals = [0.01, 0.03, 0.5, 1, 3, 5, 10, 15, 20, 25]
for C in c_vals:
    model = LogisticRegression(C=C, class_weight='balanced', multi_class='ovr', max_iter=1000, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_preds = model.predict_proba(X_val)[:, 1]
    
    # model evaluation
    fpr, tpr, thresholds = metrics.roc_curve(y_val, y_preds)
    score = metrics.auc(fpr, tpr)
    print("%f:: %.3f" % (C, score))

In [None]:
# final model
C = 0.5
clf = LogisticRegression(C=0.5, class_weight='balanced', multi_class='ovr', max_iter=1000, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Training data
y_predictions = clf.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_predictions)
score = metrics.auc(fpr, tpr)
print("Training ROC Score:: %.3f" % (score))

# validation data
y_predictions = clf.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_predictions)
score = metrics.auc(fpr, tpr)
print("Testing ROC Score:: %.3f" % (score))