In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mutual_info_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Reading the data

In [2]:
data  = pd.read_csv('Datasets/Breast_Cancer.csv')

# EDA & Feature Engineering

In [3]:
#@ analysis of numerical variables
data.describe()

Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months
count,4024.0,4024.0,4024.0,4024.0,4024.0
mean,53.972167,30.473658,14.357107,4.158052,71.297962
std,8.963134,21.119696,8.099675,5.109331,22.92143
min,30.0,1.0,1.0,1.0,1.0
25%,47.0,16.0,9.0,1.0,56.0
50%,54.0,25.0,14.0,2.0,73.0
75%,61.0,38.0,19.0,5.0,90.0
max,69.0,140.0,61.0,46.0,107.0


In [4]:
#@ analysis of categorical variables
data.describe(include='object')

Unnamed: 0,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Status
count,4024,4024,4024,4024,4024,4024,4024,4024,4024,4024,4024
unique,3,5,4,3,5,4,4,2,2,2,2
top,White,Married,T2,N1,IIA,Moderately differentiated,2,Regional,Positive,Positive,Alive
freq,3413,2643,1786,2732,1305,2351,2351,3932,3755,3326,3408


In [5]:
#@ checking for missing values
data.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

In [6]:
#@ target variable distribution
target_class = data['Status']
target_class.value_counts(normalize=True)

Status
Alive    0.846918
Dead     0.153082
Name: proportion, dtype: float64

#@ data appears to be imbalanced

In [7]:
#@ data cleaning

df = data.copy()
df = df.rename(columns={'Reginol Node Positive':'regional_node_positive'})
df.columns = df.columns.str.lower().str.strip(' ').str.replace(' ', '_')

categorical = list(df.dtypes[df.dtypes=='object'].index)
numerical = list(df.dtypes[df.dtypes!='object'].index)

for col in categorical:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    
df['status'] = (df['status']=='dead').astype(int)

categorical.remove('status')

In [8]:
#@ splitting the data 60:20:20

y = df['status'].values
df_full, df_test, y_full, y_test = train_test_split(df, y, test_size=0.2, stratify=y, random_state=1)
df_train, df_val, y_train, y_val = train_test_split(df_full, y_full, test_size=0.25, stratify=y_full, random_state=1)

df_val = df_val.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_full = df_full.reset_index(drop=True)

del df_train['status']
del df_val['status']

for i in [df_train, df_val]:
    assert 'status' not in list(i.columns)

In [9]:
size_mapping = {
    't1': 'Small',
    't2': 'Medium',
    't3': 'Medium',
    't4': 'Large'
}
    
age_group = {
    (0, 30): 'young',
    (30, 59): 'middle_aged',
    (59, 150): 'senior'
}

In [10]:
#@ Feature Engineering
def map_age_to_group(age):
    for age_range, group in age_group.items():
        if age_range[0] <= age <= age_range[1]:
            return group
    return 'unknown'

def feature_engineer(series):
# classification of patient ages
    series['age_group'] = series['age'].apply(map_age_to_group)
    
# classification of cancer sizes
    series['size_classification'] = series['t_stage'].map(size_mapping)
    
# lymph node positivity rate: extent or severity of lymph node involvement in breast cancer.
    series['lymph_node_positivity_%'] = (series['regional_node_positive'] / series['regional_node_examined'])*100
    
    return series

In [11]:
full_df = feature_engineer(df_full)

categorical = list(df_full.dtypes[df_full.dtypes=='object'].index)
numerical = list(df_full.dtypes[df_full.dtypes!='object'].index)

In [12]:
#@ correaltion of numerical variables with the target variable
full_df[numerical].corrwith(df_full['status'])

age                        0.042878
tumor_size                 0.147107
regional_node_examined     0.033861
regional_node_positive     0.256966
survival_months           -0.479545
status                     1.000000
lymph_node_positivity_%    0.244568
dtype: float64

In [13]:
#@ mutual information score between the categorical variables and target variable
def mis(series):
    return mutual_info_score(series, full_df['status'])

In [14]:
full_df[categorical].apply(mis)

race                   0.002943
marital_status         0.002331
t_stage                0.012222
n_stage                0.031615
6th_stage              0.033148
differentiate          0.012242
grade                  0.012242
a_stage                0.003268
estrogen_status        0.013337
progesterone_status    0.012716
age_group              0.002058
size_classification    0.011642
dtype: float64

In [15]:
#@ Baseline Features
base_features = sorted([
    'regional_node_examined',
    'regional_node_positive',
    'survival_months',
    'lymph_node_positivity_%',
    'race',
    'marital_status',
    't_stage',
    'n_stage',
    '6th_stage',
    'differentiate',
    'grade',
    'a_stage',
    'estrogen_status',
    'progesterone_status',
    'age_group',
    'size_classification'
])

In [16]:
def prepare_df(df):
    new_df = df.copy()
    new_df = feature_engineer(new_df)
    new_df = new_df[base_features]
    
    return new_df

In [17]:
train_df = prepare_df(df_train)
val_df = prepare_df(df_val)
test_df = prepare_df(df_test)
full_df = prepare_df(df_full)

In [18]:
#@ function to train model
def train(X, y, model):
    dv = DictVectorizer(sparse=False)
    
    X_dicts = X.to_dict(orient='records')
    X = dv.fit_transform(X_dicts)
   
    model.fit(X, y)
    
    return (model, dv)


#@ function to return precision and recall
def precision_recall(actual, predicted):
    conf_matrix = confusion_matrix(actual, predicted)
    tp = conf_matrix[0, 0]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]
    tn = conf_matrix[1, 1]

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    return (precision, recall)

#! function making predictions
def predict(df, model, dv):
    x_dict = df.to_dict(orient='records')
    X = dv.transform(x_dict)
    
    y_pred = model.best_estimator_.predict(X)
    
    return y_pred

# Model Selection

In [19]:
rf = RandomForestClassifier(class_weight='balanced')
clf = XGBClassifier()

In [21]:
#@ randomforest parameters
rf_params = {
    'n_estimators': np.arange(150, 200, 20),
    'max_depth': np.arange(5, 8, 2),
    'min_samples_leaf': np.arange(1,6,1)
    }
#@ xgbclassifier parameters
xgb_params = {
    'n_estimators': np.arange(160, 220, 20),
    'max_depth': np.arange(3, 10, 2),
    'learning_rate': np.arange(0.1,1.0, 0.1)
    }
#@randomized search
rf_search = GridSearchCV(rf, rf_params, scoring='f1', verbose=1, cv=5, n_jobs=-1)
xgb_search = GridSearchCV(clf, xgb_params, scoring='f1', verbose=1, cv=5, n_jobs=-1)

searches = {
    'Random Forest': rf_search,
    'XGBClassifer': xgb_search
}

In [31]:
#@ custom function to compare models
def model_comparison(df, y, val, val_y):
    for search_name, search in searches.items():
        model, dv = train(df, y, search)
        y_pred = predict(val, model, dv)
        precisonRecall = precision_recall(val_y, y_pred)
        estimators[search_name] = model.best_estimator_
        scores.append((model.best_estimator_, precisonRecall[0], precisonRecall[1]))
        
    df_scores = pd.DataFrame(scores, columns=['Model','Precision', 'Recall'])
    df_scores.index = [x for x,y in searches.items()]
    return df_scores

## Model Comparison

In [32]:
scores = []
estimators = {}

#@ comparing the scores of xgbclassifier against randomforest on df_train
scores = model_comparison(train_df, y_train, val_df, y_val)
scores

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits


Unnamed: 0,Model,Precision,Recall
Random Forest,"(DecisionTreeClassifier(max_depth=7, max_featu...",0.884164,0.933437
XGBClassifer,"XGBClassifier(base_score=None, booster=None, c...",0.972141,0.910714


### Best Estimator by recall

In [33]:
best_model = scores.sort_values(by='Recall', ascending=False).iloc[0].values[0]

In [34]:
best_model

## Testing the model on full train

In [35]:
model, dv = train(full_df, y_full, best_model)

# Exporting the model

In [36]:
import pickle

In [37]:
with open('model.bin', 'wb') as f:
    pickle.dump((model, dv), f)