# 1. Import libraries

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, recall_score, fbeta_score
import pandas as pd

from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

# 2. Import data

In [None]:
data = pd.read_csv('data/The_Cancer_data_1500_V2.csv')

data.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


# 3. Preprocess data

## 3.1 Assign features and target

In [3]:
target = 'Diagnosis'
features = data.columns.drop(target).tolist()

X = data.loc[:, features]
y = data.loc[:, target]

## 3.2 Split data into train and test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1200, 8), (300, 8), (1200,), (300,))

## 3.3 Transform features

In [31]:
cat_features= ['Gender', 'Smoking', 'GeneticRisk', 'CancerHistory']
num_features = [col for col in X.columns if col not in cat_features]

In [32]:
from sklearn.preprocessing import StandardScaler

num_transform = StandardScaler()

X_train[num_features] = num_transform.fit_transform(X_train[num_features])
X_test[num_features] = num_transform.transform(X_test[num_features])

In [33]:
X_train.head(2)

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory
1376,-0.287156,0,-0.458439,0,0,-0.037009,-1.159941,0
787,0.619056,1,-0.413965,0,2,-1.138655,-0.082026,0


# 4. Train models

## 4.1 Create a model evaluation function

In the real world application of our model's predictions, false negatives are costlier than false positives.

Hence, our priority metrics will be **Recall** and **F2 score** because these metrics score models with little false negatives in their predictions higher.

This will help us choose a model that will perform best in the practical applications.

In [34]:
def get_metric_scores(y_true, y_pred):
    logloss = log_loss(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f2score = fbeta_score(y_true, y_pred, beta=2.0)
    auc = roc_auc_score(y_true, y_pred)

    return {'logloss': logloss, 'recall': recall, 'f2score': f2score, 'auc': auc}

def evaluate_models(X_train, y_train, X_test, y_test, models:dict):
    model_scores = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        metrics = get_metric_scores(y_test, y_pred)
        model_scores[name] = metrics
    return model_scores

## 4.2 Experiment with different algorithms

In [None]:
scale_pos_weight = sum(y_train == 0)/sum(y_train == 1)

seed = 0
models = {
    'CatBoostClassifier': CatBoostClassifier(verbose=False, random_seed=seed),
    'CatBoostClassifier (scale_pos_weight)': CatBoostClassifier(verbose=False, scale_pos_weight=scale_pos_weight, random_state=seed),
    'RandomForestClassifier': RandomForestClassifier(verbose=False, random_state=seed),
    'RandomForestClassifier (Balanced class_weight)': RandomForestClassifier(verbose=False, class_weight='balanced', random_state=seed),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
}

model_scores = evaluate_models(X_train, y_train, X_test, y_test, models)

In [None]:
model_scores = pd.DataFrame(model_scores).sort_values(by='f2score', axis=1, ascending=False).T
model_scores.head()

Unnamed: 0,logloss,recall,f2score,auc
CatBoostClassifier (scale_pos_weight),1.08131,0.936937,0.945455,0.963177
CatBoostClassifier,1.201455,0.918919,0.932358,0.956814
AdaBoostClassifier,1.441746,0.900901,0.917431,0.947805
RandomForestClassifier,2.042474,0.864865,0.885609,0.927141
RandomForestClassifier (Balanced class_weight),2.162619,0.864865,0.883978,0.924496


The `CatBoostClassifier` model with `scale_pos_weight` provided performs best in our top priority metrics: Recall & F2 score

## 4.3 Train the best model

In [None]:
model = CatBoostClassifier(scale_pos_weight=scale_pos_weight, verbose=300, random_seed=seed)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Learning rate set to 0.011136
0:	learn: 0.6847359	total: 10.4ms	remaining: 10.4s
300:	learn: 0.2260950	total: 1.37s	remaining: 3.19s
600:	learn: 0.1675987	total: 2.73s	remaining: 1.81s
900:	learn: 0.1290977	total: 4.01s	remaining: 441ms
999:	learn: 0.1188084	total: 4.43s	remaining: 0us


In [54]:
metrics = get_metric_scores(y_test, y_pred)
metrics

{'logloss': 1.0813096016735149,
 'recall': 0.9369369369369369,
 'f2score': 0.9454545454545454,
 'auc': 0.9631774631774631}