# 1. Import libraries

In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score,log_loss, confusion_matrix, recall_score, fbeta_score
import pandas as pd

from catboost import Pool, CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

# 2. Import data

In [2]:
data = pd.read_csv('data/The_Cancer_data_1500_V2.csv')

data.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


# 3. Preprocess data

## 3.1 Assign features and target

In [3]:
target = 'Diagnosis'
features = data.columns.drop(target).tolist()

X = data.loc[:, features]
y = data.loc[:, target]

## 3.2 Transform feature columns

In [4]:
cat_features= ['Gender', 'Smoking', 'GeneticRisk', 'CancerHistory']
num_features = [col for col in X if col not in cat_features]

In [5]:
from sklearn.preprocessing import StandardScaler

num_transform = StandardScaler()

X[num_features] = num_transform.fit_transform(X[num_features])

In [6]:
X.iloc[:2]

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory
0,0.435495,1,-1.581162,0,1,1.133713,1.219465,1
1,1.172662,0,0.458722,0,1,1.557899,0.776474,0


## 3.3 Split data into train and test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1200, 8), (300, 8), (1200,), (300,))

# 4. Train models

## 4.1 Create a model evaluation function

In the real world application of our model's predictions, false negatives are costlier than false positives.

Hence, our priority metrics will be **Recall** and **F2 score** because these metrics score models with little false negatives in their predictions higher.

This will help us choose a model that will perform best in the practical applications.

In [8]:
def evaluate_model(y_true, y_pred):
    logloss = log_loss(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f2score = fbeta_score(y_true, y_pred, beta=2.0)
    auc = roc_auc_score(y_true, y_pred)
    confusion_mtx = confusion_matrix(y_true, y_pred)

    return {'logloss': logloss, 'accuracy': accuracy, 'recall': recall, 'f2score': f2score, 'auc': auc, 'confusion_matrix': confusion_mtx}

## 4.2 Experiment with different algorithms

In [27]:
scale_pos_weight = sum(data.Diagnosis == 0)/sum(data.Diagnosis == 1)

seed = 0
models = {
    'CatBoostClassifier': CatBoostClassifier(verbose=False, random_seed=seed),
    'CatBoostClassifier (scale_pos_weight)': CatBoostClassifier(verbose=False, scale_pos_weight=scale_pos_weight, random_state=seed),
    'RandomForestClassifier': RandomForestClassifier(verbose=False, random_state=seed),
    'RandomForestClassifier (Balanced class_weight)': RandomForestClassifier(verbose=False, class_weight='balanced', random_state=seed),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=seed),
}
model_scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = evaluate_model(y_test, y_pred)
    model_scores[name] = metrics

In [28]:
scores = pd.DataFrame(model_scores).sort_values(by='f2score', axis=1, ascending=False).T
scores.head()

Unnamed: 0,logloss,accuracy,recall,f2score,auc,confusion_matrix
CatBoostClassifier (scale_pos_weight),2.042474,0.943333,0.905172,0.913043,0.936282,"[[178, 6], [11, 105]]"
CatBoostClassifier,2.042474,0.943333,0.896552,0.907504,0.934689,"[[179, 5], [12, 104]]"
AdaBoostClassifier,1.922328,0.946667,0.87931,0.897887,0.93422,"[[182, 2], [14, 102]]"
RandomForestClassifier,2.643201,0.926667,0.87069,0.882867,0.916323,"[[177, 7], [15, 101]]"
RandomForestClassifier (Balanced class_weight),2.643201,0.926667,0.87069,0.882867,0.916323,"[[177, 7], [15, 101]]"


The `CatBoostClassifier` model with `scale_pos_weight` provided performs best in our metrics of top priority: Recall & F2 score

## 4.3 Train the best model

In [36]:
model = CatBoostClassifier(scale_pos_weight=scale_pos_weight, eval_metric='PRAUC', early_stopping_rounds=50, verbose=100)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
metrics = evaluate_model(y_test, y_pred)

Learning rate set to 0.011136
0:	learn: 0.9140855	total: 8.38ms	remaining: 8.37s
100:	learn: 0.9740146	total: 514ms	remaining: 4.58s
200:	learn: 0.9778118	total: 1.03s	remaining: 4.08s
300:	learn: 0.9812538	total: 1.54s	remaining: 3.58s
400:	learn: 0.9839230	total: 1.99s	remaining: 2.97s
500:	learn: 0.9866026	total: 2.44s	remaining: 2.43s
600:	learn: 0.9891529	total: 2.88s	remaining: 1.91s
700:	learn: 0.9921700	total: 3.32s	remaining: 1.42s
800:	learn: 0.9944053	total: 3.77s	remaining: 937ms
900:	learn: 0.9957552	total: 4.23s	remaining: 465ms
999:	learn: 0.9966948	total: 4.67s	remaining: 0us


In [37]:
metrics


{'logloss': 2.042473692049972,
 'accuracy': 0.9433333333333334,
 'recall': 0.9051724137931034,
 'f2score': 0.9130434782608695,
 'auc': 0.9362818590704647,
 'confusion_matrix': array([[178,   6],
        [ 11, 105]])}