<a href="https://colab.research.google.com/github/mohitbhati01/Melanoma-Tumor-Size-Prediction/blob/main/Melanoma_Tumor_Size_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

In [None]:
# Load the competition data.
# NOTE(review): DATA_DIR is an absolute, machine-specific location; change this
# single constant to run the notebook elsewhere.
DATA_DIR = '/run/media/mohit/Projects/Machinehack-Melanoma Tumor Size Predicition/input'

train = pd.read_csv(f'{DATA_DIR}/Train.csv')
test = pd.read_csv(f'{DATA_DIR}/Test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size
0,6930.9,2919.02,0.42116,51.8298,988829.4,109.487,2758.76,72,39.362,14.103
1,15635.7,4879.36,0.31206,223.55,2058426.0,248.881,5952.53,240,22.0253,2.648
2,10376.2,2613.88,0.25191,127.337,1434676.0,160.093,4635.26,73,29.9963,1.688
3,13093.8,4510.06,0.34444,155.44,1812195.0,173.015,5273.87,32,28.1354,3.796
4,7545.21,2882.36,0.38201,85.1237,1043918.0,124.414,3263.35,57,35.02,18.023


In [None]:
# Remove the 'tumor_size' column from the training frame and keep it as the
# regression target, so `train` holds features only from here on.
target = train.pop('tumor_size')

In [None]:
# Feature engineering

def add_engineered_features(df):
    """Add the derived difference/ratio columns to ``df`` in place.

    The training and test frames must receive exactly the same columns in the
    same order, so the transformation lives in one function instead of being
    written out once per frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns ``mass_npea``, ``malign_ratio``,
        ``damage_size``, ``exposed_area``, ``std_dev_malign``, ``err_malign``,
        ``malign_penalty`` and ``damage_ratio``. It is modified in place.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a cell
        does not render the frame.
    """
    # NOTE(review): the plain ratios below are not guarded against zero
    # denominators -- confirm the data cannot contain zeros there.
    df['penalty-err'] = df['malign_penalty'] - df['err_malign']
    df['d_size-ratio'] = df['damage_size'] - df['damage_ratio']
    df['d_ratio-m_ratio'] = df['damage_ratio'] - df['malign_ratio']
    df['penalty/std'] = df['malign_penalty'] / df['std_dev_malign']
    df['mass/area'] = df['mass_npea'] / df['exposed_area']
    df['area/mass'] = df['exposed_area'] / df['mass_npea']
    # Despite the name, 'err/std' is built from the derived 'penalty-err'
    # column, not from 'err_malign' directly.
    df['err/std'] = df['penalty-err'] / df['std_dev_malign']
    df['dsr/ps'] = df['damage_size'] / df['penalty-err']
    df['std/area'] = df['std_dev_malign'] / df['exposed_area']
    df['err/area'] = df['err_malign'] / df['exposed_area']
    df['dr/area'] = (df['damage_ratio'] * 100) / df['exposed_area']
    # The +1 on numerator and denominator keeps these two ratios finite when
    # 'err_malign' is 0.
    df['std/err'] = (df['std_dev_malign'] + 1) / (df['err_malign'] + 1)
    df['penalty/err'] = (df['malign_penalty'] + 1) / (df['err_malign'] + 1)


add_engineered_features(train)
add_engineered_features(test)

In [None]:
# List the training columns (raw features plus the engineered ones).
train.columns

Index(['mass_npea', 'size_npear', 'malign_ratio', 'damage_size',
       'exposed_area', 'std_dev_malign', 'err_malign', 'malign_penalty',
       'damage_ratio', 'penalty-err', 'd_size-ratio', 'd_ratio-m_ratio',
       'penalty/std', 'mass/area', 'area/mass', 'err/std', 'dsr/ps',
       'std/area', 'err/area', 'dr/area', 'std/err', 'penalty/err'],
      dtype='object')

In [None]:
# List the test columns; they should match the training columns exactly.
test.columns

Index(['mass_npea', 'size_npear', 'malign_ratio', 'damage_size',
       'exposed_area', 'std_dev_malign', 'err_malign', 'malign_penalty',
       'damage_ratio', 'penalty-err', 'd_size-ratio', 'd_ratio-m_ratio',
       'penalty/std', 'mass/area', 'area/mass', 'err/std', 'dsr/ps',
       'std/area', 'err/area', 'dr/area', 'std/err', 'penalty/err'],
      dtype='object')

In [None]:
# Transformation: replace 'exposed_area' by log(1 + exposed_area) in both frames.
# NOTE: this overwrites the column, so running the cell a second time applies
# the logarithm twice. The engineered ratios were computed earlier from the
# untransformed values.
for dataset in (train, test):
    dataset['exposed_area'] = np.log1p(dataset['exposed_area'])

In [None]:
# Evaluation metric

def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# greater_is_better=False marks RMSE as a loss: the scorer reports its negated
# value, which is why the cross-validation scores in this notebook are negative.
rmse = make_scorer(RMSE, greater_is_better=False)

In [None]:
# Hyperparameter tuning: Extra Trees regressor via Bayesian optimization

def etc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Mean 5-fold cross-validation score of an Extra Trees regressor.

    Parameters
    ----------
    n_estimators, min_samples_split, max_features
        Forwarded unchanged to ``ExtraTreesRegressor``.
    data, targets
        Features and target values handed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores produced by the ``rmse`` scorer.
    """
    regressor = ExtraTreesRegressor(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        n_jobs=-1,
        random_state=2,
    )
    fold_scores = cross_val_score(regressor, data, targets, cv=5, scoring=rmse)
    return fold_scores.mean()

In [None]:
def optimize_etc(data, targets):
    """Bayesian search over Extra Trees hyperparameters; prints the best point.

    Parameters
    ----------
    data, targets
        Features and target values passed through to ``etc_cv``.

    The search space is ``n_estimators`` in (100, 400), ``min_samples_split``
    in (2, 25) and ``max_features`` in (0.1, 0.999). Nothing is returned; the
    best result found (``optimizer.max``) is printed.
    """

    def objective(n_estimators, min_samples_split, max_features):
        # The optimizer proposes floats: truncate the integer-valued
        # parameters and keep max_features inside [1e-3, 0.999].
        bounded_max_features = max(min(max_features, 0.999), 1e-3)
        return etc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=bounded_max_features,
            data=data,
            targets=targets,
        )

    search_space = {
        "n_estimators": (100, 400),
        "min_samples_split": (2, 25),
        "max_features": (0.1, 0.999),
    }
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=search_space,
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=10, n_iter=15)

    print("Final result:", optimizer.max)

In [None]:
# Print a yellow banner, then run the Extra Trees search on the training data.
print(Colours.yellow("--- Optimizing Extra Trees ---"))
optimize_etc(train, target)

[93m--- Optimizing Extra Trees ---[0m
|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-3.953   [0m | [0m 0.2722  [0m | [0m 16.31   [0m | [0m 231.3   [0m |
| [95m 2       [0m | [95m-3.906   [0m | [95m 0.806   [0m | [95m 19.94   [0m | [95m 181.8   [0m |
| [0m 3       [0m | [0m-3.97    [0m | [0m 0.3485  [0m | [0m 20.44   [0m | [0m 387.4   [0m |
| [95m 4       [0m | [95m-3.767   [0m | [95m 0.8875  [0m | [95m 10.23   [0m | [95m 250.3   [0m |
| [0m 5       [0m | [0m-3.893   [0m | [0m 0.7144  [0m | [0m 18.39   [0m | [0m 211.1   [0m |
| [0m 6       [0m | [0m-3.824   [0m | [0m 0.6045  [0m | [0m 13.57   [0m | [0m 104.1   [0m |
| [0m 7       [0m | [0m-3.944   [0m | [0m 0.7948  [0m | [0m 22.3    [0m | [0m 209.5   [0m |
| [95m 8       [0m | [95m-3.668   [0m | [95m 0.6532  [0m | [95m 3.734   [0m | [95m 210.6   [0m |
| [0

In [None]:
# Extra Trees model used in the ensemble. The values are presumably
# transcribed by hand from the search result -- re-check them if the search
# is re-run.
etc = ExtraTreesRegressor(
    n_estimators=366,        # int(366.84350004540295)
    min_samples_split=2,     # int(2.0)
    max_features=0.999,
    random_state=2,
    n_jobs=-1,
)

In [None]:
# Hyperparameter tuning: Random Forest regressor via Bayesian optimization

def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Mean 5-fold cross-validation score of a Random Forest regressor.

    Parameters
    ----------
    n_estimators, min_samples_split, max_features
        Forwarded unchanged to ``RandomForestRegressor``.
    data, targets
        Features and target values handed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores produced by the ``rmse`` scorer.
    """
    regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        n_jobs=-1,
        random_state=2,
    )
    fold_scores = cross_val_score(regressor, data, targets, cv=5, scoring=rmse)
    return fold_scores.mean()

In [None]:
def optimize_rfc(data, targets):
    """Bayesian search over Random Forest hyperparameters; prints the best point.

    Parameters
    ----------
    data, targets
        Features and target values passed through to ``rfc_cv``.

    The search space is ``n_estimators`` in (100, 450), ``min_samples_split``
    in (2, 25) and ``max_features`` in (0.1, 0.999). Nothing is returned; the
    best result found (``optimizer.max``) is printed.
    """

    def objective(n_estimators, min_samples_split, max_features):
        # The optimizer proposes floats: truncate the integer-valued
        # parameters and keep max_features inside [1e-3, 0.999].
        bounded_max_features = max(min(max_features, 0.999), 1e-3)
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=bounded_max_features,
            data=data,
            targets=targets,
        )

    search_space = {
        "n_estimators": (100, 450),
        "min_samples_split": (2, 25),
        "max_features": (0.1, 0.999),
    }
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=search_space,
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=10, n_iter=15)

    print("Final result:", optimizer.max)

In [None]:
# Print a green banner, then run the Random Forest search on the training data.
print(Colours.green("--- Optimizing Random Forest ---"))
optimize_rfc(train, target)

[92m--- Optimizing Random Forest ---[0m
|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-3.945   [0m | [0m 0.2722  [0m | [0m 16.31   [0m | [0m 253.2   [0m |
| [0m 2       [0m | [0m-3.996   [0m | [0m 0.806   [0m | [0m 19.94   [0m | [0m 195.4   [0m |
| [0m 3       [0m | [0m-3.975   [0m | [0m 0.3485  [0m | [0m 20.44   [0m | [0m 435.3   [0m |
| [95m 4       [0m | [95m-3.917   [0m | [95m 0.8875  [0m | [95m 10.23   [0m | [95m 275.3   [0m |
| [0m 5       [0m | [0m-3.979   [0m | [0m 0.7144  [0m | [0m 18.39   [0m | [0m 229.6   [0m |
| [0m 6       [0m | [0m-3.929   [0m | [0m 0.6045  [0m | [0m 13.57   [0m | [0m 104.8   [0m |
| [0m 7       [0m | [0m-4.01    [0m | [0m 0.7948  [0m | [0m 22.3    [0m | [0m 227.7   [0m |
| [95m 8       [0m | [95m-3.845   [0m | [95m 0.6532  [0m | [95m 3.734   [0m | [95m 229.1   [0m |
| [0m 9

In [None]:
# Random Forest model used in the ensemble. The values are presumably
# transcribed by hand from the search result -- re-check them if the search
# is re-run.
rfc = RandomForestRegressor(
    n_estimators=309,                    # int(309.9622556931372)
    min_samples_split=2,                 # int(2.1527588902394075)
    max_features=0.18433272597322473,
    random_state=2,
    n_jobs=-1,
)

In [None]:
# Hyperparameter tuning: LightGBM regressor via Bayesian optimization

def lgb_cv(n_estimators, num_leaves, min_child_samples, subsample, data, targets):
    """Mean 5-fold cross-validation score of a LightGBM regressor.

    Parameters
    ----------
    n_estimators, num_leaves, min_child_samples, subsample
        Forwarded unchanged to ``LGBMRegressor``.
    data, targets
        Features and target values handed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores produced by the ``rmse`` scorer.
    """
    regressor = LGBMRegressor(
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        # NOTE(review): subsample_freq is left at its default; LightGBM
        # presumably ignores `subsample` unless bagging frequency is > 0 --
        # confirm whether tuning this parameter has any effect.
        subsample=subsample,
        random_state=2,
    )
    fold_scores = cross_val_score(regressor, data, targets, cv=5, scoring=rmse)
    return fold_scores.mean()

In [None]:
def optimize_lgb(data, targets):
    """Bayesian search over LightGBM hyperparameters; prints the best point.

    Parameters
    ----------
    data, targets
        Features and target values passed through to ``lgb_cv``.

    The search space is ``n_estimators`` in (200, 500), ``num_leaves`` in
    (30, 80), ``min_child_samples`` in (5, 30) and ``subsample`` in (0.6, 1.0).
    Nothing is returned; the best result found (``optimizer.max``) is printed.
    """

    def objective(n_estimators, num_leaves, min_child_samples, subsample):
        # The optimizer proposes floats; the count-like parameters are
        # truncated to integers before building the model.
        return lgb_cv(
            n_estimators=int(n_estimators),
            num_leaves=int(num_leaves),
            min_child_samples=int(min_child_samples),
            subsample=subsample,
            data=data,
            targets=targets,
        )

    search_space = {
        "n_estimators": (200, 500),
        "num_leaves": (30, 80),
        "min_child_samples": (5, 30),
        "subsample": (0.6, 1.0),
    }
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=search_space,
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=20, n_iter=25)

    print("Final result:", optimizer.max)

In [None]:
# Print a blue banner, then run the LightGBM search on the training data.
print(Colours.blue("--- Optimizing Light GBM ---"))
optimize_lgb(train, target)

[94m--- Optimizing Light GBM ---[0m
|   iter    |  target   | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.855   [0m | [0m 9.788   [0m | [0m 386.6   [0m | [0m 51.89   [0m | [0m 0.9141  [0m |
| [0m 2       [0m | [0m-3.89    [0m | [0m 24.5    [0m | [0m 281.8   [0m | [0m 43.82   [0m | [0m 0.9207  [0m |
| [0m 3       [0m | [0m-3.857   [0m | [0m 28.95   [0m | [0m 462.8   [0m | [0m 47.89   [0m | [0m 0.8004  [0m |
| [95m 4       [0m | [95m-3.848   [0m | [95m 22.09   [0m | [95m 413.8   [0m | [95m 48.51   [0m | [95m 0.8245  [0m |
| [0m 5       [0m | [0m-3.854   [0m | [0m 17.58   [0m | [0m 204.1   [0m | [0m 68.64   [0m | [0m 0.9531  [0m |
| [0m 6       [0m | [0m-3.876   [0m | [0m 14.12   [0m | [0m 384.6   [0m | [0m 33.77   [0m | [0m 0.7475  [0m |
| [0m 7       [0m | [0m-3.879   [0m | [0m 28.33   [0m | [0m 395.4   

In [None]:
# LightGBM model used in the ensemble. The values are presumably transcribed
# by hand from the search result -- re-check them if the search is re-run.
lgb = LGBMRegressor(
    n_estimators=478,          # int(478.3739357476376)
    num_leaves=79,             # int(79.29868320714608)
    min_child_samples=13,      # int(13.577585500929393)
    subsample=1.0,
    random_state=2,
)

In [None]:
# Hyperparameter tuning: XGBoost regressor via Bayesian optimization

def xgb_cv(n_estimators, max_depth, gamma, min_child_weight, subsample, data, targets):
    """Mean 5-fold cross-validation score of an XGBoost regressor.

    Parameters
    ----------
    n_estimators, max_depth, gamma, min_child_weight, subsample
        Forwarded unchanged to ``XGBRegressor``.
    data, targets
        Features and target values handed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores produced by the ``rmse`` scorer.
    """
    regressor = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        gamma=gamma,
        min_child_weight=min_child_weight,
        subsample=subsample,
        random_state=2,
    )
    fold_scores = cross_val_score(regressor, data, targets, cv=5, scoring=rmse)
    return fold_scores.mean()

In [None]:
def optimize_xgb(data, targets):
    """Bayesian search over XGBoost hyperparameters; prints the best point.

    Parameters
    ----------
    data, targets
        Features and target values passed through to ``xgb_cv``.

    The search space is ``n_estimators`` in (200, 500), ``max_depth`` in
    (6, 15), ``gamma`` in (0, 10), ``min_child_weight`` in (0, 10) and
    ``subsample`` in (0.8, 1.0). Nothing is returned; the best result found
    (``optimizer.max``) is printed.
    """

    def objective(n_estimators, max_depth, gamma, min_child_weight, subsample):
        # The optimizer proposes floats; only the tree count and depth are
        # truncated to integers, the other parameters are passed as floats.
        return xgb_cv(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    search_space = {
        "n_estimators": (200, 500),
        "max_depth": (6, 15),
        "gamma": (0, 10),
        "min_child_weight": (0, 10),
        "subsample": (0.8, 1.0),
    }
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=search_space,
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=20, n_iter=25)

    print("Final result:", optimizer.max)

In [None]:
# Print a red banner, then run the XGBoost search on the training data.
print(Colours.red("--- Optimizing XGBoost ---"))
optimize_xgb(train, target)

[91m--- Optimizing XGBoost ---[0m
|   iter    |  target   |   gamma   | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-4.058   [0m | [0m 1.915   [0m | [0m 11.6    [0m | [0m 4.377   [0m | [0m 435.6   [0m | [0m 0.956   [0m |
| [95m 2       [0m | [95m-4.044   [0m | [95m 2.726   [0m | [95m 8.488   [0m | [95m 8.019   [0m | [95m 487.4   [0m | [95m 0.9752  [0m |
| [0m 3       [0m | [0m-4.085   [0m | [0m 3.578   [0m | [0m 10.51   [0m | [0m 6.835   [0m | [0m 413.8   [0m | [0m 0.8741  [0m |
| [0m 4       [0m | [0m-4.074   [0m | [0m 5.612   [0m | [0m 10.53   [0m | [0m 0.1377  [0m | [0m 431.8   [0m | [0m 0.9765  [0m |
| [0m 5       [0m | [0m-4.077   [0m | [0m 3.649   [0m | [0m 11.54   [0m | [0m 0.7538  [0m | [0m 310.6   [0m | [0m 0.9866  [0m |
| [0m 6       [0m | [0m-4.095   [0m | [0m 6.514   [0m | [0m 9.575   

In [None]:
# XGBoost model used in the ensemble. The values are presumably transcribed
# by hand from the search result -- re-check them if the search is re-run.
xgb = XGBRegressor(
    n_estimators=370,        # int(370.207778935361)
    max_depth=15,            # int(15.0)
    gamma=0.0,
    min_child_weight=10.0,
    subsample=1.0,
    random_state=2,
)

In [None]:
# Hyperparameter tuning: CatBoost regressor via Bayesian optimization

def cb_cv(n_estimators, depth, data, targets):
    """Mean 5-fold cross-validation score of a CatBoost regressor.

    Parameters
    ----------
    n_estimators, depth
        Forwarded unchanged to ``CatBoostRegressor``. No learning rate is
        passed, so the library's own choice applies.
    data, targets
        Features and target values handed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores produced by the ``rmse`` scorer.
    """
    regressor = CatBoostRegressor(
        n_estimators=n_estimators,
        depth=depth,
        verbose=0,
        random_state=2,
    )
    fold_scores = cross_val_score(regressor, data, targets, cv=5, scoring=rmse)
    return fold_scores.mean()

In [None]:
def optimize_cb(data, targets):
    """Bayesian search over CatBoost hyperparameters; prints the best point.

    Parameters
    ----------
    data, targets
        Features and target values passed through to ``cb_cv``.

    The search space is ``n_estimators`` in (200, 600) and ``depth`` in
    (4, 16); the learning rate is not searched. Nothing is returned; the best
    result found (``optimizer.max``) is printed.
    """

    def objective(n_estimators, depth):
        # The optimizer proposes floats; both parameters are truncated to
        # integers before building the model.
        return cb_cv(
            n_estimators=int(n_estimators),
            depth=int(depth),
            data=data,
            targets=targets,
        )

    search_space = {
        "n_estimators": (200, 600),
        "depth": (4, 16),
    }
    optimizer = BayesianOptimization(
        f=objective,
        pbounds=search_space,
        random_state=1234,
        verbose=2,
    )
    optimizer.maximize(init_points=20, n_iter=25)

    print("Final result:", optimizer.max)

In [None]:
# Print a bold banner, then run the CatBoost search on the training data.
print(Colours.bold("--- Optimizing Catboost ---"))
optimize_cb(train, target)

In [None]:
# CatBoost model with fixed hyperparameters (presumably taken from the search
# -- the search output is not recorded in this notebook).
cb = CatBoostRegressor(
    n_estimators=309,
    depth=13,
    random_state=2,
    verbose=0,
)

In [None]:
#Hyperparameter tuning of XGBoost RandomForest Regressor using Bayesian optimization

def xgb_cv(n_estimators, max_depth, gamma, min_child_weight, subsample, data, targets):
    estimator = XGBRFRegressor(
        n_estimators=n_estimators,
        max_depth = max_depth,
        gamma = gamma,
        min_child_weight=min_child_weight,
        subsample = subsample,
        random_state = 2,
    )
    cval = cross_val_score(estimator, data, targets,
                           scoring=rmse, cv=5)
    return cval.mean()

In [None]:
def optimize_xgb(data, targets):
    def xgb_crossval(n_estimators, max_depth, gamma, min_child_weight, subsample):
        return xgb_cv(
            n_estimators=int(n_estimators),
            max_depth = int(max_depth),
            gamma = gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=xgb_crossval,
        pbounds={
            "n_estimators": (200, 500),
            "max_depth": (6,15),
            "gamma": (0,10),
            "min_child_weight": (0,10),
            "subsample": (0.8,1.0)
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=25, init_points=20)

    print("Final result:", optimizer.max)

In [None]:
print(Colours.red("--- Optimizing XGBoost RandomForest ---"))
optimize_xgb(train, target)

In [None]:
# XGBoost random-forest model used in the ensemble. The values are presumably
# transcribed by hand from the search result -- re-check them if the search
# is re-run.
xgbrf = XGBRFRegressor(
    n_estimators=463,        # int(463.8)
    max_depth=15,            # int(15.0)
    gamma=10.0,
    min_child_weight=0.0,
    subsample=0.8,
    random_state=2,
)

In [None]:
# Stack the tuned base learners. The CatBoost model `cb` is not part of the
# ensemble -- presumably because the selection kept only the least correlated
# models; that analysis is not shown in this notebook.
estimators = [
    ('etc', etc),
    ('rfc', rfc),
    ('xgb', xgb),
    ('lgb', lgb),
    ('xgbrf', xgbrf),
]

# Neither final_estimator nor cv is given, so StackingRegressor's defaults apply.
model = StackingRegressor(estimators=estimators)

In [None]:
# 5-fold cross-validation of the stacked model, scored with the `rmse` scorer.
scores = cross_val_score(model, train, target, cv = 5, scoring = rmse)

# Mean score across the folds (negative, because the scorer reports negated RMSE).
scores.mean()

-3.5971750064096297

In [None]:
# Fit the stacked model on the full training data.

model.fit(train, target)

StackingRegressor(cv=None,
                  estimators=[('etc',
                               ExtraTreesRegressor(bootstrap=False,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features=0.999,
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=366, n_jobs=

In [None]:
# Predict the target for the test frame with the fitted stacked model.

preds = model.predict(test)

In [None]:
# Load the sample submission file.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that directory exists.
sub = pd.read_csv('/run/media/mohit/Projects/Machinehack-Melanoma Tumor Size Predicition/input/sample_submission.csv')

In [None]:
# Store the predictions in the 'tumor_size' column of the submission frame.
# assumes the sample submission rows are in the same order as Test.csv -- TODO confirm
sub['tumor_size'] = preds

In [None]:
# Write the submission to 'ensemble_2.csv' (relative to the current working
# directory), without the index column.

sub.to_csv('ensemble_2.csv', index = False)