In [1]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [2]:
# Column dtypes shared by train.csv and test.csv (metadata/feature columns).
test_dtype = {
    "image_name": "object",
    "patient_id": "object",
    "sex": "category",
    "age_approx": "float32",
    "anatom_site_general_challenge": "category",
}

# train.csv carries the same columns plus the label-related ones.
train_dtype = {
    **test_dtype,
    "diagnosis": "object",
    "benign_malignant": "object",
    "target": "int64",
}

In [3]:
# Load the training metadata using the explicit dtypes from `train_dtype`.
train_df = pd.read_csv("train.csv", dtype=train_dtype)

# Turn missing categorical values into an explicit "None" level so they get
# their own non-negative integer code instead of the -1 that `.cat.codes`
# assigns to NaN.
for col in ("anatom_site_general_challenge", "sex"):
    train_df[col] = train_df[col].cat.add_categories("None").fillna("None")

# Impute missing ages with the column mean. The result is assigned back to the
# frame: `train_df[col].fillna(..., inplace=True)` operates on a column
# selection, which is deprecated chained assignment in pandas 2.x and leaves
# `train_df` unchanged once Copy-on-Write is active.
train_df["age_approx"] = train_df["age_approx"].fillna(train_df["age_approx"].mean())

In [4]:
# Load the test metadata using the explicit dtypes from `test_dtype`.
test_df = pd.read_csv("test.csv", dtype = test_dtype)

# Mirror the training preprocessing for BOTH categorical features, not only the
# anatomical site. Without the explicit "None" level a missing `sex` would be
# encoded as -1 by `.cat.codes`, and the later `-= min()` shift would then move
# every test code by one relative to the training encoding.
for col in ("anatom_site_general_challenge", "sex"):
    test_df[col] = test_df[col].cat.add_categories("None").fillna("None")

# Impute missing ages with the training-set mean, i.e. the same value that was
# used to fill the training rows (filling with the mean leaves the mean unchanged).
test_df["age_approx"] = test_df["age_approx"].fillna(train_df["age_approx"].mean())

In [5]:
# Replace every categorical training column by its integer category code,
# shifted so that the smallest code in the column is 0.
train_categorical_cols = [name for name, kind in train_dtype.items() if kind == "category"]
for col in train_categorical_cols:
    codes = train_df[col].cat.codes.astype("int16")
    train_df[col] = codes - codes.min()

In [6]:
# Replace every categorical test column by its integer category code,
# shifted so that the smallest code in the column is 0.
# NOTE(review): the codes come from the test file's own category set; they only
# line up with the training codes if both files yield the same categories —
# confirm against the data.
test_categorical_cols = [name for name, kind in test_dtype.items() if kind == "category"]
for col in test_categorical_cols:
    codes = test_df[col].cat.codes.astype("int16")
    test_df[col] = codes - codes.min()

In [7]:
# Remove columns that are not used as model inputs. `image_name` stays in
# train_df (it is needed for the out-of-fold table) but is dropped from X_test.
train_df = train_df.drop(columns=['patient_id', 'diagnosis', 'benign_malignant'])
X_test = test_df.drop(columns=['image_name', 'patient_id'])

In [8]:
# Preview the first five training rows after encoding and column removal.
train_df.head(n=5)

Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target
0,ISIC_2637011,1,45.0,0,0
1,ISIC_0015719,0,45.0,5,0
2,ISIC_0052212,0,50.0,1,0
3,ISIC_0068279,0,45.0,0,0
4,ISIC_0074268,0,55.0,5,0


In [9]:
# Preview the first five rows of the test feature matrix.
X_test.head(n=5)

Unnamed: 0,sex,age_approx,anatom_site_general_challenge
0,1,70.0,6
1,1,40.0,1
2,0,55.0,4
3,0,50.0,4
4,0,45.0,1


In [10]:
# Stratified 5-fold split on `target`; the fixed seed keeps the assignment reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tag every row with the number of the fold in which it is held out.
train_df['fold'] = 0
fold_splits = kfold.split(X=train_df.index, y=train_df['target'])
for fold_number, (_, val_positions) in enumerate(fold_splits):
    # `split` yields positional indices; map them to index labels for `.loc`.
    held_out_labels = train_df.index[val_positions]
    train_df.loc[held_out_labels, 'fold'] = fold_number

In [11]:
# Preview the training frame again, now including the `fold` column.
train_df.head(n=5)

Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,fold
0,ISIC_2637011,1,45.0,0,0,4
1,ISIC_0015719,0,45.0,5,0,1
2,ISIC_0052212,0,50.0,1,0,2
3,ISIC_0068279,0,45.0,0,0,1
4,ISIC_0074268,0,55.0,5,0,2


In [12]:
# Empty out-of-fold table: one row per training image will be appended per fold.
oof_columns = ['image_name', 'target', 'pred', 'fold']
oof_pred = pd.DataFrame(columns=oof_columns)

# Test-prediction table: image names plus one (initially empty) column per fold.
sub_columns = ['image_name'] + ['fold_' + str(i) for i in range(5)]
sub_pred = pd.DataFrame(columns=sub_columns)
sub_pred['image_name'] = test_df['image_name']

In [116]:
# Alternative model (disabled): GradientBoostingClassifier with learning_rate=0.1, max_depth=3, n_estimators=100
#gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)

In [13]:
# Cross-validated training: for every fold, fit XGBoost on the other folds,
# store out-of-fold predictions for the held-out rows and predict the test set.

# Model arguments are identical for every fold, so they are defined once.
xgb_params = dict(
    base_score=0.5, booster=None, colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=1, gamma=0.4, gpu_id=-1,
    importance_type='gain', interaction_constraints=None,
    learning_rate=0.005, max_delta_step=0, max_depth=3,
    min_child_weight=5, monotone_constraints=None,
    n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=0,
    reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, subsample=0.6,
    tree_method=None, validate_parameters=False, verbosity=None,
)

# Columns of train_df that are identifiers/labels rather than model inputs.
non_feature_cols = ['image_name', 'target', 'fold']

fold_lst = [0,1,2,3,4]

# Per-fold out-of-fold frames are collected here and concatenated ONCE after the
# loop. Appending to `oof_pred` with pd.concat inside the loop made the cell
# non-idempotent (a re-run appended duplicate rows) and, because it started from
# an empty frame, turned every column into object dtype.
oof_folds = []

for fold in fold_lst:
    # Prep data: rows tagged with `fold` are held out, all others are used for fitting.
    train = train_df[train_df['fold'] != fold]
    val = train_df[train_df['fold'] == fold]

    y_train = train['target']
    X_train = train.drop(non_feature_cols, axis=1)
    y_val = val['target']
    X_val = val.drop(non_feature_cols, axis=1)

    # Fit a fresh model for this fold.
    xgb = XGBClassifier(**xgb_params)
    #gbc = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
    #gbc.fit(X_train, y_train)
    xgb.fit(X_train, y_train)

    # Predict Val
    y_pred = xgb.predict_proba(X_val)
    print(y_pred)

    # Out-of-fold record for the held-out rows; column 1 of predict_proba is
    # kept as the prediction. `.copy()` avoids writing into a slice of train_df.
    oof_fold = val[['image_name', 'target']].copy()
    oof_fold['pred'] = y_pred[:, 1]
    oof_fold['fold'] = fold
    oof_folds.append(oof_fold)

    # Predict Test: one column of test predictions per fold model.
    y_test = xgb.predict_proba(X_test)
    fold_st = 'fold_' + str(fold)
    sub_pred[fold_st] = y_test[:, 1]

# Rebuild the complete out-of-fold table (columns: image_name, target, pred, fold).
oof_pred = pd.concat(oof_folds, ignore_index=True)

[[0.9487172  0.0512828 ]
 [0.94943726 0.05056271]
 [0.93936694 0.06063307]
 ...
 [0.9492284  0.05077162]
 [0.93799955 0.06200045]
 [0.9487172  0.0512828 ]]
[[0.94965464 0.05034537]
 [0.94952255 0.05047745]
 [0.94816816 0.05183183]
 ...
 [0.93362796 0.06637204]
 [0.9479811  0.05201891]
 [0.94740564 0.05259437]]
[[0.94910485 0.05089514]
 [0.9439744  0.05602562]
 [0.94910485 0.05089514]
 ...
 [0.9426265  0.05737349]
 [0.9491283  0.05087165]
 [0.94886994 0.05113009]]
[[0.94965386 0.05034612]
 [0.9033835  0.0966165 ]
 [0.9457958  0.05420426]
 ...
 [0.9466382  0.05336178]
 [0.94917524 0.05082478]
 [0.94793653 0.05206345]]
[[0.9471588  0.05284118]
 [0.95034367 0.04965631]
 [0.95031846 0.04968154]
 ...
 [0.9400133  0.05998671]
 [0.9466275  0.05337251]
 [0.94792336 0.05207663]]


In [14]:
# Class balance of the out-of-fold table (number of rows per target value).
oof_pred['target'].value_counts()

0    32542
1      584
Name: target, dtype: int64

In [15]:
# Overall out-of-fold ROC AUC across all folds.
oof_pred['target'] = oof_pred['target'].astype(int)
oof_labels = oof_pred['target'].to_numpy()   # ground-truth labels
oof_scores = oof_pred['pred'].to_numpy()     # predicted scores
auc = roc_auc_score(oof_labels, oof_scores)
print(auc)

0.6656754642893524


In [16]:
# Preview the per-fold test predictions.
sub_pred.head(n=5)

Unnamed: 0,image_name,fold_0,fold_1,fold_2,fold_3,fold_4
0,ISIC_0052060,0.069517,0.068881,0.07184,0.065249,0.07248
1,ISIC_0052349,0.051225,0.051765,0.051106,0.050886,0.049682
2,ISIC_0058510,0.056628,0.054369,0.053443,0.054204,0.053654
3,ISIC_0073313,0.050969,0.050582,0.050766,0.052808,0.05205
4,ISIC_0073502,0.050962,0.050305,0.050872,0.051617,0.05147


In [17]:
# Final test prediction = arithmetic mean of the five per-fold predictions.
# The columns are added left to right, in the order fold_0 .. fold_4.
fold_cols = ['fold_' + str(i) for i in range(5)]
fold_total = sub_pred[fold_cols[0]]
for fold_col in fold_cols[1:]:
    fold_total = fold_total + sub_pred[fold_col]
sub_pred['target'] = fold_total / 5

In [18]:
# Preview the per-fold predictions together with their average (`target`).
sub_pred.head(n=5)

Unnamed: 0,image_name,fold_0,fold_1,fold_2,fold_3,fold_4,target
0,ISIC_0052060,0.069517,0.068881,0.07184,0.065249,0.07248,0.069593
1,ISIC_0052349,0.051225,0.051765,0.051106,0.050886,0.049682,0.050933
2,ISIC_0058510,0.056628,0.054369,0.053443,0.054204,0.053654,0.05446
3,ISIC_0073313,0.050969,0.050582,0.050766,0.052808,0.05205,0.051435
4,ISIC_0073502,0.050962,0.050305,0.050872,0.051617,0.05147,0.051045


In [19]:
# Submission frame: image name and averaged prediction only, ordered by image name.
submission_cols = ['image_name', 'target']
sub_xgb = sub_pred[submission_cols].sort_values(by='image_name')
sub_xgb.head(n=5)

Unnamed: 0,image_name,target
0,ISIC_0052060,0.069593
1,ISIC_0052349,0.050933
2,ISIC_0058510,0.05446
3,ISIC_0073313,0.051435
4,ISIC_0073502,0.051045


In [20]:
# Write the submission file without the DataFrame index column.
sub_xgb.to_csv(path_or_buf="sub_xgb_fold.csv", index=False)

In [21]:
# Persist the out-of-fold predictions without the DataFrame index column.
oof_pred.to_csv(path_or_buf="oof_xgb.csv", index=False)