# Import libraries

In [109]:
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
import random
import time
import sys
import os

# Data preprocessing

In [2]:
# Work from the folder that holds the CSV files. The path is a raw string so
# the Windows backslashes are never interpreted as escape sequences.
os.chdir(r'D:\ISU\MIS 536\SEER Project\CSV data')

In [3]:
# Read the prepared dataset from the current working directory.
data = pd.read_csv(filepath_or_buffer='vfinal.csv')

In [61]:
# Summary statistics of the GRADE column.
data.GRADE.describe()

count    742356.000000
mean          2.106941
std           0.607522
min           1.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: GRADE, dtype: float64

In [6]:
# Discard the 'Unnamed: 0' column.
data = data.drop(columns=['Unnamed: 0'])

In [51]:
# Remove every row whose GRADE is coded 5.
grade_5_rows = data.index[data['GRADE'] == 5]
data = data.drop(index=grade_5_rows)


In [62]:
# Number of rows per GRADE value.
data.GRADE.value_counts()

2    505413
3    135397
1     86367
4     15179
Name: GRADE, dtype: int64

In [63]:
# Shift the grade codes 1-4 down to 0-3.
# NOTE(review): this cell is not idempotent - running it again on already
# recoded data maps 1->0, 2->1 and 3->2 a second time.
grade_recode = {1: 0, 2: 1, 3: 2, 4: 3}
data['GRADE'] = data['GRADE'].replace(grade_recode)

In [120]:
# Number of rows per GRADE value after recoding.
data.GRADE.value_counts()

1    505413
2    135397
0     86367
3     15179
Name: GRADE, dtype: int64

## List of columns

In [7]:
# Display the column labels of the frame.
data.columns

Index(['DX_CONF', 'REPT_SRC', 'RECNOREC', 'SITERWHO', 'BEHTREND', 'IHSLINK',
       'REG_Connecticut', 'REG_Greater California', 'REG_Greater Georgia',
       'REG_Hawaii', 'REG_Idaho', 'REG_Iowa', 'REG_Kentucky',
       'REG_Los Angeles', 'REG_Louisiana', 'REG_Massachusetts',
       'REG_Metropolitan Atlanta', 'REG_Metropolitan Detroit',
       'REG_New Jersey', 'REG_New Mexico', 'REG_New York', 'REG_Other',
       'REG_San Francisco-Oakland SMSA', 'REG_San Jose-Monterey',
       'REG_Seattle (Puget Sound)', 'REG_Utah', 'SEX_Female', 'SEX_Male',
       'PRIMSITE_C180', 'PRIMSITE_C181', 'PRIMSITE_C182', 'PRIMSITE_C183',
       'PRIMSITE_C184', 'PRIMSITE_C185', 'PRIMSITE_C186', 'PRIMSITE_C187',
       'PRIMSITE_C189', 'PRIMSITE_C199', 'PRIMSITE_C209', 'PRIMSITE_Other',
       'LATERAL_Not a paired site', 'LATERAL_Other', 'LATERAL_Paired site',
       'LATERAL_Right: origin of primary', 'HISTO3V_14573', 'HISTO3V_19802',
       'HISTO3V_20705', 'HISTO3V_26537', 'HISTO3V_27180', 'HISTO3V_5

In [65]:
# Feature matrix: every column except GRADE.
X = data.drop(columns=['GRADE'])

In [66]:
# Target vector: the GRADE column.
y = data.GRADE

## Train-test split

In [67]:
# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Predictive analysis

## LGBM

**Setting up folds for cross-validation**

In [88]:
# Five shuffled folds with a fixed seed.
folds = KFold(5, random_state=42, shuffle=True)

**Model parameters**

In [89]:
# LightGBM settings for the four-class GRADE model.
params = dict(
    objective="multiclass",
    num_class=4,
    boosting="gbdt",
    metric="multi_error",
    boost_from_average="false",
    num_threads=8,
    learning_rate=0.0001,
    num_leaves=30,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=0.85,
    bagging_freq=1,
    bagging_fraction=0.85,
    min_data_in_leaf=50,
    verbosity=-1,
)

In [90]:
%%time
y_pred_lgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X))):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
    model = lgb.LGBMRegressor(**params, n_estimators = 5000, n_jobs = -1)
    model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='multi_error',
                    verbose=1000, early_stopping_rounds=20)
            
    y_pred_valid = model.predict(X_valid)





0it [00:00, ?it/s]

Fold 0 started at Thu Oct 31 14:05:05 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[23]	training's multi_error: 0.291591	valid_1's multi_error: 0.2899






1it [00:09,  9.28s/it]

Fold 1 started at Thu Oct 31 14:05:14 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[11]	training's multi_error: 0.29142	valid_1's multi_error: 0.291794






2it [00:16,  8.65s/it]

Fold 2 started at Thu Oct 31 14:05:22 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[13]	training's multi_error: 0.291524	valid_1's multi_error: 0.291835






3it [00:24,  8.36s/it]

Fold 3 started at Thu Oct 31 14:05:29 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[3]	training's multi_error: 0.291306	valid_1's multi_error: 0.292926






4it [00:30,  7.66s/it]

Fold 4 started at Thu Oct 31 14:05:35 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[4]	training's multi_error: 0.291386	valid_1's multi_error: 0.290845






5it [00:36,  7.26s/it]


Wall time: 36.3 s


# Binary Classification

## Data Prep

In [87]:
# Binary target: 1 where the recoded GRADE is 2 or higher, 0 otherwise.
# A vectorised comparison replaces the Python-level loop over every row;
# int64 matches the dtype the list of Python ints produced.
data['TARGET'] = (data['GRADE'] >= 2).astype('int64')

In [91]:
# Display the column labels of the frame.
data.columns

Index(['DX_CONF', 'REPT_SRC', 'RECNOREC', 'SITERWHO', 'BEHTREND', 'IHSLINK',
       'REG_Connecticut', 'REG_Greater California', 'REG_Greater Georgia',
       'REG_Hawaii', 'REG_Idaho', 'REG_Iowa', 'REG_Kentucky',
       'REG_Los Angeles', 'REG_Louisiana', 'REG_Massachusetts',
       'REG_Metropolitan Atlanta', 'REG_Metropolitan Detroit',
       'REG_New Jersey', 'REG_New Mexico', 'REG_New York', 'REG_Other',
       'REG_San Francisco-Oakland SMSA', 'REG_San Jose-Monterey',
       'REG_Seattle (Puget Sound)', 'REG_Utah', 'SEX_Female', 'SEX_Male',
       'PRIMSITE_C180', 'PRIMSITE_C181', 'PRIMSITE_C182', 'PRIMSITE_C183',
       'PRIMSITE_C184', 'PRIMSITE_C185', 'PRIMSITE_C186', 'PRIMSITE_C187',
       'PRIMSITE_C189', 'PRIMSITE_C199', 'PRIMSITE_C209', 'PRIMSITE_Other',
       'LATERAL_Not a paired site', 'LATERAL_Other', 'LATERAL_Paired site',
       'LATERAL_Right: origin of primary', 'HISTO3V_14573', 'HISTO3V_19802',
       'HISTO3V_20705', 'HISTO3V_26537', 'HISTO3V_27180', 'HISTO3V_5

In [93]:
# Feature matrix: every column except the two label columns.
X = data.drop(columns=['GRADE', 'TARGET'])

In [95]:
# Target vector: the binary TARGET column.
y = data.TARGET

In [96]:
# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

## LGBM

In [97]:
# Five shuffled folds with a fixed seed.
folds = KFold(5, random_state=42, shuffle=True)

In [103]:
# LightGBM settings for the binary TARGET model.
params = dict(
    objective="binary",
    num_class=1,
    boosting="gbdt",
    metric="binary_error",
    boost_from_average="false",
    num_threads=8,
    learning_rate=0.0001,
    num_leaves=50,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=0.85,
    bagging_freq=1,
    bagging_fraction=0.85,
    min_data_in_leaf=10,
    verbosity=-1,
)

In [104]:
%%time
y_pred_lgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X))):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
    model = lgb.LGBMRegressor(**params, n_estimators = 300000, n_jobs = -1)
    model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='binary_error',
                    verbose=1000, early_stopping_rounds=30)
            
    y_pred_valid = model.predict(X_valid)





0it [00:00, ?it/s]

Fold 0 started at Thu Oct 31 14:11:10 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[49]	training's binary_error: 0.185257	valid_1's binary_error: 0.184486






1it [00:05,  6.00s/it]

Fold 1 started at Thu Oct 31 14:11:16 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[26]	training's binary_error: 0.184792	valid_1's binary_error: 0.186488






2it [00:11,  5.72s/it]

Fold 2 started at Thu Oct 31 14:11:21 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[31]	training's binary_error: 0.185349	valid_1's binary_error: 0.18413






3it [00:16,  5.69s/it]

Fold 3 started at Thu Oct 31 14:11:27 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[28]	training's binary_error: 0.185058	valid_1's binary_error: 0.185161






4it [00:21,  5.51s/it]

Fold 4 started at Thu Oct 31 14:11:32 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[2]	training's binary_error: 0.18516	valid_1's binary_error: 0.18512






5it [00:25,  5.14s/it]


Wall time: 25.7 s


# Undersampling

In [122]:
# One frame per recoded GRADE value (0 to 3).
data_0, data_1, data_2, data_3 = (
    data[data['GRADE'] == grade] for grade in range(4)
)

In [124]:
# Summary statistics for the rows with GRADE == 2.
data_2.describe()

Unnamed: 0,DX_CONF,REPT_SRC,RECNOREC,SITERWHO,BEHTREND,IHSLINK,REG_Connecticut,REG_Greater California,REG_Greater Georgia,REG_Hawaii,...,SCSSM2KO_Other,AGE_DX,YR_BIRTH,SEQ_NUM,MDXRECMP,YEAR_DX,GRADE,AGE_IREC,FIRSTPRM,TARGET
count,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,...,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0,135397.0
mean,0.997777,1.200824,662706.72376,21045.873579,2.989675,0.019838,0.040518,0.151185,0.040968,0.010842,...,0.269917,68.679963,44.199295,0.542324,6.414751,2007.386877,2.0,99822.987732,0.199879,1.0
std,0.047097,1.099561,253095.3834,3.96717,0.101087,0.192182,0.197171,0.35823,0.198218,0.10356,...,0.443918,14.344692,15.348001,0.918624,3.411978,4.824953,0.0,27889.969212,0.399911,0.0
min,0.0,1.0,4.0,21041.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,0.0,1.0,2000.0,2.0,7.0,0.0,1.0
25%,1.0,1.0,779121.0,21043.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,59.0,32.0,0.0,3.0,2003.0,2.0,89181.0,0.0,1.0
50%,1.0,1.0,779121.0,21045.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,71.0,42.0,0.0,6.0,2007.0,2.0,112513.0,0.0,1.0
75%,1.0,1.0,779121.0,21049.0,3.0,0.0,0.0,0.0,0.0,0.0,...,1.0,80.0,55.0,1.0,9.0,2011.0,2.0,118019.0,0.0,1.0
max,1.0,8.0,779121.0,21052.0,3.0,2.0,1.0,1.0,1.0,1.0,...,1.0,107.0,115.0,5.0,12.0,2016.0,2.0,121717.0,1.0,1.0


In [125]:
# Draw 10000 rows from each grade frame, without replacement and with the same
# fixed seed (123), so every grade contributes the same number of rows.
data_0_downsampled, data_1_downsampled, data_2_downsampled, data_3_downsampled = (
    resample(grade_frame, replace=False, n_samples=10000, random_state=123)
    for grade_frame in (data_0, data_1, data_2, data_3)
)


In [126]:
# Stack the four downsampled grade frames into one dataset.
downsampled_frames = [
    data_0_downsampled,
    data_1_downsampled,
    data_2_downsampled,
    data_3_downsampled,
]
sampled_data = pd.concat(downsampled_frames)

In [136]:
# Save the downsampled dataset to the current working directory.
sampled_data.to_csv(path_or_buf="Sampled_binary_class.csv")

In [127]:
# Peek at the first five rows of the downsampled dataset.
sampled_data.head(5)

Unnamed: 0,DX_CONF,REPT_SRC,RECNOREC,SITERWHO,BEHTREND,IHSLINK,REG_Connecticut,REG_Greater California,REG_Greater Georgia,REG_Hawaii,...,SCSSM2KO_Other,AGE_DX,YR_BIRTH,SEQ_NUM,MDXRECMP,YEAR_DX,GRADE,AGE_IREC,FIRSTPRM,TARGET
768845,1,1,779121,21052,3,0,0,0,0,0,...,0,93.0,20,0,1,2008,0,112513,0,0
730518,1,1,779121,21042,3,0,0,0,0,0,...,0,54.0,56,0,5,2004,0,80595,0,0
862432,1,1,134276,21041,3,0,0,0,0,0,...,1,81.0,26,2,5,2001,0,109130,1,0
432190,1,1,779121,21041,3,0,0,1,0,0,...,0,61.0,59,0,10,2015,0,101326,0,0
439550,1,1,779121,21048,3,0,0,1,0,0,...,0,27.0,95,0,10,2016,0,7,0,0


In [128]:
# Feature matrix of the downsampled data: every column except the two labels.
X_sampled = sampled_data.drop(columns=['GRADE', 'TARGET'])

In [129]:
# Target vector of the downsampled data: the binary TARGET column.
y_sampled = sampled_data.TARGET

In [130]:
# Hold out 30% of the downsampled rows; the fixed seed keeps the split repeatable.
X_sampled_train, X_sampled_test, y_sampled_train, y_sampled_test = train_test_split(
    X_sampled,
    y_sampled,
    random_state=42,
    test_size=0.3,
)

## LGBM

In [131]:
# Five shuffled folds with a fixed seed.
folds = KFold(5, random_state=42, shuffle=True)

In [134]:
# LightGBM settings for the binary TARGET model on the downsampled data.
params = dict(
    objective="binary",
    num_class=1,
    boosting="gbdt",
    metric="binary_error",
    boost_from_average="false",
    num_threads=8,
    learning_rate=0.0001,
    num_leaves=51,
    max_depth=-1,
    tree_learner="serial",
    feature_fraction=0.85,
    bagging_freq=1,
    bagging_fraction=0.85,
    min_data_in_leaf=10,
    verbosity=-1,
)

In [135]:
%%time
y_pred_lgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X_sampled))):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
    model = lgb.LGBMRegressor(**params, n_estimators = 2000, n_jobs = -1)
    model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='binary_error',
                    verbose=1000, early_stopping_rounds=30)
            
    y_pred_valid = model.predict(X_valid)





0it [00:00, ?it/s]

Fold 0 started at Thu Oct 31 15:49:31 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[33]	training's binary_error: 0.191844	valid_1's binary_error: 0.193






1it [00:01,  1.24s/it]

Fold 1 started at Thu Oct 31 15:49:32 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[14]	training's binary_error: 0.192656	valid_1's binary_error: 0.19075






2it [00:02,  1.18s/it]

Fold 2 started at Thu Oct 31 15:49:34 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[27]	training's binary_error: 0.191469	valid_1's binary_error: 0.193875






3it [00:03,  1.20s/it]

Fold 3 started at Thu Oct 31 15:49:35 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[10]	training's binary_error: 0.191375	valid_1's binary_error: 0.195875






4it [00:04,  1.14s/it]

Fold 4 started at Thu Oct 31 15:49:36 2019
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[21]	training's binary_error: 0.191875	valid_1's binary_error: 0.1925






5it [00:05,  1.14s/it]


Wall time: 5.73 s
