# Import libraries

In [1]:
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
import random
import time
import sys
import os

# Data preprocessing

In [2]:
# Make the project's CSV folder the working directory so the relative read
# below resolves against it.
# Raw string: '\I', '\M', '\S' and '\C' are not valid escape sequences, so the
# plain-string form triggers an invalid-escape warning on recent Python versions.
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
os.chdir(r'D:\ISU\MIS 536\SEER Project\CSV data')

In [3]:
# Load the dataset from the current working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='vfinal.csv')

In [61]:
# Summary statistics (count, mean, spread, quartiles) of the GRADE column.
data.GRADE.describe()

count    742356.000000
mean          2.106941
std           0.607522
min           1.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: GRADE, dtype: float64

In [6]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (presumably a row index written by an earlier to_csv — confirm).
# errors='ignore' keeps the cell re-runnable: once the column is gone, a
# second execution would otherwise raise KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [51]:
# Remove every row whose GRADE code equals 5, selecting the rows by index label.
data = data.drop(index=data.loc[data['GRADE'].eq(5)].index)


In [62]:
# Number of rows per GRADE code (class balance of the target).
data.GRADE.value_counts()

2    505413
3    135397
1     86367
4     15179
Name: GRADE, dtype: int64

In [63]:
# Shift the GRADE codes 1..4 down to zero-based labels 0..3.
# NOTE: not idempotent — running this cell a second time shifts the codes again
# (1 -> 0, 2 -> 1, 3 -> 2), merging classes. Run it exactly once per load.
data['GRADE'] = data['GRADE'].replace({grade: grade - 1 for grade in range(1, 5)})

In [64]:
# Re-check the rows per GRADE code after the recode in the previous cell.
data.GRADE.value_counts()

1    505413
2    135397
0     86367
3     15179
Name: GRADE, dtype: int64

## List of columns

In [7]:
# List every column currently present in `data`.
data.columns

Index(['DX_CONF', 'REPT_SRC', 'RECNOREC', 'SITERWHO', 'BEHTREND', 'IHSLINK',
       'REG_Connecticut', 'REG_Greater California', 'REG_Greater Georgia',
       'REG_Hawaii', 'REG_Idaho', 'REG_Iowa', 'REG_Kentucky',
       'REG_Los Angeles', 'REG_Louisiana', 'REG_Massachusetts',
       'REG_Metropolitan Atlanta', 'REG_Metropolitan Detroit',
       'REG_New Jersey', 'REG_New Mexico', 'REG_New York', 'REG_Other',
       'REG_San Francisco-Oakland SMSA', 'REG_San Jose-Monterey',
       'REG_Seattle (Puget Sound)', 'REG_Utah', 'SEX_Female', 'SEX_Male',
       'PRIMSITE_C180', 'PRIMSITE_C181', 'PRIMSITE_C182', 'PRIMSITE_C183',
       'PRIMSITE_C184', 'PRIMSITE_C185', 'PRIMSITE_C186', 'PRIMSITE_C187',
       'PRIMSITE_C189', 'PRIMSITE_C199', 'PRIMSITE_C209', 'PRIMSITE_Other',
       'LATERAL_Not a paired site', 'LATERAL_Other', 'LATERAL_Paired site',
       'LATERAL_Right: origin of primary', 'HISTO3V_14573', 'HISTO3V_19802',
       'HISTO3V_20705', 'HISTO3V_26537', 'HISTO3V_27180', 'HISTO3V_5

In [65]:
# Feature matrix: every column of `data` except the GRADE target.
X = data.drop(columns='GRADE')

In [66]:
# Target vector: the GRADE column.
y = data.GRADE

## Test-train split

In [67]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Predictive analysis

## LGBM

**Setting up folds for cross-validation**

In [68]:
# 5-fold splitter; rows are shuffled before splitting, with a fixed seed so the
# fold assignment is the same on every run.
folds = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

**Model parameters**

In [72]:
# LightGBM settings shared by every fold's model.
params = dict(
    # task definition
    objective="multiclass",
    num_class=4,
    boosting="gbdt",
    metric="multi_error",
    boost_from_average="false",
    # execution
    num_threads=8,
    # learning rate and tree shape
    learning_rate=0.001,
    num_leaves=100,
    max_depth=-1,
    tree_learner="serial",
    # feature / row subsampling
    feature_fraction=0.85,
    bagging_freq=1,
    bagging_fraction=0.85,
    # regularisation and logging
    min_data_in_leaf=1000,
    verbosity=-1,
)

In [73]:
%%time
y_pred_lgb = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in tqdm(enumerate(folds.split(X))):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
    model = lgb.LGBMRegressor(**params, n_estimators = 300000, n_jobs = -1)
    model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='multi_error',
                    verbose=1000, early_stopping_rounds=20)
            
    y_pred_valid = model.predict(X_valid)





0it [00:00, ?it/s]

Fold 0 started at Thu Oct 31 13:06:18 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	training's multi_error: 0.293153	valid_1's multi_error: 0.291489






1it [00:06,  6.79s/it]

Fold 1 started at Thu Oct 31 13:06:25 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	training's multi_error: 0.292737	valid_1's multi_error: 0.293202






2it [00:13,  6.85s/it]

Fold 2 started at Thu Oct 31 13:06:32 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[19]	training's multi_error: 0.292897	valid_1's multi_error: 0.293202






3it [00:24,  8.08s/it]

Fold 3 started at Thu Oct 31 13:06:43 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	training's multi_error: 0.29238	valid_1's multi_error: 0.294051






4it [00:31,  7.73s/it]

Fold 4 started at Thu Oct 31 13:06:50 2019
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	training's multi_error: 0.292811	valid_1's multi_error: 0.292306






5it [00:38,  7.71s/it]


Wall time: 38.5 s


In [None]:
# NOTE(review): unexecuted scratch cell. These two lines are bare tuples of
# column-name strings; nothing is assigned, so running the cell has no effect
# beyond displaying the second tuple.
# Presumably the one-hot levels meant to be dropped as reference categories —
# confirm the intent, then either assign the list to a name and use it, or delete the cell.
'REG_Other','SEX_Female','PRIMSITE_Other','LATERAL_Other','HISTO3V_Other','BEHO3V_Malignant - Invasive','ICCC3WHO_Other',
'ICCC3XWHO_Other','HISTREC_Other','RAC_RECA_Other','AYASITERWHO_Other','INTPRIM_Excluded','SCSSM2KO_Other'