### Importing Libraries

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import gc

from sklearn.exceptions import NotFittedError

from itertools import chain
%matplotlib inline
plt.style.use('bmh')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Importing data

In [4]:
# Load the raw dataset from its local CSV export and preview the first rows.
# NOTE(review): the absolute Windows path ties this cell to a single machine.
data = pd.read_csv(
    r'C:/Users/nisha/Downloads/Heart_Disease_Prediction_Data/TAMU_FINAL_DATASET_2018.csv',
    sep=",",
)
data.head()

Unnamed: 0,ID,AGE,SEX_CD,AMI_FLAG,ESRD_IND,HOSPICE_IND,ORIG_REAS_ENTITLE_CD,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,PCP_ASSIGNMENT,...,COL,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG
0,1,77,F,0,N,N,0.0,0.424,0.402,MEMBER SELECTED,...,0,0,0,0,0,0,0,0,0,0
1,2,49,F,0,N,N,1.0,2.879,1.159,ATTRIBUTED,...,0,0,0,0,0,0,1,0,0,1
2,3,75,F,0,N,N,0.0,0.638,0.568,MEMBER SELECTED,...,1,0,0,0,0,1,1,1,0,1
3,4,68,M,0,N,N,0.0,0.584,0.886,MEMBER SELECTED,...,1,1,0,0,1,1,1,1,1,1
4,5,81,F,0,N,N,1.0,1.242,1.212,MEMBER SELECTED,...,0,0,0,0,0,0,0,0,0,0


#### Checking data shape

In [5]:
# (number of rows, number of columns) of the raw frame
data.shape

(100000, 448)

### Data Pre-processing

#### Separate String, Numeric and Binary Columns

In [6]:
# Separate string columns
# These 13 columns are the ones this notebook treats as string-valued
# categoricals. The list is extended into `cols_string` right below.
cols_string1 = [
    'SEX_CD', 'ESRD_IND', 'HOSPICE_IND', 'ORIG_REAS_ENTITLE_CD',
    'PCP_ASSIGNMENT', 'Diab_Type', 'DUAL', 'Dwelling_Type',
    'Education_level', 'INSTITUTIONAL', 'LIS', 'MCO_HLVL_PLAN_CD',
    'MCO_PROD_TYPE_CD',
]

# The list of numeric columns is intentionally NOT built here: it is derived
# in the "Subset numeric, binary and string columns" cell, once the complete
# categorical (`cols_string`) and binary (`cols_zo`) lists exist. Computing it
# at this point would still include every integer-coded categorical and binary
# column, and the value was overwritten before being used anyway.

# Build the full list of categorical columns: the string columns from
# `cols_string1` plus the numeric columns that this notebook treats as
# categorical (the CON_VISIT_*, POT_VISIT_* and RX_THER_*_YR2016 columns).

cols_string = cols_string1 + ['CON_VISIT_04_Q01', 'CON_VISIT_04_Q02', 'CON_VISIT_04_Q03',
       'CON_VISIT_04_Q04', 'CON_VISIT_21_Q01', 'CON_VISIT_21_Q02',
       'CON_VISIT_21_Q03', 'CON_VISIT_03_Q02', 'CON_VISIT_03_Q04',
       'CON_VISIT_05_Q02', 'CON_VISIT_05_Q04', 'CON_VISIT_09_Q02',
       'CON_VISIT_10_Q02', 'CON_VISIT_18_Q02', 'CON_VISIT_19_Q04',
       'CON_VISIT_23_Q02', 'CON_VISIT_24_Q02', 'CON_VISIT_30_Q01',
       'CON_VISIT_30_Q02', 'CON_VISIT_30_Q04', 'CON_VISIT_32_Q02',
       'CON_VISIT_33_Q04', 'CON_VISIT_06_Q01', 'CON_VISIT_06_Q02',
       'CON_VISIT_06_Q03', 'CON_VISIT_10_Q01', 'CON_VISIT_19_Q03',
       'CON_VISIT_33_Q02', 'CON_VISIT_08_Q02', 'CON_VISIT_10_Q03',
       'CON_VISIT_10_Q04', 'CON_VISIT_24_Q04', 'CON_VISIT_08_Q01',
       'CON_VISIT_08_Q03', 'CON_VISIT_08_Q04', 'CON_VISIT_11_Q04',
       'CON_VISIT_21_Q04', 'CON_VISIT_25_Q01', 'CON_VISIT_27_Q02',
       'CON_VISIT_27_Q03', 'CON_VISIT_31_Q02', 'CON_VISIT_31_Q04',
       'CON_VISIT_32_Q01', 'CON_VISIT_09_Q03', 'CON_VISIT_31_Q01',
       'CON_VISIT_33_Q01', 'CON_VISIT_24_Q03', 'CON_VISIT_06_Q04',
       'CON_VISIT_33_Q03', 'CON_VISIT_23_Q01', 'CON_VISIT_25_Q04',
       'CON_VISIT_27_Q01', 'CON_VISIT_32_Q04', 'CON_VISIT_24_Q01',
       'CON_VISIT_07_Q04', 'CON_VISIT_19_Q01', 'CON_VISIT_01_Q01',
       'CON_VISIT_01_Q02', 'CON_VISIT_01_Q03', 'CON_VISIT_01_Q04',
       'CON_VISIT_02_Q01', 'CON_VISIT_03_Q01', 'CON_VISIT_11_Q01',
       'CON_VISIT_11_Q03', 'CON_VISIT_17_Q03', 'CON_VISIT_18_Q01',
       'CON_VISIT_26_Q02', 'CON_VISIT_05_Q01', 'CON_VISIT_09_Q01',
       'CON_VISIT_15_Q04', 'CON_VISIT_25_Q03', 'CON_VISIT_32_Q03',
       'CON_VISIT_17_Q04', 'CON_VISIT_31_Q03', 'CON_VISIT_02_Q04',
       'CON_VISIT_17_Q01', 'CON_VISIT_17_Q02', 'CON_VISIT_18_Q04',
       'CON_VISIT_20_Q04', 'CON_VISIT_26_Q01', 'CON_VISIT_26_Q03',
       'CON_VISIT_27_Q04', 'CON_VISIT_30_Q03', 'CON_VISIT_23_Q04',
       'CON_VISIT_19_Q02', 'CON_VISIT_07_Q03', 'CON_VISIT_09_Q04',
       'CON_VISIT_23_Q03', 'CON_VISIT_15_Q01', 'CON_VISIT_15_Q02',
       'CON_VISIT_15_Q03', 'CON_VISIT_26_Q04', 'CON_VISIT_02_Q02',
       'CON_VISIT_03_Q03', 'CON_VISIT_18_Q03', 'CON_VISIT_05_Q03',
       'CON_VISIT_22_Q01', 'CON_VISIT_22_Q02', 'CON_VISIT_22_Q03',
       'CON_VISIT_02_Q03', 'CON_VISIT_11_Q02', 'CON_VISIT_20_Q02',
       'CON_VISIT_20_Q03', 'CON_VISIT_25_Q02', 'CON_VISIT_07_Q01',
       'CON_VISIT_28_Q02', 'CON_VISIT_07_Q02', 'CON_VISIT_20_Q01',
       'CON_VISIT_22_Q04', 'CON_VISIT_28_Q03', 'CON_VISIT_14_Q03',
       'CON_VISIT_14_Q02', 'CON_VISIT_16_Q01', 'CON_VISIT_16_Q02',
       'CON_VISIT_16_Q03', 'CON_VISIT_16_Q04', 'CON_VISIT_28_Q01',
       'CON_VISIT_14_Q01', 'CON_VISIT_14_Q04', 'CON_VISIT_12_Q04',
       'CON_VISIT_28_Q04', 'CON_VISIT_12_Q03', 'CON_VISIT_12_Q01',
       'CON_VISIT_13_Q01', 'CON_VISIT_13_Q04', 'CON_VISIT_12_Q02',
       'CON_VISIT_13_Q02', 'CON_VISIT_13_Q03', 'POT_VISIT_11_Q01',
       'POT_VISIT_11_Q02', 'POT_VISIT_11_Q03', 'POT_VISIT_11_Q04',
       'POT_VISIT_81_Q01', 'POT_VISIT_21_Q02', 'POT_VISIT_23_Q02',
       'POT_VISIT_81_Q02', 'POT_VISIT_81_Q04', 'POT_VISIT_22_Q02',
       'POT_VISIT_12_Q02', 'POT_VISIT_19_Q02', 'POT_VISIT_22_Q03',
       'POT_VISIT_22_Q04', 'POT_VISIT_12_Q01', 'POT_VISIT_12_Q03',
       'POT_VISIT_12_Q04', 'POT_VISIT_22_Q01', 'POT_VISIT_23_Q01',
       'POT_VISIT_23_Q03', 'POT_VISIT_81_Q03', 'POT_VISIT_23_Q04',
       'POT_VISIT_19_Q01', 'POT_VISIT_19_Q04', 'POT_VISIT_20_Q01',
       'POT_VISIT_21_Q04', 'POT_VISIT_41_Q04', 'POT_VISIT_19_Q03',
       'POT_VISIT_21_Q01', 'POT_VISIT_31_Q01', 'POT_VISIT_31_Q02',
       'POT_VISIT_31_Q03', 'POT_VISIT_31_Q04', 'POT_VISIT_32_Q01',
       'POT_VISIT_32_Q02', 'POT_VISIT_32_Q03', 'POT_VISIT_32_Q04',
       'POT_VISIT_41_Q01', 'POT_VISIT_51_Q01', 'POT_VISIT_51_Q02',
       'POT_VISIT_24_Q02', 'POT_VISIT_21_Q03', 'POT_VISIT_41_Q02',
       'POT_VISIT_41_Q03', 'POT_VISIT_53_Q01', 'POT_VISIT_53_Q02',
       'POT_VISIT_53_Q03', 'POT_VISIT_53_Q04', 'POT_VISIT_20_Q02',
       'POT_VISIT_20_Q04', 'POT_VISIT_01_Q03', 'POT_VISIT_17_Q04',
       'POT_VISIT_20_Q03', 'POT_VISIT_24_Q03', 'POT_VISIT_24_Q04',
       'POT_VISIT_24_Q01', 'POT_VISIT_50_Q01', 'POT_VISIT_50_Q02',
       'POT_VISIT_50_Q03', 'POT_VISIT_50_Q04', 'POT_VISIT_72_Q01',
       'POT_VISIT_72_Q02', 'POT_VISIT_72_Q03', 'POT_VISIT_17_Q01',
       'POT_VISIT_15_Q04', 'POT_VISIT_99_Q03', 'POT_VISIT_99_Q04',
       'POT_VISIT_34_Q01', 'POT_VISIT_42_Q02', 'POT_VISIT_61_Q03',
       'POT_VISIT_51_Q04', 'POT_VISIT_72_Q04', 'POT_VISIT_49_Q03',
       'POT_VISIT_02_Q04', 'POT_VISIT_99_Q01', 'POT_VISIT_99_Q02',
       'POT_VISIT_49_Q01', 'POT_VISIT_49_Q02', 'POT_VISIT_15_Q01',
       'POT_VISIT_60_Q04', 'POT_VISIT_33_Q03', 'POT_VISIT_33_Q01',
       'POT_VISIT_13_Q01', 'POT_VISIT_13_Q02', 'POT_VISIT_13_Q03',
       'POT_VISIT_13_Q04', 'POT_VISIT_65_Q02', 'POT_VISIT_65_Q03',
       'POT_VISIT_65_Q04', 'POT_VISIT_49_Q04', 'POT_VISIT_65_Q01',
       'POT_VISIT_33_Q04', 'POT_VISIT_15_Q03', 'POT_VISIT_61_Q02',
       'POT_VISIT_17_Q03', 'POT_VISIT_33_Q02', 'POT_VISIT_71_Q03',
       'POT_VISIT_16_Q02', 'POT_VISIT_61_Q04', 'POT_VISIT_15_Q02',
       'POT_VISIT_51_Q03', 'POT_VISIT_14_Q03', 'POT_VISIT_71_Q01',
       'POT_VISIT_60_Q03', 'POT_VISIT_14_Q01', 'POT_VISIT_14_Q02',
       'POT_VISIT_71_Q04', 'POT_VISIT_52_Q01', 'POT_VISIT_17_Q02',
       'POT_VISIT_60_Q01', 'POT_VISIT_16_Q03', 'POT_VISIT_54_Q04',
       'POT_VISIT_01_Q01', 'POT_VISIT_42_Q04', 'POT_VISIT_16_Q04',
       'POT_VISIT_16_Q01', 'POT_VISIT_61_Q01', 'POT_VISIT_52_Q04',
       'POT_VISIT_71_Q02', 'POT_VISIT_34_Q02', 'POT_VISIT_34_Q03',
       'POT_VISIT_34_Q04', 'POT_VISIT_52_Q02', 'POT_VISIT_04_Q02',
       'POT_VISIT_60_Q02', 'POT_VISIT_62_Q02', 'POT_VISIT_62_Q03',
       'POT_VISIT_02_Q01', 'POT_VISIT_62_Q04', 'POT_VISIT_52_Q03',
       'POT_VISIT_42_Q01', 'POT_VISIT_01_Q04', 'POT_VISIT_62_Q01',
       'POT_VISIT_42_Q03', 'POT_VISIT_09_Q01', 'POT_VISIT_04_Q01',
       'POT_VISIT_05_Q01', 'POT_VISIT_01_Q02', 'POT_VISIT_14_Q04',
       'POT_VISIT_07_Q01', 'POT_VISIT_07_Q02', 'POT_VISIT_07_Q03',
       'POT_VISIT_07_Q04', 'POT_VISIT_26_Q04', 'POT_VISIT_55_Q02',
       'POT_VISIT_26_Q02', 'POT_VISIT_03_Q04', 'POT_VISIT_57_Q04',
       'POT_VISIT_56_Q03', 'POT_VISIT_18_Q01', 'POT_VISIT_54_Q01',
       'POT_VISIT_54_Q02', 'POT_VISIT_54_Q03', 'POT_VISIT_03_Q01',
       'POT_VISIT_25_Q01', 'RX_THER_17_YR2016', 'RX_THER_36_YR2016',
       'RX_THER_42_YR2016', 'RX_THER_44_YR2016', 'RX_THER_58_YR2016',
       'RX_THER_65_YR2016', 'RX_THER_90_YR2016', 'RX_THER_01_YR2016',
       'RX_THER_05_YR2016', 'RX_THER_22_YR2016', 'RX_THER_37_YR2016',
       'RX_THER_39_YR2016', 'RX_THER_43_YR2016', 'RX_THER_49_YR2016',
       'RX_THER_57_YR2016', 'RX_THER_60_YR2016', 'RX_THER_28_YR2016',
       'RX_THER_27_YR2016', 'RX_THER_34_YR2016', 'RX_THER_16_YR2016',
       'RX_THER_41_YR2016', 'RX_THER_02_YR2016', 'RX_THER_18_YR2016',
       'RX_THER_50_YR2016', 'RX_THER_97_YR2016', 'RX_THER_61_YR2016',
       'RX_THER_66_YR2016', 'RX_THER_56_YR2016', 'RX_THER_03_YR2016',
       'RX_THER_75_YR2016', 'RX_THER_72_YR2016', 'RX_THER_79_YR2016',
       'RX_THER_12_YR2016', 'RX_THER_86_YR2016', 'RX_THER_32_YR2016',
       'RX_THER_33_YR2016', 'RX_THER_21_YR2016', 'RX_THER_46_YR2016',
       'RX_THER_62_YR2016', 'RX_THER_68_YR2016', 'RX_THER_94_YR2016',
       'RX_THER_04_YR2016', 'RX_THER_59_YR2016', 'RX_THER_69_YR2016',
       'RX_THER_85_YR2016', 'RX_THER_11_YR2016', 'RX_THER_30_YR2016',
       'RX_THER_54_YR2016', 'RX_THER_31_YR2016', 'RX_THER_83_YR2016',
       'RX_THER_13_YR2016', 'RX_THER_35_YR2016', 'RX_THER_89_YR2016',
       'RX_THER_78_YR2016', 'RX_THER_53_YR2016', 'RX_THER_77_YR2016',
       'RX_THER_88_YR2016', 'RX_THER_23_YR2016', 'RX_THER_52_YR2016',
       'RX_THER_73_YR2016', 'RX_THER_99_YR2016', 'RX_THER_82_YR2016',
       'RX_THER_07_YR2016', 'RX_THER_93_YR2016', 'RX_THER_74_YR2016',
       'RX_THER_55_YR2016', 'RX_THER_24_YR2016', 'RX_THER_67_YR2016',
       'RX_THER_64_YR2016', 'RX_THER_87_YR2016', 'RX_THER_47_YR2016',
       'RX_THER_38_YR2016', 'RX_THER_40_YR2016', 'RX_THER_09_YR2016',
       'RX_THER_51_YR2016', 'RX_THER_26_YR2016', 'RX_THER_25_YR2016',
       'RX_THER_81_YR2016', 'RX_THER_48_YR2016', 'RX_THER_95_YR2016',
       'RX_THER_45_YR2016', 'RX_THER_76_YR2016', 'RX_THER_80_YR2016',
       'RX_THER_15_YR2016', 'RX_THER_19_YR2016', 'RX_THER_96_YR2016',
       'RX_THER_92_YR2016', 'RX_THER_98_YR2016', 'RX_THER_84_YR2016',
       'RX_THER_08_YR2016']

# Binary columns, listed in their original order and grouped by name prefix
# purely for readability.
cols_zo = [
    # CV_* flags
    'CV_CAD', 'CV_CHF', 'CV_CIR', 'CV_CER', 'CV_HDZ', 'CV_SNS', 'CV_PVD',
    # RES_* flags
    'RES_ALG', 'RES_AST', 'RES_COPD', 'RES_FAIL', 'RES_INF',
    # remaining condition flags
    'DIABETES', 'Diab_Complications', 'PREDIABETES', 'HYPERTENSION',
    'HYPERLIPID', 'RENAL', 'CKD', 'ESRD', 'MUSCUL_OTH', 'MUSCUL_BN',
    'OSTEO', 'ARTH', 'College',
    # CDC* columns
    'CDC', 'CDC_HBATEST_GAP', 'CDC_HBAPOOR_GAP', 'CDC_NPH_GAP', 'CDC_EYE_GAP',
    # BCS / COL / AMM columns and their *_GAP counterparts
    'BCS', 'BCS_GAP', 'COL', 'COL_GAP', 'AMM', 'AMM_GAP',
    # *_PASS and *_ELIG columns
    'DIAB_PASS', 'ACE_PASS', 'STATIN_PASS', 'ACE_ELIG', 'DIAB_ELIG',
    'STATIN_ELIG',
]

#### Subset numeric, binary and string columns

In [7]:
# Numeric columns are whatever is left once the categorical columns, the
# binary columns, the target ('AMI_FLAG') and the record identifier ('ID')
# are removed. The original column order of `data` is preserved.
_non_numeric = set(cols_string) | set(cols_zo) | {'AMI_FLAG', 'ID'}
cols_numeric = [name for name in data.columns if name not in _non_numeric]

# One sub-frame per column family.
data_num = data[cols_numeric]      # numeric
data_zo = data[cols_zo]            # binary
data_string = data[cols_string]    # categorical

#### Check Length of Columns

In [8]:
### Report how many columns ended up in each of the three families

print("Length of numeric columns: {}".format(len(cols_numeric)))
print("Length of string columns: {}".format(len(cols_string)))
print("Length of binary columns: {}".format(len(cols_zo)))

Length of numeric columns: 18
Length of string columns: 386
Length of binary columns: 42


### Handling Missing Values

In [10]:
# Fraction of missing values per column (count of nulls divided by row count)
data.isna().sum().div(len(data))

ID                               0.00000
AGE                              0.00000
SEX_CD                           0.00030
AMI_FLAG                         0.00000
ESRD_IND                         0.00032
HOSPICE_IND                      0.00032
ORIG_REAS_ENTITLE_CD             0.00030
RECON_MA_RISK_SCORE_NBR          0.00000
RECON_RX_RISK_SCORE_NBR          0.00000
PCP_ASSIGNMENT                   0.00157
DUAL                             0.00030
INSTITUTIONAL                    0.00030
LIS                              0.00030
MCO_HLVL_PLAN_CD                 0.00033
MCO_PROD_TYPE_CD                 0.00033
CON_VISIT_04_Q01                 0.00000
CON_VISIT_04_Q02                 0.00000
CON_VISIT_04_Q03                 0.00000
CON_VISIT_04_Q04                 0.00000
CON_VISIT_21_Q01                 0.00000
CON_VISIT_21_Q02                 0.00000
CON_VISIT_21_Q03                 0.00000
CON_VISIT_03_Q02                 0.00000
CON_VISIT_03_Q04                 0.00000
CON_VISIT_05_Q02

#### Numeric Data Imputation

In [13]:
#### Impute missing numerical data with mean
# Each column's missing entries are replaced by that column's mean, computed
# over all rows of `data_num`.
# NOTE(review): the means are computed before the train/test split performed
# further down, so rows that later land in the test set contribute to the
# imputed values — confirm this is acceptable for the evaluation.
data_num = data_num.fillna(data_num.mean())

#### Binary Data Imputation

In [14]:
#### Impute missing records in binary columns with 0
# A missing binary value is recorded as 0.
data_zo = data_zo.fillna(0)

#### Categorical Data Imputation

In [15]:
#### Impute categorical missing records with 'NA'
# Missing entries become the literal string 'NA'; this is applied to every
# column of `data_string`, i.e. to all columns listed in `cols_string`.
data_string = data_string.fillna('NA')

### Scaling numeric columns and generating dummies for categorical columns

In [16]:
# Import libraries
from sklearn.preprocessing import StandardScaler


# Define which columns should be encoded vs scaled
columns_to_encode = cols_string
columns_to_scale = cols_numeric


# Instantiate the scaler and standardise the numeric columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — confirm this is acceptable.
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(data_num[columns_to_scale])

# fit_transform returns a bare array, so the row labels have to be restored
# explicitly. Without `index=` the new frame would get a fresh 0..n-1 index and
# the column-wise concat below (which aligns on index) would only line up with
# the other two frames as long as `data` itself has a default index.
dat_num = pd.DataFrame(scaled_columns, columns=columns_to_scale, index=data_num.index)

# One-hot encode the categorical frame.
# NOTE(review): without `columns=`, get_dummies only converts columns of
# object/string/category dtype. Integer-coded columns in `data_string` are
# passed through unchanged (neither encoded nor scaled) — confirm that this is
# the intended treatment for them.
encoded_columns = pd.get_dummies(data_string)
dat_str = encoded_columns

# Assemble the model matrix: scaled numeric | encoded categorical | binary.
frames = (dat_num, dat_str, data_zo)
processed_data = pd.concat(frames, axis=1)

# Misaligned indices would silently add NaN-filled rows; fail loudly instead.
assert len(processed_data) == len(data), "row count changed while assembling processed_data"

In [17]:
# Preview the assembled feature frame (head() shows the first five rows by default)
processed_data.head()

Unnamed: 0,AGE,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,Length_residence,Est_BMI_decile,Num_person_household,Online_purchaser,Online_User,Pct_above_poverty_line,Pct_below_poverty_line,...,COL,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG
0,0.435343,-0.755072,-0.826286,-0.2057854,1.111078,0.356479,-0.5376197,0.803711,0.6639264,-0.6750171,...,0,0,0,0,0,0,0,0,0,0
1,-2.44666,1.837668,0.172066,-0.7593485,-1.75292,0.356479,2.118002,0.803711,-0.1947291,0.2113169,...,0,0,0,0,0,0,1,0,0,1
2,0.229486,-0.529066,-0.60736,-0.482567,1.929363,-0.882076,-0.5376197,0.803711,0.7712583,-0.7858089,...,1,0,0,0,0,1,1,1,0,1
3,-0.491015,-0.586095,-0.187973,-0.8700611,-0.116349,-0.882076,-0.5376197,0.803711,0.2345986,-0.2318501,...,1,1,0,0,1,1,1,1,1,1
4,0.847058,0.108822,0.241964,1.966651e-16,0.0,0.0,-7.370832e-17,0.0,-1.525278e-15,-3.936114e-16,...,0,0,0,0,0,0,0,0,0,0


In [20]:
### Pair the features with the dependent variable 'AMI_FLAG' for training.
### Create train and test split of 80-20

from sklearn.model_selection import train_test_split

# Read the target straight from the raw frame. Attaching it to
# `processed_data` and dropping it again would copy the whole feature frame
# for no benefit and would mutate a frame that was already displayed above.
labels = np.array(data['AMI_FLAG'])
assert len(labels) == len(processed_data), "features and labels must have the same number of rows"

# Number of positive records and total number of records
print((labels != 0).sum(), len(labels))

feature_list = list(processed_data.columns)

# Request float64 explicitly: the frame mixes float, integer and dummy columns,
# and when the dummy columns are boolean a plain np.array(...) conversion falls
# back to an `object` array that downstream estimators cannot use.
features = np.array(processed_data, dtype=np.float64)

# NOTE(review): the split is not stratified although the positive class is
# rare; consider `stratify=labels` (this would change the split and every
# number reported below).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.20, random_state=300
)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

2726 100000
Training Features Shape: (80000, 491)
Training Labels Shape: (80000,)
Testing Features Shape: (20000, 491)
Testing Labels Shape: (20000,)


### Checking for Data Imbalance

In [21]:
# Count the records in each class of the target column
print("Number of negative labels", data['AMI_FLAG'].eq(0).sum())
print("Number of positive labels", data['AMI_FLAG'].eq(1).sum())

Number of negative labels 97274
Number of positive labels 2726


#### The classes are heavily imbalanced (2,726 positive vs. 97,274 negative labels). To compensate, we resample the training set in two ways: random undersampling of the majority class and random oversampling of the minority class.

#### Random Undersampling

In [23]:
from imblearn.over_sampling import SMOTE  # imported here but not used in this cell
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows of the TRAINING split only; the test
# split is left untouched.
rus = RandomUnderSampler(random_state=0)

# `fit_resample` fits the sampler and resamples in one call, so a separate
# `fit` beforehand is redundant. It replaces the deprecated `fit_sample`
# alias, which newer imbalanced-learn releases no longer provide.
X_us, y_us = rus.fit_resample(train_features, train_labels)



RandomUnderSampler(random_state=0, ratio=None, replacement=False,
                   return_indices=False, sampling_strategy='auto')

In [24]:
# (rows, features) of the undersampled training matrix
X_us.shape

(4362, 491)

In [25]:
# Rebuild a labelled frame from the undersampled arrays: feature columns keep
# the names of `processed_data`, and the target is appended as the last column.
train_us = pd.DataFrame(X_us, columns=processed_data.columns).assign(AMI_FLAG=y_us)
train_us.head()

Unnamed: 0,AGE,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,Length_residence,Est_BMI_decile,Num_person_household,Online_purchaser,Online_User,Pct_above_poverty_line,Pct_below_poverty_line,...,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG,AMI_FLAG
0,0.126557,-0.579759,-0.321175,1.966651e-16,0.0,0.0,-7.370832e-17,0.0,-1.525278e-15,-3.936114e-16,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0
1,-1.314445,-0.229132,0.869726,-0.5932796,0.292793,4.691423,-0.5376197,0.803711,-0.7313887,0.7652757,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0
2,-1.623231,-0.342135,-0.406899,1.966651e-16,0.0,0.0,-7.370832e-17,0.0,-1.525278e-15,-3.936114e-16,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,-0.285158,2.467107,-0.004657,1.966651e-16,0.0,0.0,-7.370832e-17,0.0,-1.525278e-15,-3.936114e-16,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,-1.108587,-0.312564,-0.249958,-0.5379233,-0.525492,0.975757,-0.5376197,-1.416778,0.3419306,-0.3426419,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [26]:
# Persist the undersampled training frame as CSV.
# NOTE(review): the target file name has no extension and the path is an
# absolute, machine-specific location.
train_us.to_csv("C:/Users/nisha/Downloads/Heart_Disease_Prediction_Data/Undersampled_data")

#### Random Oversampling

In [28]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the TRAINING split only; the test
# split is left untouched.
ros = RandomOverSampler(random_state=0)

# `fit_resample` fits the sampler and resamples in one call, so a separate
# `fit` beforehand is redundant. It replaces the deprecated `fit_sample`
# alias, which newer imbalanced-learn releases no longer provide.
X_os, y_os = ros.fit_resample(train_features, train_labels)

RandomOverSampler(random_state=0, ratio=None, return_indices=False,
                  sampling_strategy='auto')

In [29]:
# (rows, features) of the oversampled training matrix
X_os.shape

(155638, 491)

In [30]:
# Rebuild a labelled frame from the oversampled arrays: feature columns keep
# the names of `processed_data`, and the target is appended as the last column.
train_os = pd.DataFrame(X_os, columns=processed_data.columns).assign(AMI_FLAG=y_os)
train_os.head()

Unnamed: 0,AGE,RECON_MA_RISK_SCORE_NBR,RECON_RX_RISK_SCORE_NBR,Length_residence,Est_BMI_decile,Num_person_household,Online_purchaser,Online_User,Pct_above_poverty_line,Pct_below_poverty_line,...,COL_GAP,AMM,AMM_GAP,DIAB_PASS,ACE_PASS,STATIN_PASS,ACE_ELIG,DIAB_ELIG,STATIN_ELIG,AMI_FLAG
0,0.435343,-0.288273,-0.265784,0.126352,-1.343777,-0.262799,-0.53762,-1.416778,-0.946053,0.986859,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,-1.417373,-0.334742,0.810378,-0.870061,-1.75292,1.595035,-0.53762,0.803711,0.556594,-0.564225,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,-0.593944,-0.799429,-0.19193,-0.316498,0.292793,-0.882076,-0.53762,-1.416778,1.093254,-1.118184,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,0.023628,-0.585039,-0.140496,0.01564,-0.934635,3.452868,2.118002,0.803711,-2.4487,2.537944,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1
4,0.023628,-0.429792,0.218225,4.554857,1.929363,-0.882076,-0.53762,0.803711,1.200586,-1.228976,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0


In [31]:
# Persist the oversampled training frame as CSV.
# NOTE(review): the target file name has no extension and the path is an
# absolute, machine-specific location.
train_os.to_csv("C:/Users/nisha/Downloads/Heart_Disease_Prediction_Data/Oversampled_data")