# Split data for train, test

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import warnings
warnings.filterwarnings('ignore')

## 1. Import Data

In [2]:
df = pd.read_csv("../data/combined_X_y.csv")

# 2. Drop Duplicates

In [3]:
duplicates = df[df.duplicated()]
print("Duplicate Rows : ",duplicates.shape[0], duplicates.shape[0]/df.shape[0]*100)
# Dropping duplicates
print("Shape before dropping duplicates : ", df.shape[0])
df.drop_duplicates(inplace = True)
print("Shape after dropping duplicates : ", df.shape[0])

Duplicate Rows :  24206 9.541942604856512
Shape before dropping duplicates :  253680
Shape after dropping duplicates :  229474


In [4]:
y = df["Diabetes_binary"]
y = y.rename("Diabetes")
X = df.drop("Diabetes_binary", axis = 1)

In [56]:
# Create a list of numerical and categorical columns:

numerical_cols = ["BMI"]
cat_cols = list(set(X.columns) - set(numerical_cols))
print(cat_cols)

['AnyHealthcare', 'HvyAlcoholConsump', 'Fruits', 'GenHlth', 'DiffWalk', 'HeartDiseaseorAttack', 'Education', 'Sex', 'Stroke', 'Veggies', 'CholCheck', 'Age', 'PhysActivity', 'NoDocbcCost', 'HighChol', 'HighBP', 'MentHlth', 'Income', 'PhysHlth', 'Smoker']


In [26]:
X.dtypes

HighBP                  int64
HighChol                int64
CholCheck               int64
BMI                     int64
Smoker                  int64
Stroke                  int64
HeartDiseaseorAttack    int64
PhysActivity            int64
Fruits                  int64
Veggies                 int64
HvyAlcoholConsump       int64
AnyHealthcare           int64
NoDocbcCost             int64
GenHlth                 int64
MentHlth                int64
PhysHlth                int64
DiffWalk                int64
Sex                     int64
Age                     int64
Education               int64
Income                  int64
dtype: object

# 3. Categorical features

## 3.1 Preprocess Categorical features

In [17]:
X["PhysHlth"].value_counts(), X["MentHlth"].value_counts()

(PhysHlth
 0     136578
 30     19385
 2      14491
 1      11073
 3       8435
 5       7595
 10      5588
 15      4914
 7       4531
 4       4521
 20      3273
 14      2584
 25      1336
 6       1328
 8        809
 21       663
 12       578
 28       522
 29       215
 9        179
 18       152
 16       112
 27        99
 17        96
 24        72
 22        70
 26        69
 13        68
 11        60
 23        56
 19        22
 Name: count, dtype: int64,
 MentHlth
 0     152325
 2      12692
 30     12079
 5       8913
 1       8307
 3       7301
 10      6352
 15      5501
 4       3774
 20      3362
 7       3090
 25      1188
 14      1167
 6        988
 8        639
 12       398
 28       327
 21       227
 29       158
 18        97
 9         91
 16        88
 27        79
 22        63
 17        54
 26        45
 11        41
 13        41
 23        38
 24        33
 19        16
 Name: count, dtype: int64)

In [18]:
X["MentHlth"] = X["MentHlth"].apply(lambda x: x if x == 0 else 1)
X["PhysHlth"] = X["PhysHlth"].apply(lambda x: x if x == 0 else 1)

In [20]:
X["PhysHlth"].value_counts(), X["MentHlth"].value_counts()

(PhysHlth
 0    136578
 1     92896
 Name: count, dtype: int64,
 MentHlth
 0    152325
 1     77149
 Name: count, dtype: int64)

In [49]:
X[cat_cols] = X[cat_cols].astype('category')
X_cat_ohe = pd.get_dummies(X[cat_cols], drop_first=True)

## 3.2 Process numerical feature

In [46]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X[numerical_cols] = ss.fit_transform(X[numerical_cols])

In [47]:
X[numerical_cols]

Unnamed: 0,BMI
0,1.666251
1,-0.543144
2,-0.101265
3,-0.248558
4,-0.690437
...,...
253675,2.402716
253676,-1.574195
253677,-0.101265
253678,-0.837730


In [50]:
X_cat_ohe

Unnamed: 0,AnyHealthcare_1,HvyAlcoholConsump_1,Fruits_1,GenHlth_2,GenHlth_3,GenHlth_4,GenHlth_5,DiffWalk_1,HeartDiseaseorAttack_1,Education_2,...,MentHlth_1,Income_2,Income_3,Income_4,Income_5,Income_6,Income_7,Income_8,PhysHlth_1,Smoker_1
0,True,False,False,False,False,False,True,True,False,False,...,True,False,True,False,False,False,False,False,True,True
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,True,False,True,False,False,False,True,True,False,False,...,True,False,False,False,False,False,False,True,True,False
3,True,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,True,False,True,True,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,True,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
253676,True,False,False,False,False,True,False,True,False,True,...,False,False,False,True,False,False,False,False,False,False
253677,True,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
253678,True,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 3.3 Concat both

In [59]:
final_X = pd.concat([X_cat_ohe, X[numerical_cols]], axis = 1)

In [60]:
final_X

Unnamed: 0,AnyHealthcare_1,HvyAlcoholConsump_1,Fruits_1,GenHlth_2,GenHlth_3,GenHlth_4,GenHlth_5,DiffWalk_1,HeartDiseaseorAttack_1,Education_2,...,Income_2,Income_3,Income_4,Income_5,Income_6,Income_7,Income_8,PhysHlth_1,Smoker_1,BMI
0,True,False,False,False,False,False,True,True,False,False,...,False,True,False,False,False,False,False,True,True,1.666251
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,-0.543144
2,True,False,True,False,False,False,True,True,False,False,...,False,False,False,False,False,False,True,True,False,-0.101265
3,True,False,True,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,-0.248558
4,True,False,True,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,-0.690437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,True,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,2.402716
253676,True,False,False,False,False,True,False,True,False,True,...,False,False,True,False,False,False,False,False,False,-1.574195
253677,True,False,True,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,-0.101265
253678,True,False,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,-0.837730


# 4. Split train and test

In [62]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(final_X, y, stratify=y, test_size=0.15, random_state = 42)

In [64]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(195052, 45) (195052,)
(34422, 45) (34422,)


In [67]:
X_train.to_csv("../data/train/X_train.csv", index=False)
y_train.to_csv("../data/train/y_train.csv", index=False)
X_test.to_csv("../data/test/X_train.csv", index=False)
y_train.to_csv("../data/test/y_train.csv", index=False)