# First Exploratory Notebook

Steps:

1. Data Exploration
2. Duplicates
3. Missing Data
4. Outliers
5. Scaling
6. Balancing
7. Feature Engineering (Encoding, Discretizing, Create New Features)
8. Feature correlation and selection
9. Modelling
10. Further Feature Selection
11. Remodelling

## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

##Script specific imports for feature encoding

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

from sklearn import set_config; set_config(display='diagram')
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
# Absolute paths to the two input CSVs. Judging by the file names, `file` is the
# variable cross-check sheet and `data` is the dataset export.
# NOTE(review): both paths point into one user's home directory.
file = r'/home/mnm7/code/sandbox/WSC - variable cross-check_v1.csv'
data = r'/home/mnm7/code/sandbox/wsc-dataset-0.2.0.csv'

In [None]:
# Load both CSVs: `df` is read from the `file` path, `data_df` from the `data` path.
df = pd.read_csv(file)
data_df = pd.read_csv(data)

In [None]:
# Treat a missing `nasal_cong_none` value as 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# `data_df[...]` selection: the in-place form acts on an intermediate Series and
# is not guaranteed to write through to the frame (it does nothing once pandas
# Copy-on-Write is active).
data_df['nasal_cong_none'] = data_df['nasal_cong_none'].fillna(0)

In [None]:
# Rows of the cross-check sheet whose "Proposed Removal" cell is 'R'; the first
# column of those rows is later used as the list of dataset columns to drop.
removal_mask = df['Proposed Removal'].eq('R')
deleted = df.loc[removal_mask]
deleted_cols = deleted.iloc[:, 0]

In [None]:
# Discard every dataset column named in the removal list.
data_df = data_df.drop(columns=deleted_cols.to_list())

In [None]:
# Share of missing values per column, largest first.
missing_counts = data_df.isnull().sum().sort_values(ascending=False)
missing_counts / len(data_df)

In [None]:
# Collect columns dominated by a single value: a column counts as imbalanced
# when its most frequent value makes up more than `balance_cutoff` of its
# non-null entries (value_counts ignores NaN by default).
balance_cutoff = 0.9
imbalanced_classes = []
for col in data_df.columns:
    # value_counts is sorted by frequency, so the first entry is the dominant
    # value's share. It is computed once per column and reused below.
    top_share = data_df[col].value_counts(normalize=True).head(1).values.astype(float)
    # An all-NaN column yields an empty array; skip it explicitly instead of
    # relying on the truth value of an empty comparison result.
    if top_share.size and top_share[0] > balance_cutoff:
        # Each entry keeps the (column name, one-element array) layout.
        imbalanced_classes.append((col, top_share))

### Imbalanced Classes

In [None]:
imbalanced_classes

In [None]:
len(imbalanced_classes)

In [None]:
# Keep only the column names from the (name, share) pairs.
imbalanced_list = [entry[0] for entry in imbalanced_classes]

In [None]:
# Remove the imbalanced columns, keep the first row for each wsc_id, then use
# wsc_id as the row index.
data_df = (
    data_df
    .drop(columns=imbalanced_list)
    .drop_duplicates('wsc_id')
    .set_index('wsc_id')
)

### OHE

In [None]:
# Names of all columns whose dtype is 'object'.
object_mask = data_df.dtypes == 'object'
objlist = data_df.columns[object_mask].tolist()

In [None]:
## Binariser - encodes two-valued object columns as 0/1.
#
# The result of .replace() is assigned back to the column. Calling
# .replace(..., inplace=True) on the `data_df[col]` selection acts on an
# intermediate Series and is not guaranteed to update `data_df` (it does
# nothing once pandas Copy-on-Write is active).

for col in objlist:
    levels = data_df[col].unique()

    if len(levels) == 2:
        ## columns with 2 values eg. [N,Y] or [M,F]: first value seen -> 0, second -> 1
        ## NOTE(review): unique() counts NaN as a value, so a [Y, nan] column
        ## also lands here and its NaN is encoded as 0 or 1 - confirm this is intended.
        data_df[col] = data_df[col].replace({levels[0]: 0, levels[1]: 1})

    elif len(levels) == 3:
        #### columns with 3 values - assumed to look like [N,Y,nan]; only N and Y are mapped
        data_df[col] = data_df[col].replace({'N': 0, 'Y': 1})

In [None]:
#### ONE HOT ENCODER SCRIPT

ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)


## Only variables which need OHE
X1 = data_df[['thyroid_problem']]
X2 = data_df[['hormone_therapy']]

## For each column: fit_transform, take the encoder's column names, build a
## frame on data_df's index, then drop the indicator column created for NaN.
## errors='ignore' keeps the cell working when a column has no missing values
## (no 'x0_nan' column exists then, and a plain drop would raise KeyError).

X1t = ohe.fit_transform(X1)
colnames = list(ohe.get_feature_names())
X1df = pd.DataFrame(X1t, columns=colnames, index=data_df.index)
X1df = X1df.drop(columns='x0_nan', errors='ignore')

X2t = ohe.fit_transform(X2)
colnames = list(ohe.get_feature_names())
X2df = pd.DataFrame(X2t, columns=colnames, index=data_df.index)
X2df = X2df.drop(columns='x0_nan', errors='ignore')

## Append the indicator columns; verify_integrity=True makes concat raise if
## the combined column labels contain duplicates.
frames = [data_df, X1df, X2df]
data_df1 = pd.concat(frames, axis=1, verify_integrity=True)

## Drop the original columns now that their encoded versions are present.
data_df1 = data_df1.drop(columns=['thyroid_problem', 'hormone_therapy'])

In [None]:
data_df1

In [None]:
#### Other confounding targets to remove
# RETAIN TST
# These columns are dropped from data_df1 in a later cell; `tst` is not listed
# because it is split off as the target `y` further down.
targs = ['tst_rem', 'tst_nrem', 'tso', 'totsleep', 'ess','p_eval_sleep', 'a_eval_slept', 'a_eval_hour',
       'a_eval_sleep', 'ps_eds', 'waso', 'se', 'sleepiness']


In [None]:
data_df1

In [None]:
# Remove the columns listed in `targs`, then display the frame.
data_df1 = data_df1.drop(columns=targs)
data_df1

In [None]:
# # FOR CORRECT PREPROCESSING
# y = data_df1.tst
# X = data_df1.drop('tst', axis=1).fillna(0, axis=1)

In [None]:
# Target is the `tst` column; every other remaining column is a feature.
y = data_df1['tst']
# Missing feature values are replaced by 0. `axis` is omitted: it has no meaning
# for a scalar fill value, and leaving it out keeps fillna on its plain
# column-wise path.
X = data_df1.drop(columns='tst').fillna(0)

In [None]:
X

### Missing Data

In [None]:
X.shape

In [None]:
X.num_pregnancies.unique()

In [None]:
X.num_pregnancies

In [None]:
data_df1.num_pregnancies.unique()

In [None]:
# The twenty columns of data_df1 with the highest share of missing values.
missing_share = data_df1.isnull().sum() / data_df1.shape[0]
pd.DataFrame(missing_share).sort_values(by=0, ascending=False).head(20)

### Feature Selection (VIF, Pearson Correlation, Feature Permutation)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Variance inflation factor of every column of X, one row per feature.
# NOTE: this rebinds `df`, which earlier held the cross-check sheet.
feature_matrix = X.values
df = pd.DataFrame({
    "vif_index": [vif(feature_matrix, position) for position in range(X.shape[1])],
    "features": X.columns,
})
df

In [None]:
df[df['vif_index'] > 10]

In [None]:
data_df1

In [None]:
X

In [None]:
# Calculate Pearson's correlation between Features and Target
# pearsonr is imported here because no visible cell brings it into scope, so
# this cell raised NameError on a fresh kernel.
from scipy.stats import pearsonr

target_corr_dict = {
    'feature': list(X.columns),
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    'correlation_with_target': [pearsonr(X[column], y)[0] for column in X.columns],
}
target_corr_df = pd.DataFrame(target_corr_dict)
target_corr_df

In [None]:
target_corr_df.sort_values(by='correlation_with_target', ascending=False)

In [None]:
target_corr_df.sort_values(by='correlation_with_target', ascending=False)

In [None]:
# Save the feature/target correlations without the row index.
target_corr_df.to_csv('/home/mnm7/code/pandit-a/dreamteam/notebooks/feature_corr_target.csv', index=False)

In [None]:
import seaborn as sns
# Heatmap of the pairwise column correlations of data_df1
corr = data_df1.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='YlGnBu')

# Long-format view of the same matrix: one row per ordered feature pair,
# sorted from the highest correlation downwards, with self-pairs removed.
corr_df = corr.unstack().reset_index()
corr_df.columns = ['feature_1', 'feature_2', 'correlation']
corr_df = corr_df.sort_values(by='correlation', ascending=False)
is_self_pair = corr_df['feature_1'] == corr_df['feature_2']
corr_df = corr_df[~is_self_pair]
corr_df.head()

In [None]:
# Save the feature-pair correlation table without the row index.
corr_df.to_csv('/home/mnm7/code/pandit-a/dreamteam/notebooks/feature_correlation.csv', index=False)

In [None]:
# Feature pairs whose correlation exceeds 0.5 (first 120 rows).
strong_pairs = corr_df['correlation'] > 0.5
corr_df.loc[strong_pairs].head(120)

In [None]:
pwd

In [None]:
# Save the feature matrix, row index included.
X.to_csv('/home/mnm7/code/pandit-a/dreamteam/notebooks/X_csv.csv')

### Train Test Splits

In [None]:
# Hold out 10% of the rows as a validation set; random_state fixes the shuffle.
X_split, X_val, y_split, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Split the remaining rows 70/30 into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X_split, y_split, test_size=0.3, random_state=42)

### Scaling

In [None]:
# Columns handled as numeric: the scaling cells and the 'num' branch of the
# preprocessing pipelines select these columns by name.
numeric_features = ['creatinine', 'glucose', 'hdl', 'ldl','total_cholesterol', 'triglycerides', 'uric_acid', 'weightkg', 'bmi',
       'headcm', 'waist_girth1', 'waist_girth2', 'hip_girth1', 'hip_girth2',
       'neck_girth1', 'neck_girth2', 'sit_sys1', 'sit_dia1', 'sit_sys2',
       'sit_dia2', 'hipgirthm', 'neckgirthm', 'waistgirthm', 'waisthip',
       'sitsysm', 'sitdiam', 'zung_score', 'zung_index', 'state',
       'trait', 'beer_week', 'wine_week', 'hard_week', 'bowls_day', 'packs_week', 'cigars_day',
       'smoke_years', 'pack_years','workday', 'weekend',
       'naps', 'snore_freq',
       'num_pregnancies', 'ahi',
       'minsao2tst', 'ptstl90', 'age', 'heightcm', 'cans_cola', 'cups_coffee',
       'caffeine', 'alcohol_wk', 'smoke', 'eval_general', 'eval_life','eval_health', 
       'snore_vol', 'choke_freq',
       'apnea_freq', 'awake_freq', 'ho_score']


In [None]:
X

In [None]:
X_train

In [None]:
# Min-max scale the numeric columns on a copy of X; all other columns are left as they are.
scaler = MinMaxScaler()
scaled_block = scaler.fit_transform(X[numeric_features])
X_scaled = X.copy()
X_scaled[numeric_features] = scaled_block
X_scaled

## Modelling

In [None]:
# Plain linear regression fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# 5-fold cross-validation of a fresh LinearRegression on the training split,
# scored by negative mean absolute error (values closer to 0 are better).
cv_results = cross_validate(LinearRegression(), X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

cv_results['test_score'].mean()

In [None]:
cv_results['test_score']

## Pipelines

In [None]:
# Every column of X that is not listed in numeric_features is handled as categorical.
categoric = X.drop(columns=numeric_features)
categorical_features = categoric.columns
categoric

In [None]:
numeric_features

In [None]:
categorical_features

In [None]:
# Per-branch preprocessing: the categorical branch only imputes (median);
# the numeric branch imputes with the mean and then applies RobustScaler.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

In [None]:
# Route each column group to its transformer; columns named in neither list
# pass through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='passthrough')
preprocessor

In [None]:
# Full model: the preprocessing step followed by a Ridge regressor with default settings.
final_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge())
])
final_pipe

In [None]:
# Fit the pipeline on the training split and predict the test split.
# Pipeline.fit returns the pipeline itself, so ridge_pipe_model is final_pipe.
ridge_pipe_model = final_pipe.fit(X_train, y_train)
y_pred = ridge_pipe_model.predict(X_test)

In [None]:
# Test-split error of the Ridge pipeline predictions in y_pred.
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
print(f'Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test, y_pred)}')

In [None]:
final_pipe.get_params().keys()

In [None]:
# Candidate values for the Ridge penalty, spanning seven orders of magnitude.
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Select alpha by the metric the notebook reports (mean absolute error), using
# the same 5-fold setup as the earlier cross_validate call. Without `scoring`,
# GridSearchCV ranks candidates by the estimator's default score (R^2 for a
# regressor), which need not pick the alpha with the lowest MAE.
grid = GridSearchCV(final_pipe, param_grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [None]:
grid.best_estimator_

In [None]:
# Predict the test split with the refitted best pipeline found by the search.
y_pred_grid = grid.predict(X_test)

In [None]:
# Baseline: predict the training-set mean of y for every test row.
train_mean = np.mean(y_train)
y_baseline = pd.Series([train_mean for _ in range(len(y_test))])
print(f'Mean Absolute Error (Baseline Prediction):{mean_absolute_error(y_test, y_baseline)}')

In [None]:
# Test-split error of the grid-searched pipeline predictions in y_pred_grid.
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred_grid)}')
print(f'Mean Absolute Percentage Error: {mean_absolute_percentage_error(y_test, y_pred_grid)}')

In [None]:
X_scaled

In [None]:
X_train

In [None]:
# Same two transformer definitions as in the earlier Pipelines cell, rebuilt
# here as fresh, unfitted objects under the same names.
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

In [None]:
########## CUSTOM OHE-2 ##########
# Need to use sparse=False, handle_unknown='ignore' when instantiating
from sklearn.preprocessing import OneHotEncoder
class CustomOHE1(OneHotEncoder):
    """OneHotEncoder whose ``transform`` returns a DataFrame with named columns.

    Instantiate with dense output (``sparse=False``); the encoded matrix is
    passed straight to ``pd.DataFrame``.
    """

    def _column_labels(self):
        """Return the encoded column names.

        Prefers ``get_feature_names`` so existing column names stay the same,
        and falls back to ``get_feature_names_out`` on scikit-learn releases
        where the older accessor no longer exists.
        """
        legacy = getattr(self, 'get_feature_names', None)
        return legacy() if legacy is not None else self.get_feature_names_out()

    def transform(self, *args, **kwargs):
        """Encode the input and wrap the result in a DataFrame labelled by feature name."""
        encoded = super().transform(*args, **kwargs)
        return pd.DataFrame(encoded, columns=self._column_labels())

In [None]:
class CustomColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose outputs are DataFrames labelled with the feature names."""

    def _column_labels(self):
        """Return the output column names.

        Prefers ``get_feature_names`` so existing column names stay the same,
        and falls back to ``get_feature_names_out`` on scikit-learn releases
        where the older accessor no longer exists.
        """
        legacy = getattr(self, 'get_feature_names', None)
        return legacy() if legacy is not None else self.get_feature_names_out()

    def transform(self, *args, **kwargs):
        """Transform the input and return the result as a labelled DataFrame."""
        transformed = super().transform(*args, **kwargs)
        return pd.DataFrame(transformed, columns=self._column_labels())

    def fit_transform(self, *args, **kwargs):
        """Fit, transform, and return the result as a labelled DataFrame."""
        transformed = super().fit_transform(*args, **kwargs)
        return pd.DataFrame(transformed, columns=self._column_labels())

In [None]:

# Try out the custom transformer: one-hot encode the categorical columns of
# the training split and display the labelled result.
# NOTE: this rebinds the name `preprocessor` used by the earlier Pipelines cells.
preprocessor = CustomColumnTransformer([
    #('num_transformer', numerical_transformer, numeric_features),
    ('custom_ohe', CustomOHE1(sparse=False, handle_unknown='ignore'), categorical_features)
])
# One fit_transform replaces the former fit / transform / fit_transform
# sequence, which fitted the same data twice and discarded the first two results.
variable = preprocessor.fit_transform(X_train)
variable

In [None]:
# Encoder used as the categorical branch of preprocessor2.
# The ColumnTransformer class itself used to be passed as the `transformers`
# argument of CustomColumnTransformer; that is not a list of
# (name, transformer, columns) tuples, so the object could not be fitted.
# NOTE(review): a labelled one-hot encoder is assumed to be the intent - confirm.
colname_ohe = CustomOHE1(sparse=False, handle_unknown='ignore')
type(colname_ohe)

In [None]:
# Same column routing as `preprocessor` in the Pipelines section, but the
# categorical branch uses colname_ohe instead of the imputer-only pipeline.
preprocessor2 = ColumnTransformer([
    ('num', numerical_transformer, numeric_features),
    ('cat', colname_ohe, categorical_features)
], remainder='passthrough')
preprocessor2

In [None]:
# Stand-alone scaler and encoder objects for the manual (non-pipeline)
# preprocessing cells; the last line displays the categorical column names.
r_scaler = RobustScaler()
ohe = OneHotEncoder()
categorical_features

In [None]:
X

In [None]:
X[numeric_features]

In [None]:
#NUMERIC FEATURES
# Standardise the numeric columns of X; the result is a new frame with a
# default integer index.
r_scaler = StandardScaler()
numeric_block = X[numeric_features]
X_scaled_num = pd.DataFrame(r_scaler.fit_transform(numeric_block), columns=numeric_features)
X_scaled_num

In [None]:
#CATEGORICAL FEATURES
X_scaled_cat = X.copy()
XS_temp = ohe.fit_transform(X_scaled_cat[categorical_features])
# The encoder built with OneHotEncoder() returns a SciPy sparse matrix, which
# pd.DataFrame does not unpack into labelled columns; densify it first.
if hasattr(XS_temp, 'toarray'):
    XS_temp = XS_temp.toarray()
cat_col_names = ohe.get_feature_names(X_scaled_cat[categorical_features].columns)
X_cat_merge = pd.DataFrame(XS_temp, columns = cat_col_names)
X_cat_merge

In [None]:
#CONCATENATE DFS
# Place the scaled numeric columns and the encoded categorical columns side by side.
X_merged = pd.concat([X_scaled_num, X_cat_merge], axis=1)
X_merged

In [None]:
from sklearn.inspection import permutation_importance
ridge_reg = Ridge().fit(X_merged, y) # Fit model
# random_state makes the shuffles, and therefore the scores, repeatable across runs.
permutation_score = permutation_importance(ridge_reg, X_merged, y, n_repeats=10, random_state=42) # Perform Permutation
# Build the table column-wise so score_decrease keeps a float dtype; stacking
# the names and the scores into one array coerced both columns to object.
importance_df = pd.DataFrame({
    'feature': X_merged.columns,
    'score_decrease': permutation_score.importances_mean,
})
importance_df.sort_values(by='score_decrease', ascending = False) # Order by importance

In [None]:
# Top thirty features among those with a non-negative score decrease.
non_negative = importance_df['score_decrease'] >= 0
importance_df.loc[non_negative].sort_values(by='score_decrease', ascending=False).head(30)

In [None]:
X_merged

In [None]:
X2

In [None]:
# Numeric feature names left after removing the ones listed below.
# Series.drop removes entries by index label and a Series has no axis=1, so the
# former drop call could not match the feature names; filter on the values instead.
dropped_numeric = ['glucose', 'ldl', 'bmi', 'waist_girth2', 'hip_girth2', 'neck_girth2', 'sit_sys2', 'sit_dia2', 'hipgirthm', 'neckgirthm', 'waistgirthm', 'sitsysm', 'sitdiam', 'zung_score', 'state', 'trait', 'beer_week', 'wine_week', 'hard_week', 'bowls_day', 'cigars_day', 'pack_years', 'caffeine']
numeric_features_2 = pd.Series([name for name in numeric_features if name not in dropped_numeric])
numeric_features_2

In [None]:
#NUMERIC FEATURES
r_scaler = RobustScaler()
# Scale only the numeric features X2 actually contains: the reduced X2 column
# list omits several names in numeric_features (for example 'glucose'), so
# indexing with the full list raises KeyError.
numeric_in_X2 = [name for name in numeric_features if name in X2.columns]
X2_scaled_num = pd.DataFrame(r_scaler.fit_transform(X2[numeric_in_X2]), columns=numeric_in_X2)
X2_scaled_num

In [None]:
#CATEGORICAL FEATURES
X2_scaled_cat = X2.copy()
# Encode only the categorical columns present in X2; indexing with the full
# categorical_features list raises KeyError for columns X2 does not have.
categorical_in_X2 = [name for name in categorical_features if name in X2_scaled_cat.columns]
XS2_temp = ohe.fit_transform(X2_scaled_cat[categorical_in_X2])
# An encoder built with OneHotEncoder() returns a SciPy sparse matrix, which
# pd.DataFrame does not unpack into labelled columns; densify it first.
if hasattr(XS2_temp, 'toarray'):
    XS2_temp = XS2_temp.toarray()
cat_col_names = ohe.get_feature_names(categorical_in_X2)
X2_cat_merge = pd.DataFrame(XS2_temp, columns = cat_col_names)
X2_cat_merge

In [None]:
#CONCATENATE DFS
# Place the scaled numeric columns and the encoded categorical columns of the
# reduced feature set side by side.
X2_merged = pd.concat([X2_scaled_num, X2_cat_merge], axis=1)
X2_merged

In [None]:
# FEATURE IMPORTANCE ON REDUCED

from sklearn.inspection import permutation_importance
# Fit and score on the same reduced matrix. The model used to be fitted on X
# and then permuted on X_merged, whose columns differ from the training columns.
ridge_reg = Ridge().fit(X2_merged, y) # Fit model
# random_state makes the shuffles, and therefore the scores, repeatable across runs.
permutation_score = permutation_importance(ridge_reg, X2_merged, y, n_repeats=10, random_state=42) # Perform Permutation
# Column-wise construction keeps score_decrease as a float column.
importance_df = pd.DataFrame({
    'feature': X2_merged.columns,
    'score_decrease': permutation_score.importances_mean,
})
importance_df.sort_values(by='score_decrease', ascending = False) # Order by importance

### TRYING ANOTHER RIDGE

In [None]:
# Reduced feature set: a by-name column selection from X.
# NOTE: this rebinds X2, which the one-hot-encoding cell used for the
# hormone_therapy column.
X2 = X[['wsc_vst', 'sex', 'age', 'education_survey1', 'creatinine', 'hdl',
       'total_cholesterol', 'triglycerides', 'uric_acid', 'heightcm',
       'weightkg', 'headcm', 'waist_girth1', 'hip_girth1', 'neck_girth1',
       'sit_sys1', 'sit_dia1', 'waisthip', 'zung_index', 'cans_cola',
       'cups_coffee', 'nondrinker', 'alcohol_wk', 'smoke', 'packs_week',
       'smoke_years', 'eval_general', 'eval_life', 'eval_health', 'workday',
       'weekend', 'naps', 'anyinsomnia', 'snore_freq', 'snore_vol',
       'choke_freq', 'apnea_freq', 'awake_freq', 'nasal_cong_none', 'any_cvd',
       'diabetes_ynd', 'asthma_ynd', 'apnea', 'ho_score', 'menopausal_status',
       'num_pregnancies', 'cholesterol_med', 'depression_med', 'dep_ssri_med',
       'htn_med', 'antihistamines_med', 'diabetes_med', 'thyroid_med', 'ahi',
       'minsao2tst', 'ptstl90', 'x0_Hyperthyroid', 'x0_Hypothyroid',
       'x0_Nodule', 'x0_Thyroid Cancer', 'x0_Unknown', 'x0_C', 'x0_N', 'x0_P']]
X2

In [None]:
#NUMERIC FEATURES
# `Scaler` is not defined or imported in any visible cell, so the call raised
# NameError. RobustScaler is used to match the `r_scaler` name and the numeric
# branch of the pipelines.
# NOTE(review): confirm RobustScaler is the intended scaler.
r_scaler = RobustScaler()
X_scaled_num = pd.DataFrame(r_scaler.fit_transform(X[numeric_features]), columns=numeric_features)
X_scaled_num

In [None]:
# Count the items of my_list by iterating over it once.
counter = sum(1 for _item in my_list)
counter

In [None]:
X

In [None]:
y

In [None]:
## FINAL
# Shorter removal list; the next cell drops these columns from X to show what remains.
features_to_remove_2 = ['zung_score', 'hipgirthm', 'waistgirthm', 'waist_girth2', 'hip_girth2', 'sit_dia2', 'sitdiam', 'sit_sys2', 'sitsysm', 'neckgirthm', 'neck_girth2', 'ldl', 'bmi', 'caffeine', 'beer_week', 'wine_week', 'hard_week', 'pack_years', 'smoke_curr', 'smoke_quit', 'bowls_day', 'cigars_day', 'trait', 'state', 'glucose', 'thyroid_ynd', 'arrhythmia_ynd', 'arthritis_ynd', 'hypertension_ynd', 'htn_acei_med',  'htn_beta_med',  'htn_diuretic_med']

In [None]:
# Column names of X that remain once the listed features are removed.
X.drop(columns=features_to_remove_2).columns

In [None]:
# Longer removal list.
# NOTE(review): several names here (e.g. 'angina_ynd', 'stroke_ynd') also appear
# among the imbalanced columns dropped earlier, and no visible cell uses this
# list - confirm whether it is still needed.
features_to_remove = ['zung_score', 'hipgirthm', 'waistgirthm', 'waist_girth2', 'hip_girth2', 'sit_dia2', 'sitdiam', 'sit_sys2', 'sitsysm', 'neckgirthm', 'neck_girth2', 'ldl', 'bmi', 'caffeine', 'beer_week', 'wine_week', 'hard_week', 'pack_years', 'smoke_curr', 'smoke_quit', 'bowls_day', 'cigars_day', 'trait', 'state', 'glucose', 'thyroid_ynd', 'angioplasty_ynd', 'angina_ynd', 'arrhythmia_ynd', 'arthritis_ynd', 'atheroscl_ynd', 'congestivehf_ynd', 'coronarybypass_ynd', 'coronary_artery_stent_ynd', 'coronary_ynd', 'emphysema_ynd', 'heartattack_ynd', 'hypertension_ynd', 'pacemaker_ynd', 'stroke_ynd', 'htn_acei_med', 'htn_alpha_med', 'htn_beta_med', 'htn_arb_med', 'htn_diuretic_med']

## NEW NOTEBOOK

In [2]:
# Absolute paths to the two input CSVs. Judging by the file names, `file` is the
# variable cross-check sheet and `data` is the dataset export.
# NOTE(review): both paths point into one user's home directory.
file = r'/home/mnm7/code/sandbox/WSC - variable cross-check_v1.csv'
data = r'/home/mnm7/code/sandbox/wsc-dataset-0.2.0.csv'

In [3]:
# Load both CSVs: `df` is read from the `file` path, `data_df` from the `data` path.
df = pd.read_csv(file)
data_df = pd.read_csv(data)

In [4]:
# Treat a missing `nasal_cong_none` value as 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# `data_df[...]` selection: the in-place form acts on an intermediate Series and
# is not guaranteed to write through to the frame (it does nothing once pandas
# Copy-on-Write is active).
data_df['nasal_cong_none'] = data_df['nasal_cong_none'].fillna(0)

In [5]:
# Rows of the cross-check sheet whose "Proposed Removal" cell is 'R'; the first
# column of those rows is later used as the list of dataset columns to drop.
removal_mask = df['Proposed Removal'].eq('R')
deleted = df.loc[removal_mask]
deleted_cols = deleted.iloc[:, 0]

In [6]:
# Discard every dataset column named in the removal list.
data_df = data_df.drop(columns=deleted_cols.to_list())

In [7]:
# Share of missing values per column, largest first.
missing_counts = data_df.isnull().sum().sort_values(ascending=False)
missing_counts / len(data_df)

psg_oxygen         0.997665
psg_cpap           0.935019
cigars_day         0.919844
bowls_day          0.917899
thyroid_problem    0.867704
                     ...   
nasal_cong_none    0.000000
awake_freq         0.000000
apnea_freq         0.000000
choke_freq         0.000000
waso               0.000000
Length: 134, dtype: float64

In [8]:
# Collect columns dominated by a single value: a column counts as imbalanced
# when its most frequent value makes up more than `balance_cutoff` of its
# non-null entries (value_counts ignores NaN by default).
balance_cutoff = 0.9
imbalanced_classes = []
for col in data_df.columns:
    # value_counts is sorted by frequency, so the first entry is the dominant
    # value's share. It is computed once per column and reused below.
    top_share = data_df[col].value_counts(normalize=True).head(1).values.astype(float)
    # An all-NaN column yields an empty array; skip it explicitly instead of
    # relying on the truth value of an empty comparison result.
    if top_share.size and top_share[0] > balance_cutoff:
        # Each entry keeps the (column name, one-element array) layout.
        imbalanced_classes.append((col, top_share))

### Imbalanced Classes

In [9]:
imbalanced_classes

[('race', array([0.95836576])),
 ('coronary_ynd', array([0.9233463])),
 ('angina_ynd', array([0.96730245])),
 ('atheroscl_ynd', array([0.9758661])),
 ('heartattack_ynd', array([0.95679253])),
 ('congestivehf_ynd', array([0.98871595])),
 ('coronarybypass_ynd', array([0.95525292])),
 ('stroke_ynd', array([0.97743191])),
 ('emphysema_ynd', array([0.97898833])),
 ('angioplasty_ynd', array([0.95758755])),
 ('pacemaker_ynd', array([0.99105058])),
 ('coronary_artery_stent_ynd', array([0.9766537])),
 ('asthma_med', array([0.92140078])),
 ('asthma_rescue_med', array([0.96031128])),
 ('asthma_control_med', array([0.93696498])),
 ('dep_maoi_med', array([0.99922179])),
 ('dep_tca_med', array([0.9766537])),
 ('htn_alpha_med', array([0.96264591])),
 ('htn_arb_med', array([0.93190661])),
 ('narcotics_med', array([0.9766537])),
 ('decongestants_med', array([0.95914397])),
 ('anxiety_med', array([0.92801556])),
 ('estrogen_med', array([0.93385214])),
 ('androgen_med', array([0.99688716])),
 ('progester

In [10]:
len(imbalanced_classes)

29

In [11]:
# Keep only the column names from the (name, share) pairs.
imbalanced_list = [entry[0] for entry in imbalanced_classes]

In [12]:
# Remove the imbalanced columns, keep the first row for each wsc_id, then use
# wsc_id as the row index.
data_df = (
    data_df
    .drop(columns=imbalanced_list)
    .drop_duplicates('wsc_id')
    .set_index('wsc_id')
)

### OHE

In [13]:
# Names of all columns whose dtype is 'object'.
object_mask = data_df.dtypes == 'object'
objlist = data_df.columns[object_mask].tolist()

In [14]:
## Binariser - encodes two-valued object columns as 0/1.
#
# The result of .replace() is assigned back to the column. Calling
# .replace(..., inplace=True) on the `data_df[col]` selection acts on an
# intermediate Series and is not guaranteed to update `data_df` (it does
# nothing once pandas Copy-on-Write is active).

for col in objlist:
    levels = data_df[col].unique()

    if len(levels) == 2:
        ## columns with 2 values eg. [N,Y] or [M,F]: first value seen -> 0, second -> 1
        ## NOTE(review): unique() counts NaN as a value, so a [Y, nan] column
        ## also lands here and its NaN is encoded as 0 or 1 - confirm this is intended.
        data_df[col] = data_df[col].replace({levels[0]: 0, levels[1]: 1})

    elif len(levels) == 3:
        #### columns with 3 values - assumed to look like [N,Y,nan]; only N and Y are mapped
        data_df[col] = data_df[col].replace({'N': 0, 'Y': 1})

In [15]:
#### ONE HOT ENCODER SCRIPT

ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)


## Only variables which need OHE
X1 = data_df[['thyroid_problem']]
X2 = data_df[['hormone_therapy']]

## For each column: fit_transform, take the encoder's column names, build a
## frame on data_df's index, then drop the indicator column created for NaN.
## errors='ignore' keeps the cell working when a column has no missing values
## (no 'x0_nan' column exists then, and a plain drop would raise KeyError).

X1t = ohe.fit_transform(X1)
colnames = list(ohe.get_feature_names())
X1df = pd.DataFrame(X1t, columns=colnames, index=data_df.index)
X1df = X1df.drop(columns='x0_nan', errors='ignore')

X2t = ohe.fit_transform(X2)
colnames = list(ohe.get_feature_names())
X2df = pd.DataFrame(X2t, columns=colnames, index=data_df.index)
X2df = X2df.drop(columns='x0_nan', errors='ignore')

## Append the indicator columns; verify_integrity=True makes concat raise if
## the combined column labels contain duplicates.
frames = [data_df, X1df, X2df]
data_df1 = pd.concat(frames, axis=1, verify_integrity=True)

## Drop the original columns now that their encoded versions are present.
data_df1 = data_df1.drop(columns=['thyroid_problem', 'hormone_therapy'])

In [16]:
data_df1

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,glucose,hdl,ldl,total_cholesterol,triglycerides,...,se,waso,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,89.0,52.0,149.0,228.0,137.0,...,83.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,96.0,77.0,122.0,222.0,117.0,...,87.1,44.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,103.0,48.0,146.0,211.0,85.0,...,83.7,58.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,131.0,39.0,107.0,173.0,135.0,...,71.4,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,102.0,37.0,135.0,228.0,281.0,...,81.6,66.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,99.0,57.0,116.0,193.0,99.0,...,78.0,95.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,206.0,36.0,112.0,184.0,181.0,...,87.5,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,122.0,73.0,125.0,234.0,178.0,...,84.6,44.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,89.0,68.0,112.0,202.0,109.0,...,83.2,47.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
#### Other confounding targets to remove
# RETAIN TST
# These columns are dropped from data_df1 in a later cell; `tst` is not listed
# because it is split off as the target `y` further down.
targs = ['tst_rem', 'tst_nrem', 'tso', 'totsleep', 'ess','p_eval_sleep', 'a_eval_slept', 'a_eval_hour',
       'a_eval_sleep', 'ps_eds', 'waso', 'se', 'sleepiness']


In [18]:
data_df1

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,glucose,hdl,ldl,total_cholesterol,triglycerides,...,se,waso,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,89.0,52.0,149.0,228.0,137.0,...,83.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,96.0,77.0,122.0,222.0,117.0,...,87.1,44.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,103.0,48.0,146.0,211.0,85.0,...,83.7,58.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,131.0,39.0,107.0,173.0,135.0,...,71.4,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,102.0,37.0,135.0,228.0,281.0,...,81.6,66.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,99.0,57.0,116.0,193.0,99.0,...,78.0,95.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,206.0,36.0,112.0,184.0,181.0,...,87.5,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,122.0,73.0,125.0,234.0,178.0,...,84.6,44.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,89.0,68.0,112.0,202.0,109.0,...,83.2,47.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Remove the columns listed in `targs`, then display the frame.
data_df1 = data_df1.drop(columns=targs)
data_df1

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,glucose,hdl,ldl,total_cholesterol,triglycerides,...,minsao2tst,ptstl90,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,89.0,52.0,149.0,228.0,137.0,...,93.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,96.0,77.0,122.0,222.0,117.0,...,94.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,103.0,48.0,146.0,211.0,85.0,...,87.2,0.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,131.0,39.0,107.0,173.0,135.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,102.0,37.0,135.0,228.0,281.0,...,84.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,99.0,57.0,116.0,193.0,99.0,...,,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,206.0,36.0,112.0,184.0,181.0,...,76.9,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,122.0,73.0,125.0,234.0,178.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,89.0,68.0,112.0,202.0,109.0,...,88.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
# # FOR CORRECT PREPROCESSING
# y = data_df1.tst
# X = data_df1.drop('tst', axis=1).fillna(0, axis=1)

In [21]:
# Target is the `tst` column; every other remaining column is a feature.
y = data_df1['tst']
# Missing feature values are replaced by 0. `axis` is omitted: it has no meaning
# for a scalar fill value, and leaving it out keeps fillna on its plain
# column-wise path.
X = data_df1.drop(columns='tst').fillna(0)

In [22]:
X

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,glucose,hdl,ldl,total_cholesterol,triglycerides,...,minsao2tst,ptstl90,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,89.0,52.0,149.0,228.0,137.0,...,93.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,96.0,77.0,122.0,222.0,117.0,...,94.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,103.0,48.0,146.0,211.0,85.0,...,87.2,0.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,131.0,39.0,107.0,173.0,135.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,102.0,37.0,135.0,228.0,281.0,...,84.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,99.0,57.0,116.0,193.0,99.0,...,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,206.0,36.0,112.0,184.0,181.0,...,76.9,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,122.0,73.0,125.0,234.0,178.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,89.0,68.0,112.0,202.0,109.0,...,88.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# Reduced feature set: a by-name column selection from X.
# NOTE: this rebinds X2, which the one-hot-encoding cell used for the
# hormone_therapy column.
X2 = X[['wsc_vst', 'sex', 'age', 'education_survey1', 'creatinine', 'hdl',
       'total_cholesterol', 'triglycerides', 'uric_acid', 'heightcm',
       'weightkg', 'headcm', 'waist_girth1', 'hip_girth1', 'neck_girth1',
       'sit_sys1', 'sit_dia1', 'waisthip', 'zung_index', 'cans_cola',
       'cups_coffee', 'nondrinker', 'alcohol_wk', 'smoke', 'packs_week',
       'smoke_years', 'eval_general', 'eval_life', 'eval_health', 'workday',
       'weekend', 'naps', 'anyinsomnia', 'snore_freq', 'snore_vol',
       'choke_freq', 'apnea_freq', 'awake_freq', 'nasal_cong_none', 'any_cvd',
       'diabetes_ynd', 'asthma_ynd', 'apnea', 'ho_score', 'menopausal_status',
       'num_pregnancies', 'cholesterol_med', 'depression_med', 'dep_ssri_med',
       'htn_med', 'antihistamines_med', 'diabetes_med', 'thyroid_med', 'ahi',
       'minsao2tst', 'ptstl90', 'x0_Hyperthyroid', 'x0_Hypothyroid',
       'x0_Nodule', 'x0_Thyroid Cancer', 'x0_Unknown', 'x0_C', 'x0_N', 'x0_P']]
X2

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,hdl,total_cholesterol,triglycerides,uric_acid,heightcm,...,minsao2tst,ptstl90,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,52.0,228.0,137.0,7.2,176,...,93.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,77.0,222.0,117.0,3.4,170,...,94.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,48.0,211.0,85.0,5.9,157,...,87.2,0.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,39.0,173.0,135.0,8.5,173,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,37.0,228.0,281.0,7.1,171,...,84.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,57.0,193.0,99.0,4.6,181,...,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,36.0,184.0,181.0,7.7,171,...,76.9,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,73.0,234.0,178.0,4.7,152,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,68.0,202.0,109.0,4.4,160,...,88.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
# Columns handled as numeric; the next cell treats every other column of X as categorical.
numeric_features = ['creatinine', 'glucose', 'hdl', 'ldl','total_cholesterol', 'triglycerides', 'uric_acid', 'weightkg', 'bmi',
       'headcm', 'waist_girth1', 'waist_girth2', 'hip_girth1', 'hip_girth2',
       'neck_girth1', 'neck_girth2', 'sit_sys1', 'sit_dia1', 'sit_sys2',
       'sit_dia2', 'hipgirthm', 'neckgirthm', 'waistgirthm', 'waisthip',
       'sitsysm', 'sitdiam', 'zung_score', 'zung_index', 'state',
       'trait', 'beer_week', 'wine_week', 'hard_week', 'bowls_day', 'packs_week', 'cigars_day',
       'smoke_years', 'pack_years','workday', 'weekend',
       'naps', 'snore_freq',
       'num_pregnancies', 'ahi',
       'minsao2tst', 'ptstl90', 'age', 'heightcm', 'cans_cola', 'cups_coffee',
       'caffeine', 'alcohol_wk', 'smoke', 'eval_general', 'eval_life','eval_health', 
       'snore_vol', 'choke_freq',
       'apnea_freq', 'awake_freq', 'ho_score']


In [26]:
# Every column of X that is not listed in numeric_features is handled as categorical.
categoric = X.drop(columns=numeric_features)
categorical_features = categoric.columns
categoric

Unnamed: 0_level_0,wsc_vst,sex,education_survey1,nondrinker,smoke_curr,smoke_quit,anyinsomnia,nasal_cong_none,arrhythmia_ynd,any_cvd,...,diabetes_med,thyroid_med,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,6.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,5.0,0.0,0.0,0.0,0,1,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,4.0,1.0,0.0,1986.0,0,0,0,0,...,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,4.0,0.0,1.0,0.0,0,0,0,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,4.0,1.0,0.0,1974.0,0,0,1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,5.0,0.0,0.0,0.0,1,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,5.0,0.0,0.0,1965.0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,3.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,3.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [28]:
# Hand-written list of column names used by the following cells.
# NOTE(review): despite the name, the next cells KEEP the numeric/categorical
# columns that appear in this list (numeric_features_2, categorical_features_2)
# — the name looks misleading; confirm the intent before renaming.
features_to_remove_2 = [
    'wsc_vst', 'sex', 'age', 'education_survey1',
    'creatinine', 'hdl', 'total_cholesterol', 'triglycerides',
    'uric_acid', 'heightcm', 'weightkg', 'headcm',
    'waist_girth1', 'hip_girth1', 'neck_girth1', 'sit_sys1',
    'sit_dia1', 'waisthip', 'zung_index', 'cans_cola',
    'cups_coffee', 'nondrinker', 'alcohol_wk', 'smoke',
    'packs_week', 'smoke_years', 'eval_general', 'eval_life',
    'eval_health', 'workday', 'weekend', 'naps',
    'anyinsomnia', 'snore_freq', 'snore_vol', 'choke_freq',
    'apnea_freq', 'awake_freq', 'nasal_cong_none', 'any_cvd',
    'diabetes_ynd', 'asthma_ynd', 'apnea', 'ho_score',
    'menopausal_status', 'num_pregnancies', 'cholesterol_med', 'depression_med',
    'dep_ssri_med', 'htn_med', 'antihistamines_med', 'diabetes_med',
    'thyroid_med', 'ahi', 'minsao2tst', 'ptstl90',
    'x0_Hyperthyroid', 'x0_Hypothyroid', 'x0_Nodule', 'x0_Thyroid Cancer',
    'x0_Unknown', 'x0_C', 'x0_N', 'x0_P',
]
features_to_remove_2

['wsc_vst',
 'sex',
 'age',
 'education_survey1',
 'creatinine',
 'hdl',
 'total_cholesterol',
 'triglycerides',
 'uric_acid',
 'heightcm',
 'weightkg',
 'headcm',
 'waist_girth1',
 'hip_girth1',
 'neck_girth1',
 'sit_sys1',
 'sit_dia1',
 'waisthip',
 'zung_index',
 'cans_cola',
 'cups_coffee',
 'nondrinker',
 'alcohol_wk',
 'smoke',
 'packs_week',
 'smoke_years',
 'eval_general',
 'eval_life',
 'eval_health',
 'workday',
 'weekend',
 'naps',
 'anyinsomnia',
 'snore_freq',
 'snore_vol',
 'choke_freq',
 'apnea_freq',
 'awake_freq',
 'nasal_cong_none',
 'any_cvd',
 'diabetes_ynd',
 'asthma_ynd',
 'apnea',
 'ho_score',
 'menopausal_status',
 'num_pregnancies',
 'cholesterol_med',
 'depression_med',
 'dep_ssri_med',
 'htn_med',
 'antihistamines_med',
 'diabetes_med',
 'thyroid_med',
 'ahi',
 'minsao2tst',
 'ptstl90',
 'x0_Hyperthyroid',
 'x0_Hypothyroid',
 'x0_Nodule',
 'x0_Thyroid Cancer',
 'x0_Unknown',
 'x0_C',
 'x0_N',
 'x0_P']

In [29]:
# Numeric column names that also appear in features_to_remove_2,
# kept in the order they have in numeric_features.
numeric_features_2 = [
    feature
    for feature in numeric_features
    if feature in features_to_remove_2
]
numeric_features_2

['creatinine',
 'hdl',
 'total_cholesterol',
 'triglycerides',
 'uric_acid',
 'weightkg',
 'headcm',
 'waist_girth1',
 'hip_girth1',
 'neck_girth1',
 'sit_sys1',
 'sit_dia1',
 'waisthip',
 'zung_index',
 'packs_week',
 'smoke_years',
 'workday',
 'weekend',
 'naps',
 'snore_freq',
 'num_pregnancies',
 'ahi',
 'minsao2tst',
 'ptstl90',
 'age',
 'heightcm',
 'cans_cola',
 'cups_coffee',
 'alcohol_wk',
 'smoke',
 'eval_general',
 'eval_life',
 'eval_health',
 'snore_vol',
 'choke_freq',
 'apnea_freq',
 'awake_freq',
 'ho_score']

In [30]:
# Preview the numeric columns of X2 (unscaled) that the scaling cell below operates on.
X2[numeric_features_2]

Unnamed: 0_level_0,creatinine,hdl,total_cholesterol,triglycerides,uric_acid,weightkg,headcm,waist_girth1,hip_girth1,neck_girth1,...,alcohol_wk,smoke,eval_general,eval_life,eval_health,snore_vol,choke_freq,apnea_freq,awake_freq,ho_score
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,0.90,52.0,228.0,137.0,7.2,85.5,59.5,92.0,97.0,39.5,...,2,0,1,1,1,1,1,1,1,69
10191,0.80,77.0,222.0,117.0,3.4,73.0,54.9,81.0,107.0,31.5,...,1,0,1,2,2,1,1,1,1,48
10198,0.90,48.0,211.0,85.0,5.9,115.5,56.7,143.0,147.5,39.5,...,0,1,1,1,3,9,1,1,1,67
10226,1.40,39.0,173.0,135.0,8.5,79.2,56.8,98.0,99.5,39.5,...,42,1,1,3,3,4,2,9,1,57
10241,0.90,37.0,228.0,281.0,7.1,100.6,57.7,113.0,111.5,42.0,...,0,1,2,1,5,2,9,9,1,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1.12,57.0,193.0,99.0,4.6,84.0,58.0,96.0,102.5,40.0,...,0,0,1,1,2,2,1,1,1,50
99651,1.40,36.0,184.0,181.0,7.7,102.5,60.0,112.0,116.0,46.0,...,1,1,1,1,2,4,9,9,2,63
99761,0.80,73.0,234.0,178.0,4.7,94.0,54.5,104.0,132.0,35.5,...,3,0,1,1,2,3,9,9,9,56
99782,0.80,68.0,202.0,109.0,4.4,63.5,56.0,81.0,95.0,36.0,...,2,0,1,2,1,9,1,1,1,63


In [32]:
# NUMERIC FEATURES
# Robust-scale the selected numeric columns of X2 and wrap the resulting array
# back into a DataFrame with the same column names.
# No index is passed, so the new frame gets a fresh positional index and its
# rows correspond to X2's rows by position only.
# NOTE(review): the scaler is fitted on every row of X2, before any
# train/test split in this notebook — confirm that is intended.
r_scaler = RobustScaler()
scaled_values = r_scaler.fit_transform(X2[numeric_features_2])
X2_scaled_num = pd.DataFrame(scaled_values, columns=numeric_features_2)
X2_scaled_num

Unnamed: 0,creatinine,hdl,total_cholesterol,triglycerides,uric_acid,weightkg,headcm,waist_girth1,hip_girth1,neck_girth1,...,alcohol_wk,smoke,eval_general,eval_life,eval_health,snore_vol,choke_freq,apnea_freq,awake_freq,ho_score
0,-0.40,0.157895,0.557692,0.120879,0.85,-0.090909,0.735294,-0.318182,-0.606061,0.090909,...,0.0,-1.0,0.0,-1.0,-1.0,-0.333333,-0.125,0.0,0.0,0.428571
1,-0.80,1.473684,0.442308,-0.098901,-1.05,-0.545455,-0.617647,-0.818182,0.000000,-1.363636,...,-0.2,-1.0,0.0,0.0,0.0,-0.333333,-0.125,0.0,0.0,-1.071429
2,-0.40,-0.052632,0.230769,-0.450549,0.20,1.000000,-0.088235,2.000000,2.454545,0.090909,...,-0.4,0.0,0.0,-1.0,1.0,1.000000,-0.125,0.0,0.0,0.285714
3,1.60,-0.526316,-0.500000,0.098901,1.50,-0.320000,-0.058824,-0.045455,-0.454545,0.090909,...,8.0,0.0,0.0,1.0,1.0,0.166667,0.000,1.0,0.0,-0.428571
4,-0.40,-0.631579,0.557692,1.703297,0.80,0.458182,0.205882,0.636364,0.272727,0.545455,...,-0.4,0.0,1.0,-1.0,3.0,-0.166667,0.875,1.0,0.0,-0.071429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,0.48,0.421053,-0.115385,-0.296703,-0.45,-0.145455,0.294118,-0.136364,-0.272727,0.181818,...,-0.4,-1.0,0.0,-1.0,0.0,-0.166667,-0.125,0.0,0.0,-0.928571
1119,1.60,-0.684211,-0.288462,0.604396,1.10,0.527273,0.882353,0.590909,0.545455,1.272727,...,-0.2,0.0,0.0,-1.0,0.0,0.166667,0.875,1.0,1.0,0.000000
1120,-0.80,1.263158,0.673077,0.571429,-0.40,0.218182,-0.735294,0.227273,1.515152,-0.636364,...,0.2,-1.0,0.0,-1.0,0.0,0.000000,0.875,1.0,8.0,-0.500000
1121,-0.80,1.000000,0.057692,-0.186813,-0.55,-0.890909,-0.294118,-0.818182,-0.727273,-0.545455,...,0.0,-1.0,0.0,0.0,-1.0,1.000000,-0.125,0.0,0.0,0.000000


In [34]:
# Categorical column names that also appear in features_to_remove_2,
# kept in the order they have in categorical_features.
categorical_features_2 = [
    feature
    for feature in categorical_features
    if feature in features_to_remove_2
]
categorical_features_2

['wsc_vst',
 'sex',
 'education_survey1',
 'nondrinker',
 'anyinsomnia',
 'nasal_cong_none',
 'any_cvd',
 'diabetes_ynd',
 'asthma_ynd',
 'apnea',
 'menopausal_status',
 'cholesterol_med',
 'depression_med',
 'dep_ssri_med',
 'htn_med',
 'antihistamines_med',
 'diabetes_med',
 'thyroid_med',
 'x0_Hyperthyroid',
 'x0_Hypothyroid',
 'x0_Nodule',
 'x0_Thyroid Cancer',
 'x0_Unknown',
 'x0_C',
 'x0_N',
 'x0_P']

In [38]:
# Display the feature matrix X (this is the frame passed to train_test_split further down).
X

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,glucose,hdl,ldl,total_cholesterol,triglycerides,...,minsao2tst,ptstl90,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,89.0,52.0,149.0,228.0,137.0,...,93.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,96.0,77.0,122.0,222.0,117.0,...,94.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,103.0,48.0,146.0,211.0,85.0,...,87.2,0.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,131.0,39.0,107.0,173.0,135.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,102.0,37.0,135.0,228.0,281.0,...,84.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,99.0,57.0,116.0,193.0,99.0,...,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,206.0,36.0,112.0,184.0,181.0,...,76.9,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,122.0,73.0,125.0,234.0,178.0,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,89.0,68.0,112.0,202.0,109.0,...,88.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [37]:
# Display X2, the frame from which the numeric and categorical subsets are taken.
X2

Unnamed: 0_level_0,wsc_vst,sex,age,education_survey1,creatinine,hdl,total_cholesterol,triglycerides,uric_acid,heightcm,...,minsao2tst,ptstl90,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
wsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10119,1,0,52,6.0,0.90,52.0,228.0,137.0,7.2,176,...,93.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10191,1,1,50,5.0,0.80,77.0,222.0,117.0,3.4,170,...,94.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10198,1,1,57,4.0,0.90,48.0,211.0,85.0,5.9,157,...,87.2,0.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10226,1,0,55,4.0,1.40,39.0,173.0,135.0,8.5,173,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10241,1,1,51,4.0,0.90,37.0,228.0,281.0,7.1,171,...,84.8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99481,1,0,60,5.0,1.12,57.0,193.0,99.0,4.6,181,...,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99651,1,0,69,5.0,1.40,36.0,184.0,181.0,7.7,171,...,76.9,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99761,1,1,71,3.0,0.80,73.0,234.0,178.0,4.7,152,...,89.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
99782,1,1,52,3.0,0.80,68.0,202.0,109.0,4.4,160,...,88.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
# Categorical subset of X2. reset_index() replaces the wsc_id index with a
# positional one and moves wsc_id into an ordinary column.
# NOTE(review): wsc_id therefore travels along as a column of X2_cat (and of
# anything concatenated from it) — confirm it should not be dropped with
# drop=True before modelling.
X2_cat = X2.loc[:, categorical_features_2].reset_index()
X2_cat

Unnamed: 0,wsc_id,wsc_vst,sex,education_survey1,nondrinker,anyinsomnia,nasal_cong_none,any_cvd,diabetes_ynd,asthma_ynd,...,diabetes_med,thyroid_med,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
0,10119,1,0,6.0,0.0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10191,1,1,5.0,0.0,0,1,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,10198,1,1,4.0,1.0,0,0,0,0,0,...,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10226,1,0,4.0,0.0,0,0,1,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10241,1,1,4.0,1.0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,99481,1,0,5.0,0.0,1,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1119,99651,1,0,5.0,0.0,0,0,0,0,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1120,99761,1,1,3.0,0.0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1121,99782,1,1,3.0,0.0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [42]:
# CONCATENATE DFS
# Put the scaled numeric columns and the categorical columns side by side.
# pd.concat aligns rows on the index labels of the two frames.
X2_merged = pd.concat((X2_scaled_num, X2_cat), axis='columns')
X2_merged

Unnamed: 0,creatinine,hdl,total_cholesterol,triglycerides,uric_acid,weightkg,headcm,waist_girth1,hip_girth1,neck_girth1,...,diabetes_med,thyroid_med,x0_Hyperthyroid,x0_Hypothyroid,x0_Nodule,x0_Thyroid Cancer,x0_Unknown,x0_C,x0_N,x0_P
0,-0.40,0.157895,0.557692,0.120879,0.85,-0.090909,0.735294,-0.318182,-0.606061,0.090909,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.80,1.473684,0.442308,-0.098901,-1.05,-0.545455,-0.617647,-0.818182,0.000000,-1.363636,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.40,-0.052632,0.230769,-0.450549,0.20,1.000000,-0.088235,2.000000,2.454545,0.090909,...,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.60,-0.526316,-0.500000,0.098901,1.50,-0.320000,-0.058824,-0.045455,-0.454545,0.090909,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.40,-0.631579,0.557692,1.703297,0.80,0.458182,0.205882,0.636364,0.272727,0.545455,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,0.48,0.421053,-0.115385,-0.296703,-0.45,-0.145455,0.294118,-0.136364,-0.272727,0.181818,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1119,1.60,-0.684211,-0.288462,0.604396,1.10,0.527273,0.882353,0.590909,0.545455,1.272727,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1120,-0.80,1.263158,0.673077,0.571429,-0.40,0.218182,-0.735294,0.227273,1.515152,-0.636364,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1121,-0.80,1.000000,0.057692,-0.186813,-0.55,-0.890909,-0.294118,-0.818182,-0.727273,-0.545455,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
# Hold out 10% of the rows as a validation set (X_val, y_val); the remaining
# 90% (X_, y_) is split again into train/test in the next cell.
# random_state pins the shuffle so the split is reproducible.
# train_test_split comes from the notebook's top import cell, so it is not
# re-imported here.
# NOTE(review): this splits X, not the scaled X2_merged assembled earlier —
# confirm which feature matrix the model is meant to use.
X_, X_val, y_, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [45]:
# Split the non-validation rows (X_, y_) 70/30 into training and test sets.
# random_state pins the shuffle so the split is reproducible.
# train_test_split comes from the notebook's top import cell, so it is not
# re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, random_state=42)

In [46]:
# Ridge regression estimator; no hyperparameters are overridden, so scikit-learn's defaults apply.
model = Ridge()

In [47]:
# Fit the Ridge model on the training split.
# NOTE(review): the cross-validation cell below builds its own Ridge() instead of
# reusing this fitted estimator — confirm whether this fit is needed.
model.fit(X_train, y_train)

In [48]:
# 5-fold cross-validation of a fresh, default Ridge on the training split.
# The scorer is the *negated* mean absolute error, so the averaged value is
# <= 0 and values closer to 0 are better.
cv_results = cross_validate(
    Ridge(),
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)

cv_results['test_score'].mean()

-49.4607539871496

In [49]:
# Naive baseline: predict the mean of the training target for every test row,
# then report its mean absolute error on the test target.
train_mean = np.mean(y_train)
y_baseline = pd.Series([train_mean] * len(y_test))
baseline_mae = mean_absolute_error(y_test, y_baseline)
print(f'Mean Absolute Error (Baseline Prediction):{baseline_mae}')

Mean Absolute Error (Baseline Prediction):47.0436297095056
