In [None]:
### Script for baseline OLS of WASO

In [84]:
##Generic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [85]:
##Script specific imports for feature encoding

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
# Show scikit-learn estimators as HTML diagrams when they are displayed in the notebook.
set_config(display='diagram')

In [86]:
# Input locations: the variable cross-check workbook and the WSC dataset export.
# NOTE(review): this is an absolute, machine-specific directory; point SLEEP_DIR
# at your own copy of the data before running.
SLEEP_DIR = '/Users/anand/Documents/Sleep'
file = f'{SLEEP_DIR}/WSC - variable cross-check_v1.xlsx'
data = f'{SLEEP_DIR}/wsc-dataset-0.2.0.csv'

In [149]:
# Load the variable cross-check workbook (it carries the 'Proposed Removal' flags
# used in the next cell) and the WSC dataset itself.
df = pd.read_excel(file)
data_df = pd.read_csv(data)

In [150]:
# Rows of the cross-check sheet flagged 'R' in 'Proposed Removal'; the first
# column of those rows holds the variable names.
removal_mask = df['Proposed Removal'].eq('R')
deleted = df.loc[removal_mask]
deleted_cols = deleted.iloc[:, 0]

In [151]:
# Remove the variables proposed for removal from the dataset.
columns_to_remove = deleted_cols.to_list()
data_df = data_df.drop(columns=columns_to_remove)

In [152]:
## sort out nasal_cong_none: a missing entry becomes 0 and 'Y' becomes 1.
# The result is assigned back to the column. Calling replace(..., inplace=True) on
# the attribute accessor mutates a temporary Series, which leaves data_df unchanged
# once pandas Copy-on-Write is active.
data_df['nasal_cong_none'] = data_df['nasal_cong_none'].replace({np.nan: 0, 'Y': 1})

In [153]:
# Flag near-constant columns: any column whose most frequent value accounts for
# more than `balance_cutoff` of its non-missing entries.
balance_cutoff = 0.975
imbalanced_classes = []
for position, col in enumerate(data_df.columns):
    # Positional access always yields a single Series, even if a label is duplicated.
    # The share is kept as a one-element float array, as in the displayed output.
    top_share = (
        data_df.iloc[:, position]
        .value_counts(normalize=True)
        .head(1)
        .values
        .astype(float)
    )
    # A column with no non-missing values has no counts at all; skip it instead of
    # testing the truth value of an empty array.
    if top_share.size and top_share[0] > balance_cutoff:
        imbalanced_classes.append((col, top_share))

In [154]:
# Inspect the flagged columns together with the share held by their most common value.
imbalanced_classes

[('atheroscl_ynd', array([0.9758661])),
 ('congestivehf_ynd', array([0.98871595])),
 ('stroke_ynd', array([0.97743191])),
 ('emphysema_ynd', array([0.97898833])),
 ('pacemaker_ynd', array([0.99105058])),
 ('coronary_artery_stent_ynd', array([0.9766537])),
 ('dep_maoi_med', array([0.99922179])),
 ('dep_tca_med', array([0.9766537])),
 ('narcotics_med', array([0.9766537])),
 ('androgen_med', array([0.99688716])),
 ('stimulants_med', array([0.9848249])),
 ('psg_oxygen', array([1.]))]

In [155]:
# Keep only the column name from each (name, share) pair.
imbalanced_list = [entry[0] for entry in imbalanced_classes]

In [156]:
# Names of the columns that are about to be dropped as near-constant.
imbalanced_list

['atheroscl_ynd',
 'congestivehf_ynd',
 'stroke_ynd',
 'emphysema_ynd',
 'pacemaker_ynd',
 'coronary_artery_stent_ynd',
 'dep_maoi_med',
 'dep_tca_med',
 'narcotics_med',
 'androgen_med',
 'stimulants_med',
 'psg_oxygen']

In [157]:
# Discard the near-constant columns, keep one row per wsc_id (the first
# occurrence) and make wsc_id the index.
data_df = (
    data_df
    .drop(columns=imbalanced_list)
    .drop_duplicates('wsc_id')
    .set_index('wsc_id')
)
#data_df.fillna(0, inplace=True)

In [158]:
# Names of all columns stored with the generic 'object' dtype, in column order.
objlist = [name for name, dtype in data_df.dtypes.items() if dtype == 'object']

In [159]:
## Binariser - encodes two-level text columns as 0/1 whether or not NaNs are present.
#
# Yes/No columns always use N -> 0, Y -> 1 and keep missing values missing. The
# coding therefore no longer depends on which value happens to appear first, and a
# [Y, nan] column is no longer coded with NaN as one of the two classes.
# Results are assigned back to the column: replace(..., inplace=True) on
# data_df[col] acts on a temporary Series once pandas Copy-on-Write is active.

YES_NO_CODES = {'N': 0, 'Y': 1}

for col in objlist:
    levels = data_df[col].unique()               # distinct values, NaN counts as one
    observed = data_df[col].dropna().unique()    # distinct non-missing values

    if len(levels) == 3:
        #### ALL columns with 3 variables - which appear like [N,Y,nan]
        codes = YES_NO_CODES
    elif len(levels) == 2:
        ##columns with 2 variables eg. [N,Y] or [M,F]
        if set(observed) <= set(YES_NO_CODES):
            codes = YES_NO_CODES
        else:
            # Any other pair keeps the order-of-appearance coding.
            # NOTE(review): for a [value, nan] pair this still codes NaN as a class;
            # confirm that is intended for such columns.
            codes = {levels[0]: 0, levels[1]: 1}
    else:
        continue

    data_df[col] = data_df[col].replace(codes)

In [160]:
#### ONE HOT ENCODER SCRIPT

def one_hot_frame(frame, column):
    """One-hot encode a single column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table; its index is reused for the result.
    column : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        One indicator column per category, named ``<column>_<category>``.
        The indicator for missing values (``<column>_nan``) is dropped when
        present, so rows with a missing value are all zeros.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(frame[[column]])
    # The encoder returns a sparse matrix by default. Densify here rather than via
    # the constructor, whose dense-output keyword changed name across releases.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # get_feature_names was superseded by get_feature_names_out; use whichever exists.
    feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    # Prefix with the source column so the two encodings cannot produce clashing
    # (or anonymous 'x0_*') column names.
    indicators = pd.DataFrame(
        encoded,
        columns=list(feature_names([column])),
        index=frame.index,
    )
    # A column without missing values has no '<column>_nan' indicator to drop.
    return indicators.drop(columns=f'{column}_nan', errors='ignore')


## Only variables which need OHE
ohe_columns = ['thyroid_problem', 'hormone_therapy']

X1df = one_hot_frame(data_df, 'thyroid_problem')
X2df = one_hot_frame(data_df, 'hormone_therapy')

frames = [data_df, X1df, X2df]
data_df = pd.concat(frames, axis=1)

##drop the original (un-encoded) columns
data_df = data_df.drop(columns=ohe_columns)

In [220]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [162]:
#### Keep main target (WASO)
y = data_df['waso']

In [145]:
#### Other outcome-like ("confounding target") variables to exclude from the predictors.
#### NOTE(review): the target 'waso' itself is not part of this list.

targs = [
    'tst',
    'tst_rem',
    'tst_nrem',
    'tso',
    'totsleep',
    'ess',
    'p_eval_sleep',
    'a_eval_slept',
    'a_eval_hour',
    'a_eval_sleep',
    'ps_eds',
    'se',
    'sleepiness',
]

In [164]:
## Drop confounding targets and the target itself.
# `targs` does not list 'waso', so dropping `targs` alone leaves the value being
# predicted inside the feature table. Labels are de-duplicated in case 'waso' is
# ever added to `targs` as well.

X = data_df.drop(columns=list(dict.fromkeys([*targs, 'waso'])))

In [166]:
#### Numerical features to impute and scale - categorical features and targets
#### are not part of this list.

numeric_features = """
    creatinine glucose hdl ldl total_cholesterol triglycerides uric_acid
    weightkg bmi headcm
    waist_girth1 waist_girth2 hip_girth1 hip_girth2 neck_girth1 neck_girth2
    sit_sys1 sit_dia1 sit_sys2 sit_dia2
    hipgirthm neckgirthm waistgirthm waisthip sitsysm sitdiam
    zung_score zung_index state trait
    beer_week wine_week hard_week bowls_day packs_week cigars_day
    smoke_years pack_years
    workday weekend naps snore_freq
    num_pregnancies psg_cpap ahi minsao2tst ptstl90
    age heightcm
    cans_cola cups_coffee caffeine alcohol_wk
    eval_general eval_life eval_health
    snore_vol choke_freq apnea_freq awake_freq ho_score
""".split()

In [167]:
#### Ordinal variables to be aware of - should not be scaled
# NOTE(review): `ords` is defined here but not referenced by any other cell in this
# notebook as shown.
ords = ['menopausal_status']


In [226]:
# Numeric preprocessing: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

In [227]:
# Apply the numeric pipeline to the listed numeric features only. Every other
# column of the input is discarded: remainder='drop' is the ColumnTransformer
# default and is spelled out here to make that visible.
numeric_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[numeric_step], remainder='drop')

In [229]:
# Full model: preprocessing followed by an ordinary least-squares regression.
pipe = Pipeline(steps=[('preprocessor', preprocessor),('regression', LinearRegression())])


In [230]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [231]:
# 10-fold cross-validation on the training split, scored by R^2 and negated MAE;
# the last line lists the keys available in the result.
scoring_metrics = ('r2', 'neg_mean_absolute_error')
cv_results = cross_validate(pipe, X_train, y_train, cv=10, scoring=scoring_metrics)
sorted(cv_results)

['fit_time', 'score_time', 'test_neg_mean_absolute_error', 'test_r2']

In [232]:
# Mean of the per-fold scores. The metric is the negated mean absolute error, so
# the value is negative and closer to zero is better.
cv_results['test_neg_mean_absolute_error'].mean()

-29.76890230103651

In [234]:
# Mean R^2 across the cross-validation folds.
cv_results['test_r2'].mean()

0.04916012571149275

In [233]:
# Smallest and largest target value in the training split.
y_train.min(), y_train.max()

(3.0, 291.5)