In [442]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [443]:
# Input locations used by the loading cell below.
# NOTE(review): both are absolute paths on one user's machine; the notebook
# will not run elsewhere without editing them.

# CSV export of the dataset (0.2.0 per the filename).
data = r'/Users/richard/Downloads/wsc-dataset-0.2.0.csv'
# Excel workbook holding the per-variable cross-check / removal flags.
file = r'/Users/richard/Documents/WSC - variable cross-check_v1.xlsx'

In [444]:
# Load the variable cross-check workbook (first sheet, pandas default) and the dataset CSV.
# NOTE(review): an earlier comment here said an `nrows` argument had been added because
# the Excel file was pulling in thousands of rows, but no `nrows` is passed in this call.
df = pd.read_excel(io=file)
data_df = pd.read_csv(filepath_or_buffer=data)

In [445]:
# Show the last five rows of the cross-check sheet to confirm where the data ends.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Check,Patient input,Dr Input,Calculated,Feature,Target,Proposed Removal,Action to take,Composite,Unnamed: 10
234,se,,,,,,T,,,,
235,waso,,,D,,,T,,,,
236,mean_desat_perc,,,,,,,R,,,
237,mean_desat_dur,,,,,,,R,,,
238,cvd_stroke_death,,,,,,,,,,


In [446]:
# Replace every missing value in the dataset with 0, mutating data_df in place.
# NOTE(review): this runs before the null-fraction check further down, so that
# check reports 0.0 for every column - confirm the ordering is intended.
data_df.fillna(value=0, inplace=True)

In [450]:
# Rows of the cross-check sheet whose 'Proposed Removal' flag is 'R'.
removal_mask = df['Proposed Removal'].eq('R')
deleted = df.loc[removal_mask]
# The first column of the sheet is taken as the list of variable names to remove.
deleted_cols = deleted.iloc[:, 0]

In [451]:
# Display the variable names flagged for removal as a plain Python list.
deleted_cols.tolist()

['vst_year',
 'arm_left1',
 'arm_left2',
 'arm_right1',
 'arm_right2',
 'ankle_left1',
 'ankle_left2',
 'ankle_right1',
 'ankle_right2',
 'sit_sys_repeat',
 'sit_dia_repeat',
 'arm_left_repeat',
 'arm_right_repeat',
 'ankle_left_repeat',
 'ankle_right_repeat',
 'sit_sys_repeat2',
 'sit_dia_repeat2',
 'arm_left_repeat2',
 'arm_right_repeat2',
 'ankle_left_repeat2',
 'ankle_right_repeat2',
 'ep1',
 'ep2',
 'ep3',
 'ep4',
 'ep5',
 'ep6',
 'ep7',
 'ep8',
 'zung1_scored',
 'zung2_scored',
 'zung3_scored',
 'zung4_scored',
 'zung5_scored',
 'zung6_scored',
 'zung7_scored',
 'zung8_scored',
 'zung9_scored',
 'zung10_scored',
 'zung11_scored',
 'zung12_scored',
 'zung13_scored',
 'zung14_scored',
 'zung15_scored',
 'zung16_scored',
 'zung17_scored',
 'zung18_scored',
 'zung19_scored',
 'zung20_scored',
 'smoker',
 'totsleepnap',
 'ps_diff',
 'ps_backsleep',
 'ps_wakerepeat',
 'ps_tooearly',
 'ps_notrested',
 'ps_wakeup',
 'ps_nightmare',
 'ninsomnia',
 'ninsomniadays',
 'nasal_cong_today',
 'n

In [452]:
# Drop every dataset column that the cross-check sheet flags for removal.
# Raises KeyError if any flagged name is not a column of data_df.
data_df = data_df.drop(columns=deleted_cols.tolist())

In [453]:
# Fraction of missing values per column, largest first.
null_counts = data_df.isna().sum()
null_counts.sort_values(ascending=False) / len(data_df)

wsc_id               0.0
stroke_ynd           0.0
num_pregnancies      0.0
hormone_therapy      0.0
menopausal_status    0.0
                    ... 
zung_index           0.0
zung_score           0.0
ess                  0.0
sitdiam              0.0
waso                 0.0
Length: 134, dtype: float64

In [454]:
# Collect columns where a single value accounts for more than `balance_cutoff`
# of the rows. Each entry is (column label, 1-element float array holding the
# share of the most frequent value).
balance_cutoff = 0.9
imbalanced_classes = []
# Iterate by position so each column is selected as a single Series even if a
# label is duplicated, and avoid rebinding `_` (IPython's last-output variable).
for position, col in enumerate(data_df.columns):
    # Compute the normalised frequency table once per column.
    top_share = (
        data_df.iloc[:, position]
        .value_counts(normalize=True)
        .head(1)
        .values.astype(float)
    )
    # `top_share` is empty when the column has no countable values; test the
    # scalar explicitly instead of relying on the truth value of an array.
    if top_share.size and top_share[0] > balance_cutoff:
        imbalanced_classes.append((col, top_share))

In [455]:
# Display the (column, share of most frequent value) pairs collected by the loop above.
imbalanced_classes

[('race', array([0.95836576])),
 ('bowls_day', array([0.97003891])),
 ('cigars_day', array([0.97276265])),
 ('coronary_ynd', array([0.9233463])),
 ('angina_ynd', array([0.96692607])),
 ('atheroscl_ynd', array([0.97548638])),
 ('heartattack_ynd', array([0.95642023])),
 ('congestivehf_ynd', array([0.98871595])),
 ('coronarybypass_ynd', array([0.95525292])),
 ('stroke_ynd', array([0.97743191])),
 ('emphysema_ynd', array([0.97898833])),
 ('angioplasty_ynd', array([0.95758755])),
 ('pacemaker_ynd', array([0.99105058])),
 ('coronary_artery_stent_ynd', array([0.9766537])),
 ('asthma_med', array([0.92140078])),
 ('asthma_rescue_med', array([0.96031128])),
 ('asthma_control_med', array([0.93696498])),
 ('dep_maoi_med', array([0.99922179])),
 ('dep_tca_med', array([0.9766537])),
 ('htn_alpha_med', array([0.96264591])),
 ('htn_arb_med', array([0.93190661])),
 ('narcotics_med', array([0.9766537])),
 ('decongestants_med', array([0.95914397])),
 ('anxiety_med', array([0.92801556])),
 ('estrogen_med'

In [456]:
# Number of columns that exceeded the balance cutoff.
len(imbalanced_classes)

31

In [457]:
# Keep only the column label (first element) of each flagged entry.
imbalanced_list = [entry[0] for entry in imbalanced_classes]

In [458]:
# Display the labels of the columns that will be dropped as imbalanced.
imbalanced_list

['race',
 'bowls_day',
 'cigars_day',
 'coronary_ynd',
 'angina_ynd',
 'atheroscl_ynd',
 'heartattack_ynd',
 'congestivehf_ynd',
 'coronarybypass_ynd',
 'stroke_ynd',
 'emphysema_ynd',
 'angioplasty_ynd',
 'pacemaker_ynd',
 'coronary_artery_stent_ynd',
 'asthma_med',
 'asthma_rescue_med',
 'asthma_control_med',
 'dep_maoi_med',
 'dep_tca_med',
 'htn_alpha_med',
 'htn_arb_med',
 'narcotics_med',
 'decongestants_med',
 'anxiety_med',
 'estrogen_med',
 'androgen_med',
 'progesterone_med',
 'sedative_med',
 'stimulants_med',
 'psg_cpap',
 'psg_oxygen']

In [459]:
# Final clean-up of the dataset, written as one chained expression so that
# `data_df` is rebound only if every step succeeds. The previous sequence of
# in-place calls left the frame half-modified when a later step raised.
data_df = (
    data_df
    .drop(columns=imbalanced_list)      # remove the columns flagged as imbalanced
    .drop_duplicates(subset='wsc_id')   # keep the first row for each wsc_id
    .set_index('wsc_id')                # wsc_id becomes the row index
    .fillna(0)                          # any remaining missing values become 0
)