# First Exploratory Notebook

## Data Preparation

### Generic Imports and Preparation

In [1]:
##Generic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
##Script specific imports for feature encoding

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')

In [3]:
# Absolute, machine-specific input locations: edit these before running the notebook elsewhere.
# `file` is the variable cross-check sheet and `data` is the dataset CSV; both are read with pd.read_csv.
file = r'/home/mnm7/code/sandbox/WSC - variable cross-check_v1.csv'
data = r'/home/mnm7/code/sandbox/wsc-dataset-0.2.0.csv'

In [4]:
# Load both CSVs: `df` holds the variable cross-check sheet, `data_df` the dataset itself.
df = pd.read_csv(file)
data_df = pd.read_csv(data)

In [5]:
# Rows of the cross-check sheet whose 'Proposed Removal' entry is 'R';
# the first column of those rows holds the labels to remove.
flagged_for_removal = df['Proposed Removal'] == 'R'
deleted = df.loc[flagged_for_removal]
deleted_cols = deleted.iloc[:, 0]

In [6]:
# Remove every column listed in `deleted_cols` from the dataset.
data_df = data_df.drop(columns=deleted_cols.to_list())

In [7]:
# Fraction of missing values per column, most incomplete columns first.
null_counts = data_df.isnull().sum()
null_counts.sort_values(ascending=False).div(len(data_df))

psg_oxygen         0.997665
psg_cpap           0.935019
cigars_day         0.919844
bowls_day          0.917899
thyroid_problem    0.867704
                     ...   
coronary_ynd       0.000000
awake_freq         0.000000
apnea_freq         0.000000
choke_freq         0.000000
waso               0.000000
Length: 134, dtype: float64

### Imbalanced Classes

In [8]:
# Flag columns dominated by a single value: a column is recorded when its most
# frequent value holds more than `balance_cutoff` of the counted rows
# (value_counts(normalize=True) leaves NaN out of the count by default).
balance_cutoff = 0.9
imbalanced_classes = []
for position, col in enumerate(data_df.columns):
    # Positional access always yields a single Series, even for a duplicated label.
    # A named loop variable is used instead of `_`, which IPython reserves for
    # the previous cell output.
    shares = data_df.iloc[:, position].value_counts(normalize=True)
    if shares.empty:
        # Nothing to count (e.g. an all-NaN column): there is no dominant value,
        # and comparing an empty array inside `if` is ambiguous.
        continue
    # One-element float array holding the top share (same stored shape as before).
    top_share = shares.head(1).values.astype(float)
    if top_share[0] > balance_cutoff:
        imbalanced_classes.append((col, top_share))

In [9]:
# Display each flagged column together with the share held by its most frequent value.
imbalanced_classes

[('race', array([0.95836576])),
 ('nasal_cong_none', array([1.])),
 ('coronary_ynd', array([0.9233463])),
 ('angina_ynd', array([0.96730245])),
 ('atheroscl_ynd', array([0.9758661])),
 ('heartattack_ynd', array([0.95679253])),
 ('congestivehf_ynd', array([0.98871595])),
 ('coronarybypass_ynd', array([0.95525292])),
 ('stroke_ynd', array([0.97743191])),
 ('emphysema_ynd', array([0.97898833])),
 ('angioplasty_ynd', array([0.95758755])),
 ('pacemaker_ynd', array([0.99105058])),
 ('coronary_artery_stent_ynd', array([0.9766537])),
 ('asthma_med', array([0.92140078])),
 ('asthma_rescue_med', array([0.96031128])),
 ('asthma_control_med', array([0.93696498])),
 ('dep_maoi_med', array([0.99922179])),
 ('dep_tca_med', array([0.9766537])),
 ('htn_alpha_med', array([0.96264591])),
 ('htn_arb_med', array([0.93190661])),
 ('narcotics_med', array([0.9766537])),
 ('decongestants_med', array([0.95914397])),
 ('anxiety_med', array([0.92801556])),
 ('estrogen_med', array([0.93385214])),
 ('androgen_med',

In [10]:
# Number of columns flagged as imbalanced.
len(imbalanced_classes)

30

In [11]:
# Keep only the column name (first element) of every flagged entry.
imbalanced_list = [entry[0] for entry in imbalanced_classes]

In [12]:
# Names of the flagged columns (these are the labels dropped from data_df in the following cell).
imbalanced_list

['race',
 'nasal_cong_none',
 'coronary_ynd',
 'angina_ynd',
 'atheroscl_ynd',
 'heartattack_ynd',
 'congestivehf_ynd',
 'coronarybypass_ynd',
 'stroke_ynd',
 'emphysema_ynd',
 'angioplasty_ynd',
 'pacemaker_ynd',
 'coronary_artery_stent_ynd',
 'asthma_med',
 'asthma_rescue_med',
 'asthma_control_med',
 'dep_maoi_med',
 'dep_tca_med',
 'htn_alpha_med',
 'htn_arb_med',
 'narcotics_med',
 'decongestants_med',
 'anxiety_med',
 'estrogen_med',
 'androgen_med',
 'progesterone_med',
 'sedative_med',
 'stimulants_med',
 'psg_cpap',
 'psg_oxygen']

In [None]:
# Drop the imbalanced columns, keep the first row found for each wsc_id,
# then use wsc_id as the index.
data_df = (
    data_df
    .drop(columns=imbalanced_list)
    .drop_duplicates('wsc_id')
    .set_index('wsc_id')
)

### OHE

In [None]:
## Features to one-hot encode (OHE): thyroid_problem (?), hormone_therapy
## Features to binarise: all the *_ynd columns, apnea, smoke, smoke_curr, sex, nondrinker, sleepiness, nasal_cong_none

In [None]:
# Names of every column whose dtype compares equal to 'object'.
objlist = [name for name, dtype in data_df.dtypes.items() if dtype == 'object']

In [None]:
## Binariser - encodes two-class object columns as 0/1; NaN is never treated as a class.

for col in objlist:
    # Distinct observed values with NaN excluded, so a missing value can neither
    # be counted as one of the two classes nor be given a 0/1 code.
    observed = data_df[col].dropna().unique()
    has_missing = data_df[col].isna().any()

    # Only columns with two or three distinct entries (NaN counted once) are
    # candidates, e.g. [N, Y], [M, F] or [N, Y, nan].
    n_distinct = len(observed) + int(has_missing)
    if n_distinct not in (2, 3):
        continue

    if set(observed) <= {'N', 'Y'}:
        # Yes/No flags always get the same coding, regardless of which value
        # appears first and of whether the column contains NaN.
        mapping = {'N': 0, 'Y': 1}
    elif n_distinct == 2 and not has_missing:
        # Other complete two-valued columns, e.g. [M, F]: codes follow the order
        # of first appearance in the column.
        mapping = {observed[0]: 0, observed[1]: 1}
    else:
        # Anything else (e.g. three real categories) is left untouched rather
        # than being partially encoded into a mixed-type column.
        continue

    # Assign the result back to the frame. Calling replace(..., inplace=True) on
    # a column selection is chained assignment, which does not write through to
    # data_df when pandas Copy-on-Write is active.
    # Every non-null value is a key of `mapping`, so map() only leaves NaN as NaN.
    data_df[col] = data_df[col].map(mapping)

In [None]:
#### ONE HOT ENCODER SCRIPT

## NOTE(review): `sparse=` and `get_feature_names()` are the pre-1.2 scikit-learn
## spellings (later `sparse_output=` / `get_feature_names_out()`); kept as-is so the
## generated 'x0_...' column names do not change - confirm the installed version.
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)


## Only variables which need OHE
X1 = data_df[['thyroid_problem']]
X2 = data_df[['hormone_therapy']]

## For each variable: fit/transform, read the generated column names, wrap the
## array in a DataFrame and drop the indicator column generated for NaN.
## The encoded frames reuse data_df's index: pd.concat(axis=1) aligns rows on
## index labels, so a default 0..n-1 index would not line up with data_df's own
## index and the encoded values would land on the wrong (or extra, all-NaN) rows.
## NOTE(review): both encoders emit 'x0_'-prefixed names, so a category value
## shared by the two variables would yield duplicate column names - confirm none do.

X1t = ohe.fit_transform(X1)
colnames = list(ohe.get_feature_names())
X1df = pd.DataFrame(X1t, columns = colnames, index = data_df.index)
## errors='ignore': the NaN indicator column only exists when the variable has missing values
X1df = X1df.drop(columns = 'x0_nan', errors = 'ignore')

X2t = ohe.fit_transform(X2)
colnames = list(ohe.get_feature_names())
X2df = pd.DataFrame(X2t, columns = colnames, index = data_df.index)
X2df = X2df.drop(columns = 'x0_nan', errors = 'ignore')

frames = [data_df, X1df, X2df]
data_df1 = pd.concat(frames, axis = 1)

##drop the original (un-encoded) columns

data_df1 = data_df1.drop(columns = ['thyroid_problem','hormone_therapy'])

In [None]:
# Inspect the frame produced by the one-hot encoding step.
data_df1

## Modelling

In [None]:
# Inspect the frame that the modelling cells start from.
data_df1

In [None]:
# Feature matrix: every column except 'waso', 'tst', 'tso' and 'se'.
# The labels must be passed as one list: given as separate positional arguments,
# pandas reads the second one ('tst') as the `axis` argument and the call fails.
X = data_df1.drop(columns=['waso', 'tst', 'tso', 'se'])
X

In [None]:
from sklearn.model_selection import train_test_split

# `y` is not assigned in any visible cell before this call, which would make the
# split fail with NameError.
# NOTE(review): the target is assumed to be the four columns left out of X
# ('waso', 'tst', 'tso', 'se'), kept together as a multi-column target -
# TODO confirm, or narrow this to the single column being modelled.
y = data_df1[['waso', 'tst', 'tso', 'se']]

# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable.
X_split, X_val, y_split, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
######## MACHINE LEARNING ########

## FEATURE IMPORTANCE and PRINCIPAL COMPONENT ANALYSIS
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA

## METRICS
#### REGRESSION METRICS
from sklearn.metrics import r2_score
#### CLASSIFICATION METRICS
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.metrics import classification_report, average_precision_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

## HYPERPARAMETERS TUNING
import itertools
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## LINEAR MODELS
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.datasets import make_regression

## SUPPORT VECTOR MACHINES
from sklearn.svm import SVC, LinearSVC, SVR

## KMEANS
from sklearn.cluster import KMeans

## NEIGHBORS
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

## ENSEMBLE METHODS
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
from sklearn.ensemble import VotingRegressor, VotingClassifier

## XGBOOST

from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz