In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('ggplot')
sns.set_palette("rainbow")

In [2]:
#εδώ κανω import τις μεθόδους που θα χρειαστώ
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [3]:
data_path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
columns = 'age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target'.split(',')

heart_disease = pd.read_csv(data_path, names=columns)
print(f'Rows: {heart_disease.shape[0]}\nColumns: {heart_disease.shape[1]}')
print(heart_disease.dtypes)
heart_disease.head()

Rows: 303
Columns: 14
age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca           object
thal         object
target        int64
dtype: object


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


# Part 1

In [4]:
#Missing Values : Replace '?' with None
for column in heart_disease.select_dtypes('object'):
    heart_disease[column] = heart_disease[column].replace('?', np.nan).astype(float)

#Fill in None with most frequent value
heart_disease["ca"] = heart_disease["ca"].fillna(heart_disease['ca'].mode()[0])
heart_disease["thal"] = heart_disease["thal"].fillna(heart_disease['thal'].mode()[0])

#Turn object type into integers
heart_disease['ca'] = heart_disease['ca'].astype(float).astype(int)
heart_disease['thal'] = heart_disease['thal'].astype(float).astype(int)

In [5]:
#Target Variable
heart_disease['target'] = heart_disease['target'].map(lambda x:0 if x==0 else 1)
heart_disease.groupby("target").size()

target
0    164
1    139
dtype: int64

# Part 2

## Get Dummies 

Αρχικά, θα οριστικοποιήσω το dataset μου μετατρέποντας τις κατηγορικές μεταβλητές σε dummies. (Οι binary κατηγορικές μεταβλητές είναι ήδη σε αυτή τη μορφή).  Οι κατηγορικές μεταβλητές μου είναι οι cp, restecg, slope, ca(ordinal) , thal. Για να προκύψουν και οι ονομασίες στις κολώνες, θα αντικαταστήσω τις τιμές με τις ονομασίες τους. Επειδή οι τιμές της μεταβλητής ca έχουν μια σειρά, θα την θεωρήσουμε ως μεταβλητή ordinal, και συνεπώς δεν θα χρειαστεί να την μετατρέψω.

In [6]:
heart_disease['cp'].replace({1: "typical", 2: "atypical", 3: "non anginal", 4: "asymptomatic"}, inplace=True)
heart_disease['restecg'].replace({0: "normal", 1: "abnormality", 2: "hypertrophy"}, inplace=True)
heart_disease['slope'].replace({1: "up", 2: "flat", 3: "down"}, inplace=True)
heart_disease['thal'].replace({3: "normal", 6: "fixed", 7: "reversable"}, inplace=True)
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,typical,145.0,233.0,1.0,hypertrophy,150.0,0.0,2.3,down,0,fixed,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,hypertrophy,108.0,1.0,1.5,flat,3,normal,1
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,hypertrophy,129.0,1.0,2.6,flat,2,reversable,1
3,37.0,1.0,non anginal,130.0,250.0,0.0,normal,187.0,0.0,3.5,down,0,normal,0
4,41.0,0.0,atypical,130.0,204.0,0.0,hypertrophy,172.0,0.0,1.4,up,0,normal,0


In [7]:
heart_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    object 
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    object 
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    object 
 11  ca        303 non-null    int32  
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(8), int32(1), int64(1), object(4)
memory usage: 32.1+ KB


In [8]:
categorical = heart_disease[["cp", "restecg", "slope", "thal"]]
dummies = pd.get_dummies(categorical, drop_first = True)
print(dummies.shape)
dummies.head()

(303, 9)


Unnamed: 0,cp_atypical,cp_non anginal,cp_typical,restecg_hypertrophy,restecg_normal,slope_flat,slope_up,thal_normal,thal_reversable
0,0,0,1,1,0,0,0,0,0
1,0,0,0,1,0,1,0,1,0
2,0,0,0,1,0,1,0,0,1
3,0,1,0,0,1,0,0,1,0
4,1,0,0,1,0,0,1,1,0


In [9]:
heart_disease = pd.concat([heart_disease,dummies], axis=1)
heart_disease.drop(categorical, axis=1, inplace = True)
print(heart_disease.shape)
heart_disease.head()

(303, 19)


Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_atypical,cp_non anginal,cp_typical,restecg_hypertrophy,restecg_normal,slope_flat,slope_up,thal_normal,thal_reversable
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0,0,0,0,1,1,0,0,0,0,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,3,1,0,0,0,1,0,1,0,1,0
2,67.0,1.0,120.0,229.0,0.0,129.0,1.0,2.6,2,1,0,0,0,1,0,1,0,0,1
3,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0,0,0,1,0,0,1,0,0,1,0
4,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0,0,1,0,0,1,0,0,1,1,0


## TrainTestSplit 

Για να προχωρήσω στο feature extraction/selection κομμάτι, θα πρέπει να διαχωρίσω τις ανεξάρτητες Χ από την εξαρτημένη Υ
μεταβλητή (target). Θα χρησιμοποιήσω τη μέθοδο train_test_split. Με τη μέθοδο αυτή, χωρίζω επίσης τα δεδομένα σε δεδομένα
train, που θα τα χρησιμοποιήσω για να κατασκευάσω το μοντέλο μου, και δεδομένα test, που θα τα χρησιμοποιήσω για να κάνω
evaluate στο μοντέλο που προέκυψε.

In [10]:
X = heart_disease.drop('target',axis=1)
y = heart_disease['target']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

## Standardization 

Τώρα που χώρισα τα δεδομένα μου, χρειάζεται να αντιμετωπίσω την διαφορά που έχουν το range των τιμών των στηλών πέρα από τις κατηγορικές, καθώς αυτή η διαφορά θα δώσει στο μοντέλο μας ανακριβή αποτελέσματα.-- Όπως παρατηρήσαμε στο προηγούμενο κομμάτι, η κατανομή των δεδομένων κάθε στήλης φαίνεται να πλησιάζει πολύ την κανονική, και συνεπώς μπορούμε να εφαρμόσουμε τον StandardScaler--. Οι κολώνες που χρειάζονται κανονικοποίηση είναι όλες οι συνεχείς και η ordinal, προκειμένου να έρθουν στην ίδια κλίμακα.
Αυτές είναι οι age, trestbps, chol, thalach, oldpeak και ca.

In [12]:
columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']
scaler = StandardScaler()
X_train.loc[:, columns] = scaler.fit_transform(X_train.loc[:, columns])
X_test.loc[:, columns] = scaler.transform(X_test.loc[:, columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [13]:
X_train.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_atypical,cp_non anginal,cp_typical,restecg_hypertrophy,restecg_normal,slope_flat,slope_up,thal_normal,thal_reversable
248,-0.228995,1.0,-0.361192,-0.625789,0.0,0.835856,0.0,-0.104372,1.397407,0,0,0,0,1,0,1,0,1
37,0.335155,1.0,1.075366,0.584069,0.0,-1.786835,1.0,-0.434139,0.342858,0,0,0,1,0,1,0,0,0
34,-1.131635,1.0,-0.07388,-0.228804,0.0,1.351028,1.0,-0.599023,-0.711691,0,1,0,0,1,0,1,1,0
55,-0.003335,1.0,-0.418654,0.395029,0.0,-1.927337,1.0,0.88493,0.342858,0,0,0,1,0,1,0,0,1
91,0.899305,0.0,1.649989,-1.533182,0.0,-0.241321,0.0,4.182603,2.451956,0,0,0,1,0,0,0,0,1


Χρειάζεται να επιλέξω τα γνωρίσματα που είναι πιο απαραίτητα για την πρόβλεψη του y. Θα κάνω την επιλογή αυτή χρησιμοποιώντας το SelectFromModel. Στο επόμενο κομμάτι, θα υλοποιήσω τους αλγορίθμους Logistic Regression, Random Forest και Naïve Bayes. Συνεπώς, θα πρέπει να επιλέξω τα σωστά γνωρίσματα για κάθε αλγόριθμο ξεχωριστά. Επίσης, ζητείται να δω την βελτίωση που γίνεται πριν και μετά την
επιλογή των γνωρισμάτων. Θα υπολογίσω λοιπόν το accuracy στο αρχικό dataset, και έπειτα σε αυτό που έχει τις στήλες που επέλεξε το SelectFromModel.

## Feature Selection 

### Logistic Regression

In [14]:
#Original Accuracy
lr = LogisticRegression(random_state=123).fit(X_train, y_train)
print(accuracy_score(y_test, lr.predict(X_test)))

0.85


In [15]:
#Επιλογή Στηλών
lr_sfm = SelectFromModel(lr, prefit=False).fit(X_train, y_train)
print(lr_sfm.get_support())

#Απ'οτι φαίνεται, το SelectFromModel επέλεξε 9 στήλες - είναι αυτές
#που έχουν True. Αυτές είναι οι: 
sel_cols_lr = X_train.columns[(lr_sfm.get_support())]
print(sel_cols_lr)

[False  True False False False False  True False  True  True  True  True
 False False  True False  True  True]
Index(['sex', 'exang', 'ca', 'cp_atypical', 'cp_non anginal', 'cp_typical',
       'slope_flat', 'thal_normal', 'thal_reversable'],
      dtype='object')


In [16]:
#New Accuracy
lr_model = LogisticRegression(random_state=123).fit(X_train[sel_cols_lr], y_train)
print(accuracy_score(y_test, lr_model.predict(X_test[sel_cols_lr])))

0.83


### Random Forest

In [17]:
#Original Accuracy
rf = RandomForestClassifier(random_state=123).fit(X_train, y_train)
print(accuracy_score(y_test, rf.predict(X_test)))

0.79


In [18]:
#Επιλογή Στηλών
rf_sfm = SelectFromModel(rf, prefit=False).fit(X_train, y_train)
print(rf_sfm.get_support())
#Απ'οτι φαίνεται, το SelectFromModel επέλεξε 9 στήλες - είναι αυτές
#που έχουν True. Αυτές είναι οι: 
sel_cols_rf = X_train.columns[(rf_sfm.get_support())]
print(sel_cols_rf)

[ True False  True  True False  True  True  True  True False False False
 False False False False  True  True]
Index(['age', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'ca',
       'thal_normal', 'thal_reversable'],
      dtype='object')


In [19]:
#New Accuracy
rf_model = RandomForestClassifier(random_state=123).fit(X_train[sel_cols_rf], y_train)
print(accuracy_score(y_test, rf_model.predict(X_test[sel_cols_rf])))

0.75


### Naive Bayes

In [20]:
#Original Accuracy
nb = GaussianNB().fit(X_train, y_train) 
print(accuracy_score(y_test, nb.predict(X_test)))

0.84


KBest

In [21]:
#Επιλογή Στηλών : To kbest δεν λαμβάνει υπ'οψιν το μοντέλο που θα εφαρμόσουμε στα δεδομένα μας. Επομένως, θα εφαρμόσουμε τις στήλες που 
#θα επιλέξει στο μοντέλο NaiveBayes αλλά και στο MLP. 
kbest = SelectKBest(k=9).fit(X_train, y_train)
print(kbest.get_support())
#Απ'οτι φαίνεται, το SelectKBest επέλεξε 9 στήλες - είναι αυτές
#που έχουν True. Αυτές είναι οι: 
sel_cols_kbest = X_train.columns[(kbest.get_support())]
print(sel_cols_kbest)

[False False False False False  True  True  True  True False  True False
 False False  True  True  True  True]
Index(['thalach', 'exang', 'oldpeak', 'ca', 'cp_non anginal', 'slope_flat',
       'slope_up', 'thal_normal', 'thal_reversable'],
      dtype='object')


In [22]:
#New Accuracy
nb_model = GaussianNB().fit(X_train[sel_cols_kbest], y_train)
print(accuracy_score(y_test, nb_model.predict(X_test[sel_cols_kbest])))

0.8


In [23]:
#save results
import pickle
with open('part2_results/part2.pkl', 'wb') as h:
    pickle.dump([X_train, X_test, y_train, y_test, sel_cols_lr, sel_cols_rf, sel_cols_kbest], h)

-----------------------------

### MLP

In [24]:
#Old Accuracy
mlp = MLPClassifier(random_state=123).fit(X_train, y_train)
print(accuracy_score(y_test, mlp.predict(X_test)))

0.85




In [25]:
#New Accuracy
mlp_model = MLPClassifier(random_state=123).fit(X_train[sel_cols_kbest], y_train)
print(accuracy_score(y_test, mlp_model.predict(X_test[sel_cols_kbest])))

0.82


