# Tugas Besar IF3170
## Aplikasi Berbasis Web untuk Klasifikasi Penyakit Jantung Berdasarkan Data Klinis 

13516081 Rabbi Fijar Mayoza <br>
13516106 Kurniandha Sukma Yunastrian <br>
13516108 Krishna Aurelio Noviandri <br>
13516137 Hafizh Budiman <br>
13516148 Trian Annas Thoriq Sumarjadi <br>

Import necessary library

In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

from subprocess import call
from IPython.display import Image

Define helper function

In [253]:
# Create table for missing data analysis
def draw_missing_data_table(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

def show_model_visualization(target,hypotesis):
    visualise_df = pd.DataFrame({'target': target,'hypotesis': hypotesis})
    lines = visualise_df.plot.line()

# show correct prediction percentage
def show_prediction_percentage(feature,label,prediction):
    print("Number of correct prediction from %d data is: %d" 
      %(feature.shape[0], (label == y_predict).sum()))
    
# show performance
def show_performance(y_test, y_predict):
    print('Accuracy score: ',accuracy_score(y_test, y_predict))
    print('Precision score: ',precision_score(y_test,y_predict,average="micro"))
    print('Recall score: ',precision_score(y_test,y_predict,average="micro"))

# show confusion matrix
def conf_matrix(y_test, y_predict):
    y_actu = pd.Series(y_test, name='Actual')
    y_pred = pd.Series(y_predict, name='Predicted')
    return pd.crosstab(y_actu, y_pred)

### Load Dataset

In [262]:
# load HeartDisease dataset from external csv using pandas
def build_dataframe_Heart_Disease(Heart_Disease_file):
    Heart_Disease_df = pd.read_csv(Heart_Disease_file);
    return Heart_Disease_df

# show train dataframe
Heart_Disease_file = "data/tubes2_HeartDisease_train.csv"
train_df = build_dataframe_Heart_Disease(Heart_Disease_file)
train_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


In [263]:
# show test dataframe
Heart_Disease_file = "data/tubes2_HeartDisease_test.csv"
test_df = build_dataframe_Heart_Disease(Heart_Disease_file)
test_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,60,1,2,160,267,1,1,157,0,0.5,2,?,?
1,61,1,4,148,203,0,0,161,0,0.0,1,1,7
2,54,1,4,130,242,0,0,91,1,1.0,2,?,?
3,48,1,4,120,260,0,0,115,0,2.0,2,?,?
4,57,0,1,130,308,0,0,98,0,1.0,2,?,?


In [264]:
#Replacing '?' with NaN
new_train_df = train_df.replace('?', np.nan)
new_train_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,,,,1
1,55,1,4,158,217,0,0,110,1,2.5,2.0,,,1
2,54,0,3,135,304,1,0,170,0,0.0,1.0,0.0,3.0,0
3,48,0,3,120,195,0,0,125,0,0.0,,,,0
4,50,1,4,120,0,0,1,156,1,0.0,1.0,,6.0,3


In [265]:
# Analyze missing data
draw_missing_data_table(new_train_df)

Unnamed: 0,Total,Percent
Column12,514,0.65982
Column13,408,0.523748
Column11,262,0.336329
Column6,78,0.100128
Column10,49,0.062901
Column4,47,0.060334
Column9,44,0.056483
Column8,44,0.056483
Column5,24,0.030809
Column7,2,0.002567


In [266]:
#drop columns that have > 50% missing values 
new_train_df.drop('Column12', axis=1, inplace=True)
new_train_df.drop('Column13', axis=1, inplace=True)

#since column 5 has < 10% missing value, drop rows that have null value of column 5
#since column 8 has < 10% missing value, drop rows that have null value of column 8
#since column 9 has < 10% missing value, drop rows that have null value of column 9
#since column 10 has < 10% missing value, drop rows that have null value of column 10
#since column 7 has < 10% missing value, drop rows that have null value of column 7
new_train_df.drop(new_train_df[pd.isnull(new_train_df['Column5'])].index, inplace=True)
new_train_df.drop(new_train_df[pd.isnull(new_train_df['Column8'])].index, inplace=True)
new_train_df.drop(new_train_df[pd.isnull(new_train_df['Column9'])].index, inplace=True)
new_train_df.drop(new_train_df[pd.isnull(new_train_df['Column10'])].index, inplace=True)
new_train_df.drop(new_train_df[pd.isnull(new_train_df['Column7'])].index, inplace=True)  

new_train_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column14
0,54,1,4,125,216,0,0,140,0,0.0,,1
1,55,1,4,158,217,0,0,110,1,2.5,2.0,1
2,54,0,3,135,304,1,0,170,0,0.0,1.0,0
3,48,0,3,120,195,0,0,125,0,0.0,,0
4,50,1,4,120,0,0,1,156,1,0.0,1.0,3


Change NaN with mode of the columns

In [267]:
value = new_train_df.loc[:,"Column11"].mode()[0]
new_train_df['Column11'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column6"].mode()[0]
new_train_df['Column6'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column10"].mode()[0]
new_train_df['Column10'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column4"].mode()[0]
new_train_df['Column4'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column9"].mode()[0]
new_train_df['Column9'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column8"].mode()[0]
new_train_df['Column8'].fillna(value, inplace=True)

value = new_train_df.loc[:,"Column5"].mode()[0]
new_train_df['Column5'].fillna(value, inplace=True)

new_train_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column14
0,54,1,4,125,216,0,0,140,0,0.0,2,1
1,55,1,4,158,217,0,0,110,1,2.5,2,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0
3,48,0,3,120,195,0,0,125,0,0.0,2,0
4,50,1,4,120,0,0,1,156,1,0.0,1,3


### Learning Phase

Kami menggunakan metode split-train learning dengan membagi dataset menjadi dua bagian. Train dataframe akan dibagi menjadi 90% data training dan 10% data test.

In [363]:
#splitting the dataset into the source variables (independant variables) and the target variable (dependant variable)
y = new_train_df.Column14
x = new_train_df.drop('Column14', axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=350)

#### 1. Naive Bayes

In [365]:
# gaussian naive bayes model 
naive_bayes_model = GaussianNB().fit(X_train,y_train)
y_predict = naive_bayes_model.predict(X_test)

show_performance(y_test,y_predict)
conf_matrix(y_test,y_predict)

Accuracy score:  0.6901408450704225
Precision score:  0.6901408450704225
Recall score:  0.6901408450704225


Predicted,0,1,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,1,0
1,2,0,0
2,0,0,1


#### 2. Decision Tree

In [366]:
# decision trees id3 model
dt_model = tree.DecisionTreeClassifier(criterion="entropy",min_samples_split=44,min_samples_leaf=29).fit(X_train,y_train)
y_predict = dt_model.predict(X_test)

show_performance(y_test,y_predict)
conf_matrix(y_test,y_predict)

Accuracy score:  0.5211267605633803
Precision score:  0.5211267605633803
Recall score:  0.5211267605633803


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,1
1,1,1
2,1,0


####  3. k-Nearest Neighbors (KNN)

In [367]:
knn_model = KNeighborsClassifier(n_neighbors=100).fit(X_train,y_train)
y_predict = knn_model.predict(X_test)

show_performance(y_test,y_predict)
conf_matrix(y_test,y_predict)

Accuracy score:  0.4647887323943662
Precision score:  0.4647887323943662
Recall score:  0.4647887323943662


Predicted,0
Actual,Unnamed: 1_level_1
0,3
1,2
2,1


#### 4. Multi Layer Perceptron (MLP)

In [368]:
mlp_model = MLPClassifier(momentum=0.8,solver='lbfgs',alpha=0.1,hidden_layer_sizes=(100, 6), random_state=1).fit(X_train,y_train)
y_predict = mlp_model.predict(X_test)

show_performance(y_test,y_predict)
conf_matrix(y_test,y_predict)

Accuracy score:  0.5774647887323944
Precision score:  0.5774647887323944
Recall score:  0.5774647887323944


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3,0
1,2,0
2,0,1
