# Web Scraping and Medical Prediction
#### Author: Miguel Martinez

In [1]:
import html5lib
import requests
from bs4 import BeautifulSoup
import csv
import re # regular expressions library

from sklearn import preprocessing
from sklearn.model_selection  import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC

import numpy as np
import pandas as pd

###  Read Data from Page

In [2]:
# Location of the space-delimited Hungarian heart-disease data file.
my_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/reprocessed.hungarian.data'

# The file has no header row: use placeholder names 0..12 for the thirteen
# feature columns and 'label' for the final (target) column.
headers = [*range(13), 'label']

# NOTE(review): the preview rows contain -9.0 in several columns, which looks
# like a missing-value sentinel -- confirm against the dataset description.
df = pd.read_csv(my_url, sep=' ', names=headers)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,label
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


### Read Page Containing Headers

In [3]:
# Location of the attribute description file that lists the column names.
my_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/heart-disease.names'

# A timeout keeps a stalled server from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being mistaken for the file.
webpage = requests.get(my_url, timeout=30)
webpage.raise_for_status()

webpage_content = webpage.text # decoded body of the response (plain text, not HTML)

print(webpage_content)

Publication Request: 
   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
   This file describes the contents of the heart-disease directory.

   This directory contains 4 databases concerning heart disease diagnosis.
   All attributes are numeric-valued.  The data was collected from the
   four following locations:

     1. Cleveland Clinic Foundation (cleveland.data)
     2. Hungarian Institute of Cardiology, Budapest (hungarian.data)
     3. V.A. Medical Center, Long Beach, CA (long-beach-va.data)
     4. University Hospital, Zurich, Switzerland (switzerland.data)

   Each database has the same instance format.  While the databases have 76
   raw attributes, only 14 of them are actually used.  Thus I've taken the
   liberty of making 2 copies of each database: one with all the attributes
   and 1 with the 14 attributes actually used in past experiments.

   The authors of the databases have requested:

      ...that any publications resulting from the use of th

### Find location of headers and pull out via regex

In [4]:
# Marker lines that delimit the attribute listing in the names file:
# parsing begins at the first marker and stops at the second.
startString, endString = '-- 1. #3  (age) ', '-- 14. #58 (num)'


In [5]:
# Stream the names file and collect the attribute names that appear in
# parentheses between the start and end marker lines.
columns = []
parsing = False

# The context manager releases the streamed connection even though we stop
# reading early; the timeout and status check surface network/HTTP failures.
with requests.get(my_url, stream=True, timeout=30) as r:
    r.raise_for_status()
    for raw_line in r.iter_lines():
        if not raw_line:
            continue
        # iter_lines() yields bytes; decode them instead of calling str(),
        # which would produce the "b'...'" repr rather than the text itself.
        line = raw_line.decode('utf-8', errors='replace')
        if startString in line:
            parsing = True
        if endString in line:
            parsing = False
            # add 'label' for last
            columns.append('label')
            # nothing after the end marker is needed
            break
        if parsing:
            parts = re.split(r'[()]', line)
            # Skip lines inside the section that carry no "(name)" token
            # instead of failing with IndexError.
            if len(parts) > 1:
                columns.append(parts[1])

print(columns)

# update df with column names (pandas raises if the count does not match)
df.columns = columns

['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'label']


In [6]:
# Preview the first rows to confirm the scraped column names were applied.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


### Drop any observations that contain blanks or missing values

In [7]:
# Row count is shape[0]; shape[1] is the column count, which dropna() on rows
# never changes, so comparing it would always report 0.
before = df.shape[0]
df.dropna(inplace=True)
print('Number of Rows droppped:', before - df.shape[0])
# NOTE(review): dropna() only removes NaN. The -9.0 values visible in the
# previews are not affected -- confirm whether they encode missing data.

Number of Rows droppped: 0


In [8]:
# Sanity check: per-column flag that is True if any NaN remains.
df.isna().any()

age         False
sex         False
cp          False
trestbps    False
chol        False
fbs         False
restecg     False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
label       False
dtype: bool

In [9]:
# Preview the first rows after the missing-value step.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,-9.0,-9.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,-9.0,-9.0,-9.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,-9.0,-9.0,3.0
4,54.0,1.0,3.0,150.0,-9.0,0.0,0.0,122.0,0.0,0.0,-9.0,-9.0,-9.0,0.0


### Normalize Features

In [10]:
# extract y
y = df['label']

# Standardize every feature column, keeping the original index and column
# names so X stays aligned with y.
features = df.drop(columns='label')
X = pd.DataFrame(preprocessing.scale(features), index=features.index, columns=features.columns)

# NOTE(review): scaling is computed over all rows before the train/test split,
# so test rows influence the scaling statistics. Consider fitting the scaler
# on the training split only.

# Last expression of the cell so the preview is actually displayed
# (a bare X.head() in the middle of a cell shows nothing).
X.head()

### Split dataset into training and testing

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

### PCA on training set

In [12]:
# Keep the smallest number of principal components whose cumulative
# explained-variance ratio on the training set reaches this threshold.
VARIANCE_THRESHOLD = .95

# Fit once with all components: the running sum of the explained-variance
# ratios gives the retained variance for every candidate component count,
# so there is no need to refit a PCA per candidate.
full_pca = PCA()
full_pca.fit(X_train)
cumulative_variance = np.cumsum(full_pca.explained_variance_ratio_)

finalPCA = None
for i, varianceSum in enumerate(cumulative_variance, start=1):
    print('Components:', i, '| Sum of Variance Across Components:', varianceSum)
    if varianceSum >= VARIANCE_THRESHOLD:
        # Refit with exactly i components so transform() yields i columns.
        finalPCA = PCA(n_components=i)
        finalPCA.fit(X_train)
        break

# Fail with a clear message instead of an AttributeError on None below.
if finalPCA is None:
    raise ValueError('No number of PCA components reached the variance threshold')

# transform both training and testing
X_train_pca = finalPCA.transform(X_train)
X_test_pca = finalPCA.transform(X_test)
    
    

Components: 1 | Sum of Variance Across Components: 0.229109719757
Components: 2 | Sum of Variance Across Components: 0.351201501158
Components: 3 | Sum of Variance Across Components: 0.450327637212
Components: 4 | Sum of Variance Across Components: 0.546158211885
Components: 5 | Sum of Variance Across Components: 0.631349604474
Components: 6 | Sum of Variance Across Components: 0.70840051408
Components: 7 | Sum of Variance Across Components: 0.77856798277
Components: 8 | Sum of Variance Across Components: 0.838568069343
Components: 9 | Sum of Variance Across Components: 0.886698263733
Components: 10 | Sum of Variance Across Components: 0.924562872546
Components: 11 | Sum of Variance Across Components: 0.961627537558


### Non-Linear SVM Classifier

In [13]:
# RBF-kernel support vector classifier trained on the PCA-reduced features.
svm_params = dict(C=1, kernel='rbf', gamma=0.1, random_state=5)
clf = SVC(**svm_params)
clf.fit(X_train_pca, y_train)

# Accuracy on the held-out (PCA-transformed) test rows.
y_predict_svm = clf.predict(X_test_pca)
score_svm = accuracy_score(y_test, y_predict_svm)
print(score_svm)

0.779661016949


### ANN MLP Classifier

In [14]:
# Multi-layer perceptron with two hidden layers (100 and 20 units).
ann_params = dict(
    hidden_layer_sizes=(100,20),
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    random_state=5,
    learning_rate_init=0.02,
)
my_ANN = MLPClassifier(**ann_params)

# NOTE(review): the network is trained on the scaled features (X_train), not
# on the PCA-reduced features used for the SVM -- confirm this is intended.
my_ANN.fit(X_train, y_train)

# Accuracy on the held-out test rows.
y_predict_ann = my_ANN.predict(X_test)
score_ann = accuracy_score(y_test, y_predict_ann)
print(score_ann)


0.728813559322
