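This notebook uses the `processed.cleveland.data` file from the [Heart Disease Data Set](https://archive.ics.uci.edu/ml/datasets/Heart+Disease). It cleans the data, fits a simple linear model, and exports the table both without and with the model's predictions.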

In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [None]:
# Location of the raw data file that is loaded with pd.read_csv.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'heart-disease/processed.cleveland.data'
)

# Column names handed to pd.read_csv via `names=`, in file order.
# The final entry, 'num', is the target column.
columns = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
    'num',
]

In [None]:
# Read the raw file. The column names are supplied explicitly (so no row is
# treated as a header) and '?' entries are parsed as missing values (NaN).
df = pd.read_csv(
    filepath_or_buffer=url,
    header=None,
    names=columns,
    na_values=['?'],
)
df.head()

In [None]:
# Collapse every 'num' value greater than 1 down to 1.
# The assignment is restricted to the 'num' column with .loc: a bare
# `df[mask] = 1` treats the mask as a row selector and overwrites *every*
# column of the matching rows with 1, wiping out their feature values.
df.loc[df['num'] > 1, 'num'] = 1

# From here on the target column is called 'label'.
df.rename(columns={'num': 'label'}, inplace=True)
df.head()

In [None]:
# Impute missing 'ca' and 'thal' entries with each column's most frequent
# value (value_counts ignores NaN, so the mode is taken over observed values).
ca_mode = df['ca'].value_counts().idxmax()
thal_mode = df['thal'].value_counts().idxmax()
df = df.fillna({'ca': ca_mode, 'thal': thal_mode})

In [None]:
# Show the first five rows of the frame as it stands after the previous cells.
df.head(n=5)

In [None]:
# Write the current frame to disk, without the row index, before any
# prediction column is added.
df.to_csv(path_or_buf='heart-disease-no-predictions.csv', index=False)

In [None]:
# Split the frame's underlying array by position: every column except the
# last is a feature, and the last column is the target.
data = df.values
X, y = data[:, :-1], data[:, -1]

In [None]:
# Fit a Lasso model (regularisation strength alpha=0.1) on all rows.
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X=X, y=y)

In [None]:
# Score the same rows the model was fitted on.
# NOTE(review): Lasso is a regressor, so despite the name these are raw
# real-valued outputs rather than calibrated probabilities.
probs = clf.predict(X=X)

In [None]:
# Threshold the model outputs: strictly above 0.5 -> 1, otherwise 0.
predictions = (probs > 0.5).astype(int)

In [None]:
# Append the thresholded outputs to the frame as a 'prediction' column.
df = df.assign(prediction=predictions)

In [None]:
# Turn the numeric 1/0 codes into '+'/'-' strings for the exported file.
num_to_label = {1: '+', 0: '-'}

# Assign the replaced Series back to the frame. Calling
# `df[col].replace(..., inplace=True)` operates on an intermediate Series:
# pandas warns about this chained in-place pattern, and with Copy-on-Write
# enabled it leaves `df` itself unchanged.
df['label'] = df['label'].replace(num_to_label)
df['prediction'] = df['prediction'].replace(num_to_label)

In [None]:
# Show the first five rows, now including the 'prediction' column.
df.head(n=5)

In [None]:
# Write the frame, including the 'prediction' column, to disk without the
# row index.
df.to_csv(path_or_buf='heart-disease-with-predictions.csv', index=False)