In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from statistics import mode, mean

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Read framingham.csv into a DataFrame; the path is relative to the notebook's working directory.
DATA_PATH = '../input/heart-disease-prediction-using-logistic-regression/framingham.csv'
data = pd.read_csv(DATA_PATH)

### Cleaning data ###

In [None]:
# Pairwise correlation of every column, drawn as an annotated square heatmap.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(9, 9))

sns.heatmap(corr_matrix, annot=True, square=True, cbar=False, ax=ax);
# Author's reading of the plot: no high correlation stands out here.

In [None]:
# Number of missing values in each column.
data.isna().sum()

Drop the rows in which two or more values are missing.

In [None]:
# Keep only the rows that have at least MIN_NON_NULL non-missing values,
# then recount the remaining gaps per column.
# NOTE(review): presumably the frame has 16 columns, so this allows at most
# one missing value per row -- confirm against the CSV.
MIN_NON_NULL = 15
data = data.dropna(axis=0, thresh=MIN_NON_NULL)
data.isna().sum()

The following columns are categorical:
- education
- BPMeds
Their missing values are filled with the mode.

The following columns are continuous:
- cigsPerDay
- totChol
- BMI
- heartRate
- glucose
Their missing values are filled with the mean.

In [None]:
# Fill the remaining gaps column by column.
#
# Categorical columns get their most frequent value. Series.mode() skips NaN
# by default, so the result does not depend on how the missing entries
# themselves are counted; .iloc[0] picks the first mode if there is a tie.
for col in ["education", "BPMeds"]:
    data[col] = data[col].fillna(data[col].mode().iloc[0])

# Continuous columns get their mean (Series.mean() also skips NaN).
for col in ["cigsPerDay", "totChol", "BMI", "heartRate", "glucose"]:
    data[col] = data[col].fillna(data[col].mean())

# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook -- consider fitting the
# imputation on the training split only.

In [None]:
# Per-column flag: True if the column still contains a missing value.
data.isna().any()

In [None]:
# One bar chart per column (the last column is skipped): counts of each value
# of the column, split by the TenYearCHD outcome.
for feature in data.columns[:-1]:
    outcome_counts = pd.crosstab(data[feature], data.TenYearCHD)
    bar_ax = outcome_counts.plot(kind='bar')
    bar_ax.set_xlabel(feature)

Only 'currentSmoker' does not look like a good predictor of the outcome, so it is dropped.

In [None]:
# Remove the 'currentSmoker' column. errors='ignore' keeps this cell safe to
# re-run: without it, a second execution raises KeyError because the column
# is already gone.
data = data.drop(columns='currentSmoker', errors='ignore')

In [None]:
# Predictor columns used by the model (14 in total); the target is TenYearCHD.
feature_columns = [
    'male', 'age', 'education', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
    'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose',
]
X = data[feature_columns]
y = pd.Series(data['TenYearCHD'])

### Train model ###

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

We need to standardize our data, shifting the mean of each feature to the origin. This is important to get accurate results because of the nature of the logistic equation. This is done by `StandardScaler`.
`StandardScaler` transforms the data so that each feature has a mean of 0 and a standard deviation of 1. In short, it standardizes the data.

Check that the mean of each feature (column) is 0

In [None]:
# Column means of the scaled training data, rounded to 10 decimals.
np.round(X_train.mean(axis=0), decimals=10)

Check that the std of each feature (column) is 1

In [None]:
# Column standard deviations of the scaled training data.
np.std(X_train, axis=0)

In [None]:
# Logistic-regression classifier created with the library's default settings.
model = LogisticRegression()

In [None]:
# Train on the scaled training features; the trailing ';' hides the cell output.
model.fit(X_train, y_train);

In [None]:
# Predicted class labels for the scaled test split.
labels = model.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=labels)

### Research model ###

This is the best score for this model, and below you will see why we stay with it:
We create an array of predicted labels for each threshold and look at the resulting `classification_report`.

In [None]:
# The class-1 probabilities do not depend on the threshold, so compute them
# once instead of once per loop iteration.
positive_proba = model.predict_proba(X_test)[:, 1]

# Row k of `acc` holds the 0/1 predictions obtained with the decision
# threshold k/10 (0.0, 0.1, ..., 0.9); a sample is labelled 1 when its
# probability is >= the threshold.
# Despite its name, `acc` stores predicted labels, not accuracy scores; the
# name is kept because a later cell iterates over it.
acc = np.array([
    (positive_proba >= i / 100).astype(int)
    for i in range(0, 100, 10)
])

In [None]:
# For each row of `acc`, print its index, then the confusion matrix and the
# classification report of that row's labels against the true test labels.
for idx, threshold_labels in enumerate(acc):
    print('***', idx, '***')
    print('\n', confusion_matrix(y_test, threshold_labels))
    print(classification_report(y_test, threshold_labels))

![image.png](attachment:image.png)

In some situations it is possible to maximize either recall or precision at the expense of the other metric. For example, when pre-screening patients for follow-up, we would probably like to get a recall of about 1.0: we want to find all patients who actually have the disease, and we can accept low precision if the cost of follow-up is not significant. However, there is a simpler metric that takes into account both precision and recall, so you can aim to maximize this number to make the model better. This is the F1-score, the harmonic mean of precision and recall.
**And now we see that the F1-score of the model is 86%**.
So the result is better for patients who do not have the disease. To create a better model, we need more data.

In [None]:
# ROC analysis on the test split.
# Both the AUC and the curve are computed from the predicted class-1
# probabilities. Scoring the AUC on hard 0/1 labels instead would describe a
# single operating point, so the area shown in the legend would not match the
# plotted curve.
test_proba = model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_proba)
fpr, tpr, thresholds = roc_curve(y_test, test_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure to the working directory before showing it.
plt.savefig('Log_ROC')
plt.show()