In [None]:
%matplotlib inline #display plots inline

#import packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

#load dataset
df = pd.read_csv("stroke_data.csv")

#check for missing values
msno.bar(df)
fig = plt.figure(figsize=(25, 20))
sns.heatmap(df.corr(), annot=True)
plt.savefig("Heatmap.png")

In [None]:
#Check the dataset variables and summary statistics
df.info()
df.describe()
df.head()

In [None]:
#Look for response categories
print(df['gender'].value_counts())

#Convert string data to numerical
def gender_to_numeric(x):
        if x=='Female': return 0
        if x=='Male':   return 1

df['gender'] = df['gender'].apply(gender_to_numeric)

In [None]:
#Look for response categories
print(df['ever_married'].value_counts())

#Convert string data to numerical
def ever_married_to_numeric(x):
        if x=='Yes': return 1
        if x=='No':   return 0

df['ever_married'] = df['ever_married'].apply(ever_married_to_numeric)

In [None]:
#Look for response categories
print(df['Residence_type'].value_counts())

#Convert string data to numerical
def Residence_type_to_numeric(x):
        if x=='Urban': return 1
        if x=='Rural':   return 0

df['Residence_type'] = df['Residence_type'].apply(Residence_type_to_numeric)

In [None]:
print(df['work_type'].value_counts())
import category_encoders as ce
encoder=ce.OneHotEncoder(cols='work_type',handle_unknown='return_nan',return_df=True,use_cat_names=True)
df = encoder.fit_transform(df)

In [None]:
print(df['smoking_status'].value_counts())
encoder=ce.OneHotEncoder(cols='smoking_status',handle_unknown='return_nan',return_df=True,use_cat_names=True)
df = encoder.fit_transform(df)

In [None]:
y = df['stroke'].values
df_temp = df.copy(deep=True)
df_temp.drop('stroke', inplace=True, axis=1)
X = df_temp.values

In [None]:
std_scl = StandardScaler()
std_scl.fit_transform(X)

In [None]:
model = LogisticRegression(solver='liblinear')
model.fit(X, y)
y_pred = model.predict(X)
print("prediction for datapoint 0:", model.predict([X[0]]))
print(model.score(X, y))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y, y_pred)
print(confusion_matrix(y, y_pred))
print("accuracy:", accuracy_score(y, y_pred))
print("precision:", precision_score(y, y_pred))
print("recall:", recall_score(y, y_pred))
print("f1 score:", f1_score(y, y_pred))
sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)

In [None]:
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=None)
pca.fit(X)

# Get the eigenvalues
print("Eigenvalues:")
print(pca.explained_variance_)
print()

# Get explained variances
print("Variances (Percentage):")
print(pca.explained_variance_ratio_ * 100)
print()

# Make the scree plot
plt.plot(np.cumsum(pca.explained_variance_ratio_ * 100))
plt.xlabel("Number of components (Dimensions)")
plt.ylabel("Explained variance (%)")

In [None]:
# Apply PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Get the transformed dataset
X_pca = pd.DataFrame(X_pca)
print(X_pca.head())
print("\nSize: ")
print(X_pca.shape)

In [None]:
fig = plt.figure(figsize=(10, 8))
sns.heatmap(X_pca.corr(), annot=True)

In [None]:
# Make train and test sets
from sklearn.model_selection import train_test_split
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.20, shuffle=True, random_state=2)

# Initialize the logistic regression model
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=2500)

# Train the model
clf.fit(X_train_pca, y_train)

# Make predictions
y_pred = clf.predict(X_test_pca) # Predictions
y_true = y_test # True values

# Measure accuracy
from sklearn.metrics import accuracy_score
import numpy as np
print("Train accuracy:", np.round(accuracy_score(y_train, clf.predict(X_train_pca)), 2))
print("Test accuracy:", np.round(accuracy_score(y_true, y_pred), 2))

# Make the confusion matrix
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_true, y_pred)
print("\nTest confusion_matrix")
sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Construct pipeline
sc = StandardScaler()
pca = PCA(n_components=2)
log_reg = LogisticRegression(max_iter=2500)

log_reg_model = Pipeline([
    ('std_scaler', sc),
    ('pca', pca),
    ('regressor', log_reg)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=2)
log_reg_model.fit(X_train,y_train)
y_pred = log_reg_model.predict(X_test) # Predictions
y_true = y_test # True values
cf_matrix = confusion_matrix(y_true, y_pred)
print("\nTest confusion_matrix")
sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
print(precision_recall_fscore_support(y_true, y_pred))

In [None]:
model.predict_proba(X_test)

In [None]:
model.predict_proba(X_test)[:, 1]

In [None]:
y_pred = model.predict_proba(X_test)[:, 1] > 0.1

In [None]:
cf_matrix = confusion_matrix(y_true, y_pred)
print("\nTest confusion_matrix")
sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('True', fontsize=12)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

y_pred_proba = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1])

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('1 - specificity')
plt.ylabel('sensitivity')
plt.show()

In [38]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)
y_pred_proba1 = model1.predict_proba(X_test)
print("model 1 AUC score:", roc_auc_score(y_test, y_pred_proba1[:, 1]))

model2 = LogisticRegression()
model2.fit(X_train[:, 0:2], y_train)
y_pred_proba2 = model2.predict_proba(X_test[:, 0:2])
print("model 1 AUC score:", roc_auc_score(y_test, y_pred_proba2[:, 1]))

model 1 AUC score: 0.8210214318285454
model 1 AUC score: 0.803776478879078


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
