In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce

from sklearn.model_selection import train_test_split, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

In [2]:
def handle_data(csv):
    """Read the CSV at ``csv`` and return a preprocessed DataFrame.

    Steps, in order:
      1. Rows whose ``gender`` is "Other" are removed; ``gender`` is one-hot
         encoded and the original column dropped.
      2. ``work_type``, ``Residence_type`` and ``smoking_status`` are
         target-encoded against the ``stroke`` column.
      3. Missing ``bmi`` values are replaced with the column mean.
      4. ``ever_married`` "Yes"/"No" becomes 1/0.
      5. ``age``, ``avg_glucose_level`` and ``bmi`` are standardised.

    NOTE(review): the target encoder and the scaler are fitted on the whole
    frame inside this function, i.e. before any train/test split is made, so
    statistics from rows that later become test rows flow into the features.

    Parameters
    ----------
    csv : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the encoded and scaled columns plus ``stroke``.
    """
    frame = pd.read_csv(csv)

    # Gender: discard the "Other" rows, then swap the column for its one-hot form.
    frame = frame[frame["gender"] != "Other"]
    gender_dummies = ce.OneHotEncoder().fit_transform(frame["gender"])
    frame = frame.join(gender_dummies).drop(columns="gender")

    # Remaining categoricals are target-encoded using the stroke label.
    categorical_cols = ["work_type", "Residence_type", "smoking_status"]
    encoder = ce.TargetEncoder(cols=categorical_cols)
    frame = encoder.fit_transform(frame, frame["stroke"])

    # Mean-impute missing BMI.
    bmi_mean = frame["bmi"].mean()
    frame["bmi"] = frame["bmi"].fillna(bmi_mean)

    # Binary flag for marital status.
    frame["ever_married"] = frame["ever_married"].replace({"Yes": 1, "No": 0})

    # Standardise the continuous columns (zero mean, unit variance).
    continuous_cols = ["age", "avg_glucose_level", "bmi"]
    frame[continuous_cols] = StandardScaler().fit_transform(frame[continuous_cols])

    return frame

In [3]:
def split_data(df, test_size=0.25, random_state=42, stratify=False):
    """Split ``df`` into train/test features and targets.

    The ``stroke`` column is the target; every other column is a feature.

    Parameters
    ----------
    df : pandas.DataFrame containing a ``stroke`` column.
    test_size : passed to ``train_test_split`` (default 0.25).
    random_state : seed passed to ``train_test_split`` (default 42).
    stratify : when True, the split preserves the class proportions of
        ``stroke`` in both partitions. Defaults to False, which reproduces
        the original unstratified split.

    Returns
    -------
    (x_train, x_test, y_train, y_test)
    """
    features = df.drop(columns="stroke")
    target = df["stroke"]
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        # Stratifying is opt-in so existing callers get the same split as before.
        stratify=target if stratify else None,
    )
    return x_train, x_test, y_train, y_test

In [4]:
def balance_splitting(df, rows_to_use, random_state=None):
    """Undersample the ``stroke == 0`` rows, then split into train/test.

    All ``stroke == 1`` rows are kept and ``rows_to_use`` rows are drawn at
    random from the ``stroke == 0`` rows. The combined frame (index reset) is
    split 75/25 with a fixed ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame containing a ``stroke`` column.
    rows_to_use : number of ``stroke == 0`` rows to sample.
    random_state : optional seed for the sampling step. The default ``None``
        keeps the original behaviour (a different sample on every call);
        pass an int to make the result reproducible.

    Returns
    -------
    (x_train, x_test, y_train, y_test)
    """
    negatives = df[df["stroke"] == 0].sample(rows_to_use, random_state=random_state)
    positives = df[df["stroke"] == 1]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    new_df = pd.concat([positives, negatives], ignore_index=True)

    features = new_df.drop(columns="stroke")
    target = new_df["stroke"]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )
    return x_train, x_test, y_train, y_test

In [5]:
# Load and preprocess the dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
path = "healthcare.csv"
data = handle_data(path)

In [6]:
# Hold-out split of the full dataset with its original class imbalance.
x_train, x_test, y_train, y_test = split_data(data)

In [7]:
# Second split: every stroke row of `data` plus 400 randomly sampled
# non-stroke rows, then divided into train/test by balance_splitting.
x_train_balanced, x_test_balanced, y_train_balanced, y_test_balanced = balance_splitting(data, 400)

In [9]:
# k-nearest-neighbours classifier with library defaults; the bare name on the
# last line makes the notebook display the estimator's parameters.
knn = KNeighborsClassifier()
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [10]:
# Experiment 1: fit on the imbalanced training split.
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [11]:
# Predicted labels for the imbalanced hold-out set.
predict = knn.predict(x_test)

In [13]:
# Accuracy on the imbalanced hold-out set. On its own this number is
# misleading: the confusion matrix in the following cell shows that the model
# never predicts class 1, so the score only reflects the majority-class share.
accuracy_score(y_test, predict)

0.9374021909233177

In [14]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, predict)

array([[1198,    0],
       [  80,    0]], dtype=int64)

In [16]:
# Flatten the 2x2 matrix row by row into (tn, fp, fn, tp) and display the tuple.
tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
(tn, fp, fn, tp)

(1198, 0, 80, 0)

In [17]:
# Class counts of the hold-out target, for comparison with the confusion matrix.
y_test.value_counts()

0    1198
1      80
Name: stroke, dtype: int64

In [18]:
# Experiment 2: refit the SAME `knn` object on the undersampled training split.
# From here on `knn` no longer represents the model fitted on x_train/y_train.
knn.fit(x_train_balanced, y_train_balanced)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [19]:
# NOTE: this reuses the name `predict`, overwriting the predictions that were
# made earlier for the imbalanced hold-out set.
predict = knn.predict(x_test_balanced)

In [20]:
# Accuracy of the refitted model on the undersampled hold-out set.
accuracy_score(y_test_balanced, predict)

0.588957055214724

In [21]:
# Confusion matrix for the undersampled hold-out set (rows: true, columns: predicted).
confusion_matrix(y_test_balanced, predict)

array([[78, 19],
       [48, 18]], dtype=int64)

In [22]:
# Same (tn, fp, fn, tp) unpacking as before; this overwrites the four values
# computed for the imbalanced experiment.
tn, fp, fn, tp = confusion_matrix(y_test_balanced, predict).ravel()
(tn, fp, fn, tp)

(78, 19, 48, 18)

In [23]:
# Class counts of the undersampled hold-out target.
y_test_balanced.value_counts()

0    97
1    66
Name: stroke, dtype: int64

In [24]:
# Apply the model trained on the undersampled split to the original hold-out set.
# NOTE(review): balance_splitting drew its rows from the full `data` frame, and
# x_test was also drawn from `data`, so rows in x_test may have been part of
# x_train_balanced. The resulting score is therefore not a clean held-out estimate.
predict_2 = knn.predict(x_test)

In [25]:
# Accuracy of the undersampled-split model on the original hold-out set (x_test / y_test).
accuracy_score(y_test, predict_2)

0.7151799687010955

In [27]:
# K-fold splitter with library defaults. The displayed parameters show
# shuffle=False, so folds are taken as consecutive blocks in row order.
kf = KFold()
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [None]:
# Feature matrix (every column except `stroke`) and target vector as NumPy arrays.
# Fixed: `data.coumns` was a typo for `data.columns` and raised AttributeError.
features = np.array(data.loc[:, data.columns != "stroke"])
target = np.array(data.stroke)

for train_index, test_index in kf.split(features):
    x_train, y_train