# Starting with the brain stroke dataset

We will explore this dataset to find which factors are associated with a brain stroke and to build models that predict it.

> Importing the relevant libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning category for the rest of the session, including
# deprecation notices raised by the plotting and modelling calls below.
warnings.filterwarnings("ignore")

In [None]:
# Read the stroke CSV into a DataFrame. The relative path looks like the Kaggle
# input layout -- adjust it when running the notebook elsewhere.
data = pd.read_csv("../input/full-filled-brain-stroke-dataset/full_data.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Class balance of the target column.
data["stroke"].value_counts()

> We can see that the positive (stroke) class has very few samples. There are two options: model the data as it is, or rebalance the classes first.

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

In [None]:
# Number of missing values per column.
data.isna().sum()

# Performing Visual EDA

In [None]:
# Switch seaborn to the dark-grid theme, then draw the age distribution.
sns.set_theme(style="darkgrid")
sns.histplot(data["age"])


In [None]:
# Count plot of the column at positional index 5.
# The Series is passed as `x=` explicitly: recent seaborn releases interpret the
# first positional argument as `data`, so the positional form no longer plots
# the per-category counts of this column.
sns.countplot(x=data.iloc[:, 5])

In [None]:
# Hex-binned joint distribution of age vs. bmi, drawn with the "white" axes style.
# Arguments are passed by keyword: seaborn >= 0.12 no longer accepts x, y and
# data positionally.
with sns.axes_style("white"):
    sns.jointplot(data=data, x="age", y="bmi", kind="hex")

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , precision_score , roc_auc_score , confusion_matrix ,recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Feature frame: every column except the target.
X = data.drop(columns=["stroke"])

In [None]:
# Preview the feature frame.
X.head()

In [None]:
# Target vector.
y = data["stroke"]

In [None]:
# Preview the target values.
y.head()

In [None]:
# Number of target values, as a one-element tuple.
y.shape

In [None]:
# Build the model matrix: one-hot encode ONLY the categorical (string) columns
# and pass the numeric columns through unchanged. Encoding every column would
# turn each distinct age / glucose / bmi value into its own indicator column.
#
# The features are re-derived from `data` so this cell can be re-run safely
# (the result is assigned back to `X`, which downstream cells expect to be a
# dense 2-D numpy array).
features = data.drop(columns="stroke")
categorical_cols = features.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols = [col for col in features.columns if col not in categorical_cols]

# The encoder's default output is a scipy sparse matrix; converting with
# .toarray() avoids the `sparse=` / `sparse_output=` keyword, whose name differs
# between scikit-learn releases.
# NOTE(review): assumes the frame has at least one string/category column.
onehot_encoder = OneHotEncoder()
encoded_categoricals = onehot_encoder.fit_transform(features[categorical_cols]).toarray()

# Column order: numeric columns first, then the one-hot indicator columns.
X = np.hstack([features[numeric_cols].to_numpy(dtype=float), encoded_categoricals])

In [None]:
# Number of array dimensions of the encoded feature matrix.
X.ndim

In [None]:
# Total number of elements in the encoded feature matrix.
X.size

In [None]:
# Dimensions of the encoded feature matrix.
X.shape

In [None]:
# 60/40 hold-out split.
# stratify=y keeps the rare positive class at the same proportion in both
# parts; random_state makes the split (and every score below) reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

In [None]:
# Print the total element count of each split, one per line.
shapes = [X_train, X_test, y_train, y_test]
print(*(part.size for part in shapes), sep="\n")

# Starting off the modelling without balancing the classes

# Logistic Regression

In [None]:
# Baseline classifier with default hyper-parameters, trained on the
# (unbalanced) training split.
logit = LogisticRegression()
logit.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions of the baseline model on the hold-out features.
y_pred = logit.predict(X_test)

In [None]:
from sklearn import metrics

# Percentage of hold-out rows whose predicted label equals the true label.
logit_accuracy_pct = round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
print("The accuracy score when using the logistic Regression model is {} %.".format(logit_accuracy_pct))

In [None]:
# Precision for the positive class: of the rows predicted positive, the share that truly are.
metrics.precision_score(y_test,y_pred)

In [None]:
# Recall for the positive class: of the truly positive rows, the share the model found.
metrics.recall_score(y_test,y_pred)

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its documented replacement and draws
# the same figure from the fitted model and the hold-out data.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(logit, X_test, y_test)
plt.show()

In [None]:
# Same confusion matrix as raw counts (rows: true class, columns: predicted class).
metrics.confusion_matrix(y_test,y_pred)

We can see that both functions give us the same result.

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the baseline model, via the name
# imported at the top of this cell.
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and AUC for the baseline logistic regression.
# Both must be computed from a continuous score (here the predicted probability
# of classes_[1]); feeding hard 0/1 predictions collapses the curve to a single
# operating point and understates the AUC.
logit_scores = logit.predict_proba(X_test)[:, 1]
logit_fpr, logit_tpr, thresholds = metrics.roc_curve(y_test, logit_scores)
logit_auc = metrics.roc_auc_score(y_test, logit_scores)

> Later on we will use the SMOTE technique to increase the number of minority-class samples in our response variable y; first we try two more models on the imbalanced data.

# K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours baseline on the unbalanced training split.
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train, y_train)

# Class probabilities (kept for ROC analysis) and hard predictions.
y_valid_preds = knn.predict_proba(X_test)
# Evaluate the K-NN model's OWN predictions; previously the metrics below were
# computed from the logistic-regression `y_pred` left over from an earlier cell.
y_pred = knn.predict(X_test)

# Evaluation
# NOTE: `precision` holds the accuracy percentage; the name is kept unchanged.
precision = metrics.accuracy_score(y_test, y_pred) * 100
print("Accuracy with K-NN: {0:.2f}%".format(precision))
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve and AUC for the K-NN model.
# Stored under knn_* names so the logistic-regression results (logit_*) are not
# overwritten, and computed from predicted probabilities rather than hard labels.
knn_scores = knn.predict_proba(X_test)[:, 1]
knn_fpr, knn_tpr, knn_thresholds = metrics.roc_curve(y_test, knn_scores)
knn_auc = metrics.roc_auc_score(y_test, knn_scores)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with probability estimates enabled.
svm = SVC(gamma='auto', kernel='linear', probability=True)

# Fit on the TRAINING split. Fitting on X_test / y_test (as before) makes the
# accuracy reported in the next cell a training-set score, not a hold-out score.
svm.fit(X_train, y_train)

In [None]:
# Hard predictions of the SVM on the hold-out features.
y_pred = svm.predict(X_test)

# Evaluation -- NOTE: `precision` holds the accuracy percentage.
precision = 100 * metrics.accuracy_score(y_test, y_pred)
print("Accuracy with SVM: {0:.2f}%".format(precision))

# Starting again with modelling after addressing the class imbalance problem.

# Using Smote

SMOTE algorithm works in 4 simple steps:

1. Choose a minority class as the input vector
2. Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function)
3. Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor
4. Repeat the steps until data is balanced

So here we will apply the SMOTE technique to the training set only and then fit the model on the resampled data. We will not over- or under-sample the test set.

In [None]:
from collections import Counter
from imblearn.combine import SMOTEENN

# Class counts of the TRAINING split before resampling. Counting `y_train`
# (not the full `y`) makes the before/after printouts directly comparable.
print("This is our response variable {} before using any sampling techniques".format(Counter(y_train)))

# SMOTEENN combines SMOTE over-sampling with Edited-Nearest-Neighbours cleaning.
# A fixed random_state makes the synthetic samples reproducible between runs.
smote = SMOTEENN(random_state=42)

# Resample the training split only; the hold-out split is left untouched.
X_train_re, y_train_re = smote.fit_resample(X_train, y_train)

print("This is our response variable after using the sampling techniques {}.".format(Counter(y_train_re)))

# Scaling the dataset

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn per-column mean / variance from the resampled training features and
# standardise them with those statistics.
scaler = StandardScaler()
X_train_scale = scaler.fit(X_train_re).transform(X_train_re)

# Hold-out features must be passed through `scaler.transform` (never re-fitted)
# when they are scaled.

In [None]:
# Train on the resampled + scaled data, evaluate on the ORIGINAL hold-out split.
# Splitting the resampled data again would place SMOTE-generated rows (and an
# artificially balanced class ratio) in the evaluation set and leave the real
# hold-out data unused, so the reported scores would be over-optimistic.
X_train_1, y_train_1 = X_train_scale, y_train_re
# Reuse the scaler fitted on the training data; it is not refitted here.
X_test_1, y_test_1 = scaler.transform(X_test), y_test

In [None]:
# Print the shape of each train / evaluation piece, one per line.
a = [X_train_1, y_train_1, X_test_1, y_test_1]
print(*(part.shape for part in a), sep="\n")

# Using Logistic Regression again

In [None]:
# Logistic regression trained on the balanced, scaled training data.
logit_bal = LogisticRegression()
logit_bal.fit(X_train_1, y_train_1)

# Hard predictions on the evaluation features.
y_pred = logit_bal.predict(X_test_1)

# Scale to a percentage first and round afterwards: `round(x, 2) * 100` can
# print floating-point artefacts such as 56.99999999999999.
logit_bal_accuracy_pct = round(metrics.accuracy_score(y_test_1, y_pred) * 100, 2)
print("Accuracy score of the logistic regression {} .  ".format(logit_bal_accuracy_pct))

In [None]:
# Column 1 of predict_proba is the probability of logit_bal.classes_[1];
# it is used as the score for the ROC curve.
prob_predict = logit_bal.predict_proba(X_test_1)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test_1, prob_predict)

# Draw the ROC curve.
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()

# Using Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVC with its default kernel; probability=True is set at construction so that
# predict_proba is available after fitting.
model_2 = SVC(probability=True)
model_2.fit(X_train_1, y_train_1)

# Hard predictions and the per-class precision / recall / F1 report.
prediction = model_2.predict(X_test_1)
print(classification_report(y_test_1, prediction))


In [None]:
# Probability of model_2.classes_[1] for every evaluation row; a continuous
# score gives a smooth ROC curve.
prob_prediction = model_2.predict_proba(X_test_1)[:, 1]
# False / true positive rates across all score thresholds.
fpr, tpr, _ = metrics.roc_curve(y_test_1, prob_prediction)

In [None]:
# Draw the ROC curve from the rates computed in the previous cell.
plt.plot(fpr, tpr)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.show()

# Using GANs to generate artificial data because of the small dataset size

If you want to learn about them please have a look at this paper -https://arxiv.org/pdf/1907.00503.pdf

Or you can also read this Medium article, which provides a very good introduction to tabular GANs - https://towardsdatascience.com/review-of-gans-for-tabular-data-a30a2199342

In [None]:
 pip install tabgan

In [None]:
from tabgan.sampler import OriginalGenerator, GANGenerator

In [None]:
# Wrap the encoded features and the labels as DataFrames for tabgan.
# The same feature matrix is supplied as both the train and the test frame.
train = pd.DataFrame(X)
target = pd.DataFrame(y)
test = pd.DataFrame(X)

# Generate additional rows with each of the two tabgan generators.
new_train1, new_target1 = OriginalGenerator().generate_data_pipe(train, target, test)
new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test)