In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
import tensorflow as tf
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.image as mpimg
from sqlalchemy import create_engine
from config import db_password
import psycopg2

In [2]:
# Import data from the PostgreSQL database into a DataFrame.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" scheme and fails with NoSuchModuleError.
# NOTE(review): db_password is interpolated verbatim; a password containing
# URL-reserved characters such as '@' or '/' would need escaping — confirm.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/strokes_db"
engine = create_engine(db_string)
stroke_df = pd.read_sql_table('total_stroke_data', con=engine)

stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,18069,Male,70.0,1,0,Yes,Self-employed,Urban,104.24,34.7,formerly smoked,0
1,49086,Female,23.0,0,0,No,Private,Urban,60.5,27.1,formerly smoked,0
2,19671,Female,58.0,0,0,Yes,Govt_job,Urban,93.15,34.7,never smoked,0
3,59225,Male,48.0,1,0,Yes,Govt_job,Urban,55.25,49.7,never smoked,0
4,25175,Female,56.0,0,0,No,Private,Rural,108.5,28.0,never smoked,0


In [3]:
# Drop every row that contains at least one missing (NaN) value.
# All arguments previously spelled out were the defaults (axis=0, how='any');
# newer pandas releases raise TypeError when `how` and `thresh` are both passed,
# so the call relies on the defaults instead.
stroke_df = stroke_df.dropna()

In [4]:
# Total count of missing cells across every column of the frame.
missing_total = stroke_df.isnull().to_numpy().sum()
print('\nMissing values : ', missing_total)


Missing values :  0


In [5]:
# Categorical columns followed by the target column `stroke`, which must stay last
# because everything before it is treated as a predictor below.
cat_df = stroke_df[['gender','hypertension','heart_disease','ever_married','residence_type','smoking_status','stroke']]
predictors = cat_df.columns[:-1]

# Build one predictor-vs-stroke contingency table per column, then stack them
# under an outer index level carrying the predictor's name.
tables = []
for col in predictors:
    tables.append(pd.crosstab(cat_df[col], cat_df.stroke))
summary = pd.concat(tables, keys=predictors)
summary.head(30)

Unnamed: 0,stroke,0,1
gender,Female,18335,328
gender,Male,11636,249
gender,Other,7,0
hypertension,0,26856,415
hypertension,1,3122,162
heart_disease,0,28550,452
heart_disease,1,1428,125
ever_married,No,8200,60
ever_married,Yes,21778,517
residence_type,Rural,14932,283


In [6]:
# Number of distinct values in each column (missing values are not counted).
stroke_df.nunique(axis=0, dropna=True)

id                   30555
gender                   3
age                    104
hypertension             2
heart_disease            2
ever_married             2
work_type                5
residence_type           2
avg_glucose_level    11218
bmi                    530
smoking_status           4
stroke                   2
dtype: int64

In [7]:
# Keep only the rows whose bmi is above 25 and whose age is above 40.
bmi_over_25 = stroke_df["bmi"] > 25
age_over_40 = stroke_df["age"] > 40
selected_data = stroke_df.loc[bmi_over_25 & age_over_40]
selected_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,18069,Male,70.0,1,0,Yes,Self-employed,Urban,104.24,34.7,formerly smoked,0
2,19671,Female,58.0,0,0,Yes,Govt_job,Urban,93.15,34.7,never smoked,0
3,59225,Male,48.0,1,0,Yes,Govt_job,Urban,55.25,49.7,never smoked,0
4,25175,Female,56.0,0,0,No,Private,Rural,108.5,28.0,never smoked,0
6,67653,Female,59.0,0,0,Yes,Private,Urban,152.38,38.5,formerly smoked,0


In [21]:
# Non-missing value count for each column of the filtered subset.
selected_data.count(axis=0)

id                   2228
gender               2228
age                  2228
hypertension         2228
heart_disease        2228
ever_married         2228
work_type            2228
Residence_type       2228
avg_glucose_level    2228
bmi                  2228
smoking_status       2228
stroke               2228
dtype: int64

In [23]:
# Write the filtered subset to CSV in the working directory.
# index=True (the pandas default) keeps the row index as the first, unnamed column.
selected_data.to_csv("Stroke_selected_data.csv", index=True)