In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder


In [2]:
# Load the raw records from the CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [3]:
# Number of distinct (non-null) values in each column.
df.nunique(axis=0)

id                   43400
gender                   3
age                    104
hypertension             2
heart_disease            2
ever_married             2
work_type                5
Residence_type           2
avg_glucose_level    12543
bmi                    555
smoking_status           3
stroke                   2
dtype: int64

In [4]:
# Frequency of each smoking_status category (missing values are not counted).
df['smoking_status'].value_counts()

never smoked       16053
formerly smoked     7493
smokes              6562
Name: smoking_status, dtype: int64

In [5]:
# Count of missing values per column.
df.isna().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [6]:
# Class balance of the target column.
df['stroke'].value_counts()

0    42617
1      783
Name: stroke, dtype: int64

In [7]:
# Remove the identifier column, then discard every row that still has a
# missing value in any remaining column; display the result.
df = df.drop(columns=['id']).dropna()
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
6,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
7,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
8,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [8]:
# Shuffle all rows and renumber the index from 0.
# A fixed seed keeps the shuffled order (and therefore every file / table
# written from `df` afterwards) identical across Restart & Run All.
RANDOM_STATE = 42
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

In [9]:
# Lower-case the residence column name and keep only rows whose gender is
# not 'Other'. The index is not renumbered after the filter.
df = (
    df.rename(columns={'Residence_type': 'residence_type'})
      .loc[lambda frame: frame['gender'].ne('Other')]
)

In [10]:
# Persist the cleaned frame to disk; the index is not written.
df.to_csv(path_or_buf='clean_data.csv', index=False)

In [11]:
# Show the dtype of every column in the cleaned frame.
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [12]:
import os

import pandas as pd
import psycopg2
from sqlalchemy import Column, Float, Integer, MetaData, String, Table, create_engine
from sqlalchemy.ext.declarative import declarative_base

# Declarative base for ORM models (not referenced by the cells visible here).
Base = declarative_base()

# Database credentials should not live in the notebook: read the
# "user:password@host:port/database" part from the STROKE_DB_CONNECTION
# environment variable. The previous local-development value is kept as the
# fallback so the notebook still runs unchanged when the variable is unset.
connection_string = os.environ.get(
    "STROKE_DB_CONNECTION",
    "postgres:postgres@localhost:5432/stroke_db",
)
engine = create_engine(f'postgresql://{connection_string}')

In [13]:
# List the tables visible through the engine.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and also works on older versions.
from sqlalchemy import inspect as sa_inspect

sa_inspect(engine).get_table_names()

['stroke_data', 'Stroke_data']

In [14]:
# Insert the cleaned rows into the "stroke_data" table without the index.
# NOTE: with if_exists='append', re-running this cell inserts the rows again.
df.to_sql(
    "stroke_data",
    engine,
    index=False,
    if_exists='append',
)


In [15]:
# Replace each of these categorical text columns with integer codes,
# fitting a fresh LabelEncoder per column.
for label_col in ('ever_married', 'residence_type', 'gender'):
    df[label_col] = LabelEncoder().fit_transform(df[label_col])

In [16]:
# One-hot encode the multi-category columns; each category becomes its own
# indicator column named "<column>_<category>". Display the encoded frame.
df = pd.get_dummies(
    data=df,
    columns=['work_type', 'smoking_status'],
)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,50.0,0,0,1,1,114.76,20.8,0,0,0,0,1,0,0,1,0
1,1,47.0,0,1,1,1,82.79,33.7,0,0,0,1,0,0,1,0,0
2,1,42.0,1,0,1,1,72.22,28.6,0,0,0,1,0,0,0,0,1
3,0,20.0,0,0,0,1,177.57,18.4,0,0,0,1,0,0,0,1,0
4,1,62.0,1,0,1,0,125.90,33.2,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29067,0,82.0,0,0,0,1,87.19,24.3,0,0,0,1,0,0,0,1,0
29068,1,79.0,0,0,1,0,87.09,26.6,0,0,0,1,0,0,1,0,0
29069,1,72.0,0,0,1,1,87.29,30.8,0,0,0,0,1,0,0,0,1
29070,0,61.0,0,0,1,1,205.82,27.5,0,0,0,0,1,0,1,0,0


In [17]:
# Save the fully encoded, model-ready frame; the index is not written.
df.to_csv(path_or_buf='model_clean_data.csv', index=False)