## The Plan!

### First Model
- gender (1 for Male, 0 for Female)
- age (float)
- income (float)
- education (float)
- bmi (float)
- smokes (0/1)
- eats_fruits (0/1)
- eats_veggies (0/1)
- drinks_alcohol (0/1)
- physical_activity (0/1)

### Second Model
#### Attributes from first model
- gender (1 for Male, 0 for Female)
- age (float)
- income (float)
- education (float)
- bmi (float)
- smokes (0/1)
- eats_fruits (0/1)
- eats_veggies (0/1)
- drinks_alcohol (0/1)
- physical_activity (0/1)
#### New attributes
- diabetes (0/1)
- high_blood_pressure (0/1)
- high_cholesterol (0/1)
- cholesterol_checked (0/1)
- heart_disease (0/1)
- general_health (float)
- mental_health (float)
- physical_health (float)
- difficulty_walking (0/1)
- has_healthcare (0/1)
- medical_costs (0/1)

In [49]:
import pandas as pd

# Load the raw survey extract; column names are still the source dataset's.
df = pd.read_csv("strokes2.csv")

print(f"Total of {df.shape[0]} entries")
# Count positives with a boolean sum instead of materialising a filtered frame.
print(f"{(df['Stroke'] == 1).sum()} of these had strokes")

# Source column name -> snake_case name used throughout the rest of the notebook.
new_column_names = {
    "Diabetes_binary": "diabetes",
    "HighBP": "high_blood_pressure",
    "HighChol": "high_cholesterol",
    "CholCheck": "cholesterol_checked",
    "BMI": "bmi",
    "Smoker": "smokes",
    "Stroke": "stroke",
    "HeartDiseaseorAttack": "heart_disease",
    "PhysActivity": "physical_activity",
    "Fruits": "eats_fruits",
    "Veggies": "eats_veggies",
    "HvyAlcoholConsump": "drinks_alcohol",
    "AnyHealthcare": "has_healthcare",
    "NoDocbcCost": "medical_costs",
    "GenHlth": "general_health",
    "MentHlth": "mental_health",
    "PhysHlth": "physical_health",
    "DiffWalk": "difficulty_walking",
    "Sex": "gender",
    "Age": "age",
    "Education": "education",
    "Income": "income",
}

# Pass the mapping directly: columns missing from the mapping are left
# untouched instead of raising a bare KeyError from inside a lambda.
df = df.rename(columns=new_column_names)

df.head()

Total of 70692 entries
4395 of these had strokes


Unnamed: 0,diabetes,high_blood_pressure,high_cholesterol,cholesterol_checked,bmi,smokes,stroke,heart_disease,physical_activity,eats_fruits,...,has_healthcare,medical_costs,general_health,mental_health,physical_health,difficulty_walking,gender,age,education,income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [50]:
import random

# Build a class-balanced frame: keep every stroke row and down-sample the
# non-stroke rows to the same count.
with_stroke = df[df["stroke"] == 1]

# Fixed seed so the same non-stroke rows are drawn on every run.
random.seed(1234)
# These are index *labels* of the non-stroke rows, not positions.
random_choices = random.sample(df.index[df["stroke"] == 0].tolist(), len(with_stroke))
# Select by label with .loc; the positional .take() only matched these labels
# while df happened to carry a default 0..n-1 index.
without_stroke = df.loc[random_choices]

# reset_index() (without drop=True) keeps each row's original label in an
# "index" column of balanced_df.
balanced_df = pd.concat([with_stroke, without_stroke], axis=0).reset_index()


In [51]:
# Features for the first model.
general_columns = [
    "gender",
    "age",
    "income",
    "education",
    "bmi",
    "smokes",
    "eats_fruits",
    "eats_veggies",
    "drinks_alcohol",
    "physical_activity",
]

# Features for the second model: everything in general_columns plus the
# additional attributes listed below.
specific_columns = [
    *general_columns,
    "diabetes",
    "high_blood_pressure",
    "high_cholesterol",
    "cholesterol_checked",
    "heart_disease",
    "general_health",
    "mental_health",
    "physical_health",
    "difficulty_walking",
    "has_healthcare",
    "medical_costs",
]

# Target column and the two feature matrices, all taken from balanced_df.
Y = balanced_df["stroke"]
general_x, specific_x = balanced_df[general_columns], balanced_df[specific_columns]

# Write each piece to its own CSV, omitting the row index.
for frame, filename in (
    (Y, "Y.csv"),
    (general_x, "general_x.csv"),
    (specific_x, "specific_x.csv"),
):
    frame.to_csv(filename, index=False)