# 1-Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# from plotly.subplots import make_subplots as ms
from datasist.structdata import detect_outliers
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import csv

## 2-Read & Understand Data 

In [None]:
# Load the raw survey CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs on another machine after the path is adjusted.
data = pd.read_csv(r"C:\Users\Lenovo\Desktop\mid\Heart_Attack_project\Sourse\heart_2022_with_nans.csv")

In [None]:
print(data.shape)
data

In [None]:
# """
# Cell generated by Data Wrangler.
# """
# def clean_data(data):
#     # Sort by column: 'physicalhealthdays' (descending)
#     data = data.sort_values(['physicalhealthdays'], ascending=[False])
#     return data

# data_clean = clean_data(data.copy())
# data_clean.head()

##### 2.1-Change column names to lowercase to be easier to work with

In [None]:
data.columns = data.columns.str.strip().str.lower()
data.columns

##### 2.2-Show summary of data

In [None]:
data.info()

In [None]:
data.describe()

##### 2.3-Show number of unique values for each column and their unique values

In [None]:
# For every column, print how many distinct values it holds and list them.
thin_rule = '-' * 30
thick_rule = '=' * 30
for col in data.columns:
    column_values = data[col]
    print('Column Name: ', col)
    print(column_values.nunique())
    print(thin_rule)
    print(column_values.unique())
    print(thick_rule)

# 3-Data Cleaning

#### 3.1-Dropping columns which we don't need.

In [None]:
# Columns that are not used by the analysis below.
unused_columns = [
    'lastcheckuptime',
    'removedteeth',
    'chestscan',
    'hivtesting',
    'fluvaxlast12',
    'pneumovaxever',
    'tetanuslast10tdap',
]
data = data.drop(columns=unused_columns)

#### 3.2-Check for missing values

In [None]:
data.isna().sum()

In [None]:
# Horizontal bar chart: number of missing values in each column.
na_counts = data.isna().sum()
axis_labels = {'x': 'Count', 'y': 'Column'}
fig = px.bar(
    x=na_counts.values,
    y=na_counts.index,
    orientation='h',
    title='Missing Values',
    labels=axis_labels,
)
fig.update_layout(template='plotly_white')
fig.show()

#### 3.3-Drop null values in hadheartattack column as it is the target variable so we can not impute it

In [None]:
data.dropna(subset=['hadheartattack'], inplace=True)
data.reset_index(drop=True, inplace=True)

#### 3.4-Check & Drop duplicates

In [None]:
data.duplicated().sum()

In [None]:
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

#### 3.5-Replaces missing values in numeric columns of a dataset with the median value of each column.

In [None]:
# Fill the gaps in every numeric column with that column's own median.
imputer = SimpleImputer(strategy='median')
numeric_columns = data.select_dtypes(include=['number']).columns
for col in numeric_columns:
    column_frame = data[[col]]  # single-column DataFrame (2-D input for the imputer)
    data[col] = imputer.fit_transform(column_frame)

#### 3.6-Replaces missing values in some of categorical columns of a dataset with the mode value of each column.

In [None]:
# Fill the gaps in the selected categorical columns with each column's most
# frequent value; the remaining categorical columns are left untouched here.
imputer = SimpleImputer(strategy='most_frequent')
categorical_columns = [
    'covidpos', 'highrisklastyear', 'alcoholdrinkers', 'ecigaretteusage',
    'smokerstatus', 'difficultyerrands', 'difficultydressingbathing',
    'difficultywalking', 'difficultyconcentrating', 'blindorvisiondifficulty',
    'deaforhardofhearing',
]
new_data = data[categorical_columns].copy()
for col in categorical_columns:
    filled = imputer.fit_transform(new_data[[col]])
    data[col] = filled.ravel()  # back to 1-D before assigning to the column


##### Here I tried to fill missing values using the KNN imputation algorithm, but it took more than 20 minutes to run, so I commented it out.

In [None]:
# from sklearn.impute import KNNImputer
# numeric_columns = data.select_dtypes(include=['number']).columns
# imputer = KNNImputer(n_neighbors=3)
# data[numeric_columns] = imputer.fit_transform(data[numeric_columns])
# data.isna().sum()/len(data)*100

##### 3.7-Remove rows with missing values

In [None]:
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

##### 3.8-Checking for null values again to be sure we don't have any more null values

In [None]:
data.isnull().sum() / len(data) * 100

##### 3.9-Check and remove outliers

In [None]:
# One box plot per body-measurement column to eyeball the outliers.
measurement_names = ['sleephours', 'weightinkilograms', 'heightinmeters', 'bmi']
data_columns = data[measurement_names]

for col in data_columns.columns:
    fig = px.box(data, x=col)
    fig.show()

In [None]:
# Detect and remove outliers using detect_outliers from the datasist.structdata library.
# NOTE(review): the second argument (0) is presumably the number of outlying
# features a row may have before it is flagged — confirm against the datasist docs.
outliers_indices = detect_outliers(data, 0, ['sleephours', 'weightinkilograms', 'heightinmeters', 'bmi'])
print(len(outliers_indices))  # how many rows are about to be dropped
data.drop(outliers_indices, inplace=True)
data.reset_index(inplace= True, drop= True)  # renumber the rows after the drop

In [None]:
# Redraw the same box plots to check the result of the outlier removal.
measurement_names = ['sleephours', 'weightinkilograms', 'heightinmeters', 'bmi']
data_columns = data[measurement_names]

for col in data_columns.columns:
    fig = px.box(data, x=col)
    fig.show()

In [None]:
# Show descriptive statistics after removing outliers
data.describe()

In [None]:
# Show shape after removing outliers and cleaning the data
data.shape

## 4-Feature Engineering

##### 4.1-Handle some categorical columns variables

In [None]:
# Shorten verbose survey answers so they read well in tables and chart legends.
# Each result is assigned back to its column: calling replace(..., inplace=True)
# on data['col'] is chained assignment, which pandas warns about and which leaves
# `data` unchanged when Copy-on-Write is enabled.
data['haddiabetes'] = data['haddiabetes'].replace({
    'No, pre-diabetes or borderline diabetes': 'Borderline',
    'Yes, but only during pregnancy (female)': 'During Pregnancy',
})

data['smokerstatus'] = data['smokerstatus'].replace({
    'Current smoker - now smokes some days': 'Current smoker(Some days)',
    'Current smoker - now smokes every day': 'Current smoker(Every day)',
})

data['ecigaretteusage'] = data['ecigaretteusage'].replace({
    'Not at all (right now)': 'Not at all',
    'Never used e-cigarettes in my entire life': 'Never',
    'Use them every day': 'Everyday',
    'Use them some days': 'Somedays',
})

data['raceethnicitycategory'] = data['raceethnicitycategory'].replace({
    'White only, Non-Hispanic': 'White',
    'Black only, Non-Hispanic': 'Black',
    'Other race only, Non-Hispanic': 'Other Race',
    'Multiracial, Non-Hispanic': 'Multi Racial',
})

# A positive home test is folded into the plain 'Yes' answer.
data['covidpos'] = data['covidpos'].replace({
    'Tested positive using home test without a health professional': 'Yes',
})


##### 4.2-Change the format of the Age column to make it clearer

In [None]:
def handle_age(age):
    """Convert survey age labels into compact ranges.

    ``'Age 18 to 24'`` becomes ``'18-24'`` and ``'Age 80 or older'`` becomes
    ``'80+'``. Values that match neither pattern (including labels that were
    already converted, and missing values) are returned unchanged, so the
    function can safely be applied more than once.

    Parameters
    ----------
    age : pandas.Series
        Series of age-category strings. It is not modified.

    Returns
    -------
    pandas.Series
        New Series with the reformatted labels.
    """
    # Handle the open-ended top bracket first; it has no upper bound to extract.
    age = age.mask(age == 'Age 80 or older', '80+')
    # Only full "Age <low> to <high>" labels are rewritten; anything else is kept as is.
    return age.str.replace(r'^\s*Age\s+(\d+)\s+to\s+(\d+)\s*$', r'\1-\2', regex=True)

data['agecategory'] = handle_age(data['agecategory'])
 

In [None]:
# from sklearn.preprocessing import LabelEncoder

# cat_data = data[['generalhealth', 'physicalactivities', 'hadheartattack', 'hadangina',
#                  'hadstroke', 'hadasthma', 'hadskincancer', 'hadcopd',
#                  'haddepressivedisorder', 'hadkidneydisease', 'hadarthritis',
#                  'haddiabetes', 'deaforhardofhearing', 'blindorvisiondifficulty',
#                  'difficultyconcentrating', 'difficultywalking', 'difficultydressingbathing',
#                  'difficultyerrands', 'smokerstatus', 'ecigaretteusage', 'raceethnicitycategory',
#                  'agecategory', 'highrisklastyear', 'covidpos']]

# label_encoder = LabelEncoder()

# for col in cat_data.columns:
#     data[col] = label_encoder.fit_transform(data[col])

##### save cleaned data to csv file

In [None]:
# save cleaned data to csv file 
# cleaned_data = data.to_csv('cleaned_data.csv', index=False)

# 5-Data Analysis

##### 5.1-Show distribution of gender

In [None]:
gender_count = data['sex'].value_counts()
gender_count

In [None]:
# plot pie chart for gender distribution
px.pie(data, names='sex', color_discrete_sequence=px.colors.sequential.Cividis)

##### 5.2-Show distribution of age

In [None]:
age_category_count = data['agecategory'].value_counts()
age_category_count

In [None]:
# plot histogram for age distribution
px.histogram(data, x='agecategory', marginal='box')

##### 5.3-observing the prevalence of heart diseases among different genders.

In [None]:
# pivot table for prevalence of heart attack among different genders
counts = data.groupby(['hadheartattack', 'sex']).size().reset_index(name='count')
counts

In [None]:
# Grouped bars: heart-attack answers split by sex.
bar_options = dict(
    x='hadheartattack',
    y='count',
    color='sex',
    barmode='group',
    template='plotly_dark',
    title="Prevalence of Heart Attacks Among Different Genders",
    labels={'hadheartattack': 'Had Heart Attack', 'count': 'Count'},
)
fig = px.bar(counts, **bar_options)
fig.show()

##### 5.4-observing the prevalence of heart diseases among different ages.

In [None]:
# Grouped bars: heart-attack answers within each age category.
counts = (
    data.groupby(['hadheartattack', 'agecategory'])
    .size()
    .reset_index(name='count')
)
axis_labels = {'hadheartattack': 'Had Heart Attack', 'count': 'Count', 'agecategory': 'Age Category'}
fig = px.bar(
    counts, x='agecategory', y='count', color='hadheartattack',
    barmode='group', template='plotly_dark',
    title="Prevalence of Heart Attacks Among Different Ages",
    labels=axis_labels,
)
fig.show()

#### From the previous analysis:
>>*  From previous cells we note that females are more than males in the sample data population.

>>* From previous cells we note that most individuals are aged around 65-69.

>>* Many individuals did not have any heart disease.

>>* Many of the individuals who have heart disease are male, even though there are fewer males than females in the data.

>>* People who are older than 80 are the largest group to suffer from heart disease.

>>* People aged 18-24 and 25-29 are the groups least affected by heart disease.

#### ----------------------------------------------------------------------------------------------

# 5.5- Life style Analysis:

##### In life style analysis We will study ['physicalactivities', 'smokerstatus', 'ecigaretteusage', 'alcoholdrinkers', 'sleephours'] columns.

##### 5.5.1-Displaying the count of values in each column for life style data.

In [None]:
# Percentage share of each answer in the lifestyle columns.
life_style_df = data[['physicalactivities', 'smokerstatus', 'ecigaretteusage', 'alcoholdrinkers']]
for col in life_style_df.columns:
    shares = data[col].value_counts(normalize=True) * 100
    print(shares, end='\n\n')

##### 5.5.2-Explore the relation between each life style factor with gender.

In [None]:
# Grouped bars: how each lifestyle answer splits between the sexes.
for col in life_style_df.columns:
    counts = data.groupby([col, 'sex']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Genders"
    axis_labels = {col: col, 'count': 'Count'}

    fig = px.bar(
        counts, x=col, y='count', color='sex',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

#### In this part we find:
>>* Many people engage in physical activities, but the largest percentage are women.

>>* The percentage of smokers is fairly similar between men and women, but we must be careful because the percentage of women in our data is higher than men.

>>* The rate of quitting smoking is higher in men.

>>* The percentage of e-cigarette smokers is higher among men than women.

>>* The percentage of alcohol users in our data is especially high among males.

##### 5.5.3-Explore the relation between each life style factor with heart disease status.

In [None]:
# Grouped bars: heart-attack answers within each lifestyle answer.
for col in life_style_df.columns:
    counts = data.groupby([col, 'hadheartattack']).size().reset_index(name='count')
    chart_title = f"Prevalence of Heart Attacks Among Different {col} "
    axis_labels = {'hadheartattack': 'Had Heart Attack', 'count': 'Count', col: col}

    fig = px.bar(
        counts, x=col, y='count', color='hadheartattack',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

#### From the previous analysis:
>>* The largest percentage of people who exert physical effort do not suffer from heart disease.

>>* Quite many of those who quit smoking have heart disease, and the largest percentage of those who do not smoke do not have any heart disease.

>>* There is almost no relationship between alcohol and heart disease because the percentage of alcohol users who are sick is close to those who do not drink and are sick.

In [None]:
def calculate_percentage(data, group_columns, count_column, percentage_column):
    """Count rows per group and express each count as a share of its last-column total.

    Rows of ``data`` are counted for every combination of ``group_columns``.
    Each count is then divided by the total count of all groups that share the
    same value in the LAST grouping column, giving e.g. "of all rows with this
    smoker status, what share had a heart attack".

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    group_columns : list of str or str
        Columns to group by; the last one defines the denominator.
    count_column : str
        Unused; kept so existing calls keep working. The count column of the
        result is always named ``'count'``.
    percentage_column : str
        Name of the percentage column added to the result.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping columns, ``'count'`` and
        ``percentage_column``. The percentage is rounded to two decimals and
        then truncated to a whole number, so 66.67 is stored as 66.
    """
    # A bare column name would otherwise be indexed character by character below.
    if isinstance(group_columns, str):
        group_columns = [group_columns]
    else:
        group_columns = list(group_columns)

    grouped_data = data.groupby(group_columns).size().reset_index(name='count')

    # One vectorised pass instead of re-filtering the whole table for every row.
    totals = grouped_data.groupby(group_columns[-1])['count'].transform('sum')
    share = grouped_data['count'] / totals * 100

    # A zero total (possible with unobserved categories) gives NaN; report it as 0.
    grouped_data[percentage_column] = share.round(2).fillna(0).astype(int)
    return grouped_data


In [None]:
hadheartattack_smoker = calculate_percentage(data, ['hadheartattack', 'smokerstatus'], 'count', 'percentage')
hadheartattack_smoker

In [None]:
# Compare the heart-attack counts of never-smokers and every-day smokers.
# NOTE(review): .iloc[1] picks the second row for each smoking status, which
# relies on the row order of hadheartattack_smoker (presumably the 'Yes' row) — confirm.
smoker_status = hadheartattack_smoker['smokerstatus']
never_count = hadheartattack_smoker.loc[smoker_status == 'Never smoked', 'count'].iloc[1]
everyday_count = hadheartattack_smoker.loc[smoker_status == 'Current smoker(Every day)', 'count'].iloc[1]

pie_options = dict(
    title='Prevalence of smoking status among had heart attack status',
    color_discrete_sequence=px.colors.sequential.Blues_r,
    template='plotly_dark',
)
px.pie(names=['Never', 'Everyday'], values=[never_count, everyday_count], **pie_options)

In [None]:
hadheartattack_ecigarette = calculate_percentage(data, ['hadheartattack', 'ecigaretteusage'], 'count', 'percentage')
hadheartattack_ecigarette

In [None]:
# Compare the heart-attack counts of people who never used e-cigarettes and
# people who use them every day.
# NOTE(review): .iloc[1] picks the second row for each usage level, which relies on
# the row order of hadheartattack_ecigarette (presumably the 'Yes' row) — confirm.
never_count = hadheartattack_ecigarette.loc[hadheartattack_ecigarette['ecigaretteusage'] == 'Never', 'count'].iloc[1]
everyday_count = hadheartattack_ecigarette.loc[hadheartattack_ecigarette['ecigaretteusage'] == 'Everyday', 'count'].iloc[1]

# The title previously mentioned genders, but the chart compares e-cigarette
# usage by heart-attack counts.
px.pie(names=['Never', 'Everyday'],
       values=[never_count, everyday_count],
       title='Prevalence of e-cigarette usage among had heart attack status',
       color_discrete_sequence=px.colors.sequential.Blues_r,
       template='plotly_dark')

In [None]:
hadheartattack_alcohol = calculate_percentage(data, ['hadheartattack', 'alcoholdrinkers'], 'count', 'percentage')
hadheartattack_alcohol

In [None]:
hadheartattack_physicalactivities = calculate_percentage(data, ['hadheartattack', 'physicalactivities'], 'count', 'percentage')
hadheartattack_physicalactivities

#### From the previous analysis:
>>* The percentage of people with a heart attack who smoke regularly every day is approximately two and a half times the percentage of non-smokers.

>>* The percentage of people who quit smoking and have had a heart attack is close to that of people who smoke regularly, which suggests that they quit smoking because of the disease.

>>* The percentage of alcohol drinkers who suffer from heart attacks is half that of those who do not drink alcohol.

>>* The percentage of those who exert physical effort who suffer from heart attacks is half the percentage of those who do not exert physical effort.

##### 5.5.4-Explore the relation between each life style factor with age.

In [None]:
# Grouped bars: each lifestyle answer within every age category.
for col in life_style_df.columns:
    counts = data.groupby([col, 'agecategory']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Ages"
    axis_labels = {col: col, 'count': 'Count', 'agecategory': 'Age Category'}

    fig = px.bar(
        counts, x='agecategory', y='count', color=col,
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

#### In this part:
>>* A large percentage of our age category engage in physical activity.

>>* Most of people in our data either do not smoke or have quit smoking.

>>* Most of them drink alcohol especially people between 55-74 but this may be because this is the largest age category in our data.

##### 5.5.5-Show the distribution of sleep hours.

In [None]:
# plot pie chart for distribution of sleep hours
px.pie(data, names='sleephours', title='Distribution of Sleep Hours', template='plotly_dark', hole=0.5, color_discrete_sequence=px.colors.sequential.Cividis)

In [None]:
hadheartattack_sleephours = calculate_percentage(data, ['hadheartattack', 'sleephours'], 'count', 'percentage')
hadheartattack_sleephours


#### From the previous analysis:
>>* Many people sleep between 6 to 8 hours, which is the normal rate.

>>* People who sleep an average of 7 hours are less likely to have heart attacks.

>>* People who sleep less than 6 hours or more than 8 hours are more susceptible to heart attacks.

##### ----------------------------------------------------------------

# 5.6- Chronic Disease Analysis:

##### In the chronic disease analysis we will study the ['hadangina', 'hadstroke', 'hadasthma', 'hadskincancer', 'hadcopd', 'haddepressivedisorder', 'hadkidneydisease', 'hadarthritis', 'haddiabetes'] columns.

##### 5.6.1-Displaying the count of values in each column for chronic disease data.

In [None]:
# Percentage share of each answer in the chronic-disease columns.
chronic_disease_names = [
    'hadangina', 'hadstroke', 'hadasthma', 'hadskincancer', 'hadcopd',
    'haddepressivedisorder', 'hadkidneydisease', 'hadarthritis', 'haddiabetes',
]
cronic_diseas_df = data[chronic_disease_names]
for col in cronic_diseas_df.columns:
    shares = data[col].value_counts(normalize=True) * 100
    print(shares, end='\n\n')

##### 5.6.2-Explore the relation between each chronic disease factor and gender.

In [None]:
# Grouped bars: how each chronic-disease answer splits between the sexes.
for col in cronic_diseas_df.columns:
    counts = data.groupby([col, 'sex']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Genders"
    axis_labels = {col: col, 'count': 'Count'}

    fig = px.bar(
        counts, x=col, y='count', color='sex',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

##### 5.6.3-Explore the relation between each chronic disease factor and heart attack.

In [None]:
# Grouped bars: heart-attack answers within each chronic-disease answer.
for col in cronic_diseas_df.columns:
    counts = data.groupby([col, 'hadheartattack']).size().reset_index(name='count')
    chart_title = f"Prevalence of Heart Attacks Among Different {col} "
    axis_labels = {'hadheartattack': 'Had Heart Attack', 'count': 'Count', col: col}

    fig = px.bar(
        counts, x=col, y='count', color='hadheartattack',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

##### 5.6.4-Explore the relation between each chronic disease factor and age.

In [None]:
# Grouped bars: each chronic-disease answer within every age category.
for col in cronic_diseas_df.columns:
    counts = data.groupby([col, 'agecategory']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Ages"
    axis_labels = {col: col, 'count': 'Count', 'agecategory': 'Age Category'}

    fig = px.bar(
        counts, x='agecategory', y='count', color=col,
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

In [None]:
# Grouped bars: general-health ratings within each chronic-disease answer.
for col in cronic_diseas_df.columns:
    counts = data.groupby([col, 'generalhealth']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different General Health Measures"
    axis_labels = {col: col, 'count': 'Count', 'generalhealth': 'General Health'}
    fig = px.bar(
        counts, x=col, y='count', color='generalhealth',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

#### From the previous analysis:
>>* The percentage of males and females suffering from these chronic diseases is very similar.

>>* There is no strong relationship between heart attacks and these diseases.

>>* Most of these diseases are prevalent in people over the age of 60.

##### ------------------------------------------------------------------------------------------

# 5.7-Other Problems Analysis:

##### In other problems analysis We will study ['deaforhardofhearing', 'blindorvisiondifficulty', 'difficultyconcentrating', 'difficultywalking', 'difficultydressingbathing', 'difficultyerrands'] columns.

##### 5.7.1-Displaying the count of values in each column for other problems data.

In [None]:
# Percentage share of each answer in the difficulty / impairment columns.
other_problem_names = [
    'deaforhardofhearing', 'blindorvisiondifficulty', 'difficultyconcentrating',
    'difficultywalking', 'difficultydressingbathing', 'difficultyerrands',
]
other_problems_df = data[other_problem_names]
for col in other_problems_df.columns:
    shares = data[col].value_counts(normalize=True) * 100
    print(shares, end='\n\n')

##### 5.7.2-Explore the relation between each other problems factor with gender.

In [None]:
# Grouped bars: how each difficulty answer splits between the sexes.
for col in other_problems_df.columns:
    counts = data.groupby([col, 'sex']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Genders"
    axis_labels = {col: col, 'count': 'Count'}

    fig = px.bar(
        counts, x=col, y='count', color='sex',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()


##### 5.7.3-Explore the relation between each other problems factor with heart attack.

In [None]:
# Grouped bars: heart-attack answers within each difficulty answer.
for col in other_problems_df.columns:
    counts = data.groupby([col, 'hadheartattack']).size().reset_index(name='count')
    chart_title = f"Prevalence of Heart Attacks Among Different {col} "
    axis_labels = {'hadheartattack': 'Had Heart Attack', 'count': 'Count', col: col}

    fig = px.bar(
        counts, x=col, y='count', color='hadheartattack',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

##### 5.7.4-Explore the relation between each other problems factor with age.

In [None]:
# Grouped bars: each difficulty answer within every age category.
for col in other_problems_df.columns:
    counts = data.groupby([col, 'agecategory']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different Ages"
    axis_labels = {col: col, 'count': 'Count', 'agecategory': 'Age Category'}

    fig = px.bar(
        counts, x='agecategory', y='count', color=col,
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

In [None]:
# Grouped bars: general-health ratings within each difficulty answer.
for col in other_problems_df.columns:
    counts = data.groupby([col, 'generalhealth']).size().reset_index(name='count')
    chart_title = f"Prevalence of {col} Among Different General Health Measures"
    axis_labels = {col: col, 'count': 'Count', 'generalhealth': 'General Health'}
    fig = px.bar(
        counts, x=col, y='count', color='generalhealth',
        barmode='group', template='plotly_dark',
        title=chart_title, labels=axis_labels,
    )
    fig.show()

#### From the previous analysis:
>>* The percentage of males and females suffering from these difficulties is very similar.

>>* There is no strong relationship between heart attacks and these difficulties.

>>* Most of these difficulties are prevalent in people over the age of 50, except difficulty concentrating, which is almost the same in all age categories.

##### ------------------------------------------------------------------------------------

# 5.8 Other General Analysis

##### 5.8.1-Looking at some other personal factors among different genders and age groups, such as Physical Health Days, Mental Health Days, and Sleep Hours.

In [None]:
# pivot table for distribution of personal factors among different genders 
personal_factor_among_genders = data.groupby('sex').agg({'physicalhealthdays':'mean', 'mentalhealthdays':'mean', 'sleephours':'mean'})
personal_factor_among_genders

In [None]:
# One donut chart per personal factor, showing each sex's mean value as a slice.
dff = data[['physicalhealthdays', 'mentalhealthdays', 'sleephours']]

for col in dff.columns:
    chart_title = f"Distribution of {col} Among Different Genders"
    fig = px.pie(
        personal_factor_among_genders,
        values=col,
        names=personal_factor_among_genders.index,
        color=col,
        title=chart_title,
        template='plotly_dark',
        hole=0.5,
        color_discrete_sequence=px.colors.sequential.Cividis,
    )
    fig.show()

#### From the previous analysis:
>>* Females sleep better than males, which makes them less susceptible to heart disease.

>>* Females experience more days of physical and mental fatigue than males.

In [None]:
# pivot table for distribution of personal factors among different ages
personal_factor_among_agegroups = data.groupby('agecategory').agg({'physicalhealthdays':'mean', 'mentalhealthdays':'mean', 'sleephours':'mean'})
personal_factor_among_agegroups

In [None]:
# plot line chart for distribution of personal factors among different ages
px.line(personal_factor_among_agegroups, x=personal_factor_among_agegroups.index, y=['physicalhealthdays', 'mentalhealthdays', 'sleephours'], template='plotly_dark')

#### From the previous analysis:
>>* People older than 50 are most susceptible to physical fatigue.

>>* Young people between the ages of 18 and 45 suffer the most from mental problems.

In [None]:
# pivot table for distribution of personal factors among different general health
personal_factor_among_gneral_health = data.groupby('generalhealth').agg({'physicalhealthdays':'mean', 'mentalhealthdays':'mean', 'sleephours':'mean'})
personal_factor_among_gneral_health

In [None]:
# plot line chart for distribution of personal factors among different general health
px.line(personal_factor_among_gneral_health, x=personal_factor_among_gneral_health.index, y=['physicalhealthdays', 'mentalhealthdays', 'sleephours'], template='plotly_dark')

#### From the previous analysis:
>>* The healthiest people sleep for approximately 7 hours. They suffer from physical and mental problems at a rate of only one to two times a month. The fewer hours of sleep and the greater the number of times the problems recur, the less healthy they are.

In [None]:
# Grouped bars: number of respondents per general-health rating, split by sex.
counts = data.groupby(['sex', 'generalhealth']).size().reset_index(name='count')

# The title previously said "Heart Attacks", but this chart shows general health.
fig = px.bar(counts, x='sex', y='count', color='generalhealth',
             title="General Health Among Different Genders",
             labels={'generalhealth': 'General Health', 'count': 'Count', 'sex': 'Sex'},
             barmode='group',
             template='plotly_dark')

fig.show()


In [None]:
# Grouped bars: number of respondents per general-health rating for each sleep duration.
counts = data.groupby(['sleephours', 'generalhealth']).size().reset_index(name='count')

# The title previously said "Heart Attacks", but this chart shows general health.
fig = px.bar(counts, x='sleephours', y='count', color='generalhealth',
             title="General Health Among Different Sleep Hours",
             labels={'generalhealth': 'General Health', 'count': 'Count', 'sleephours': 'Sleep Hours'},
             barmode='group',
             template='plotly_dark')
# Overlay a line for the first general-health trace only.
# NOTE(review): .data[0] adds just one category's line — confirm that this is intended.
fig.add_trace(px.line(counts, x='sleephours', y='count', color='generalhealth').data[0])

fig.show()

In [None]:
# Grouped bars: number of respondents per general-health rating in each age category.
counts = data.groupby(['agecategory', 'generalhealth']).size().reset_index(name='count')

# The title previously said "Heart Attacks", but this chart shows general health.
fig = px.bar(counts, x='agecategory', y='count', color='generalhealth',
             title="General Health Among Different Age Categories",
             labels={'generalhealth': 'General Health', 'count': 'Count', 'agecategory': 'Age Category'},
             barmode='group',
             template='plotly_dark')
# Overlay a line for the first general-health trace only.
# NOTE(review): .data[0] adds just one category's line — confirm that this is intended.
fig.add_trace(px.line(counts, x='agecategory', y='count', color='generalhealth').data[0])

fig.show()

#### From the previous analysis:
>>* The general health of females is better than that of males.

>>* The general health of those who sleep 6 to 8 hours is the highest among all.

>>* Most of those who suffer from poor general health are those over 50.

In [None]:
# Top 10 states by number of respondents who reported a heart attack.
# count() tallies every non-null answer, so counting the unfiltered column ranks
# states by number of respondents; keep only the positive answers first.
# NOTE(review): assumes the positive label in 'hadheartattack' is 'Yes' — confirm.
heart_attack_cases = data[data['hadheartattack'] == 'Yes']
top_10_states_with_heart_attacks = heart_attack_cases.groupby('state')['hadheartattack'].count().nlargest(10)
top_10_states_with_heart_attacks

In [None]:
px.bar(top_10_states_with_heart_attacks, x=top_10_states_with_heart_attacks.index, y=top_10_states_with_heart_attacks.values, template='plotly_dark')

In [None]:
# The 10 states with the fewest respondents who reported a heart attack.
# count() tallies every non-null answer, so counting the unfiltered column ranks
# states by number of respondents; keep only the positive answers first.
# NOTE(review): assumes the positive label in 'hadheartattack' is 'Yes' — confirm.
heart_attack_cases = data[data['hadheartattack'] == 'Yes']
least_10_states_with_heart_attacks = heart_attack_cases.groupby('state')['hadheartattack'].count().nsmallest(10)
least_10_states_with_heart_attacks

In [None]:
px.bar(least_10_states_with_heart_attacks, x=least_10_states_with_heart_attacks.index, y=least_10_states_with_heart_attacks.values, template='plotly_dark')

In [None]:
data['generalhealth'].value_counts()

In [None]:
px.pie(data, names='generalhealth', color_discrete_sequence=px.colors.sequential.Cividis)

In [None]:
# The 10 largest (state, general-health rating) groups by number of respondents.
# Naming the count column explicitly keeps reset_index() from clashing with the
# 'generalhealth' index level on pandas versions that name the counts after the column.
top_10_states_with_best_general_health = data.groupby('state')['generalhealth'].value_counts().nlargest(10).reset_index(name='count')
top_10_states_with_best_general_health

In [None]:
# The 10 smallest (state, general-health rating) groups by number of respondents.
# Naming the count column explicitly keeps reset_index() from clashing with the
# 'generalhealth' index level on pandas versions that name the counts after the column.
worst_10_states_with_worst_general_health = data.groupby('state')['generalhealth'].value_counts().nsmallest(10).reset_index(name='count')
worst_10_states_with_worst_general_health

#### From the previous analysis:
>>*  Washington is the state with the most cases of heart attacks, while Virgin Islands is the least.

>>* Washington is the best state in terms of public health, while District of Columbia is the worst.


In [None]:
# pip freeze > requirements.txt