In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats


# Path to the input CSV, relative to the notebook's working directory.
csv_file = Path('Resources') / 'framingham.csv'

# Load the CSV into the frame that the analysis cells below read from.
heart_analysis_df = pd.read_csv(csv_file)

# Bare last expression: the notebook renders the loaded frame as the cell output.
heart_analysis_df

## Male vs Female Population

### Analysis:

In [None]:
# Count rows whose 'male' column equals 1 and rows where it equals 0.
sex_flag = heart_analysis_df['male']
male_count = sex_flag.eq(1).sum()
female_count = sex_flag.eq(0).sum()
print(
    f'males: {male_count}',
    f'females: {female_count}',
    f'total population: {male_count+female_count}',
    sep='\n',
)

# Two-row summary frame that feeds the pie chart.
labels = ['female', 'male']
sizes = [female_count, male_count]
sex_population_df = pd.DataFrame({'Sex': labels, 'Count': sizes})

# Pie chart of the two counts; the title is set on the axes pandas returns.
ax = sex_population_df.plot.pie(
    y='Count',
    labels=sex_population_df['Sex'],
    autopct='%1.1f%%',
    legend=False,
)
ax.set_title('Distribution of Male and Female')
plt.show()

In [None]:
# Count rows whose 'diabetes' column equals 1 and rows where it equals 0.
diabetes_flag = heart_analysis_df['diabetes']
diabetes_count = diabetes_flag.eq(1).sum()
non_diabetes_count = diabetes_flag.eq(0).sum()
print(
    f'Diabetes: {diabetes_count}',
    f'Non-diabetic: {non_diabetes_count}',
    f'Total population: {diabetes_count+non_diabetes_count}',
    sep='\n',
)

# Two-row summary frame that feeds the pie chart.
labels = ['Diabetes', 'Non-diabetic']
sizes = [diabetes_count, non_diabetes_count]
diabetes_population_df = pd.DataFrame({'diabetes': labels, 'Count': sizes})

# Pie chart of the two counts, with the first wedge starting at 90 degrees.
ax = diabetes_population_df.plot.pie(
    y='Count',
    labels=diabetes_population_df['diabetes'],
    autopct='%1.1f%%',
    legend=False,
    startangle=90,
)
ax.set_title('Distribution of Diabetes and Non-diabetic')
plt.show()

In [None]:
# Cross-tabulate the 'diabetes' values (rows) against the 'TenYearCHD' values (columns).
contingency_table = pd.crosstab(
    index=heart_analysis_df['diabetes'],
    columns=heart_analysis_df['TenYearCHD'],
)

# Chi-square test of independence on the observed counts (scipy defaults are used).
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Report every component of the test result, one per line.
print(
    f"Chi-square statistic: {chi2}",
    f"P-value: {p_value}",
    f"Degrees of freedom: {dof}",
    f"Expected frequencies:\n{expected}",
    sep="\n",
)

In [None]:
# Count rows whose 'currentSmoker' column equals 1 and rows where it equals 0,
# across the whole dataset.
smoker_count = (heart_analysis_df['currentSmoker'] == 1).sum()
non_smoker_count = (heart_analysis_df['currentSmoker'] == 0).sum()

# The printed labels name the quantity that is counted (smokers), matching the chart below.
print(f'Smokers: {smoker_count}')
print(f'Non-smokers: {non_smoker_count}')
print(f'Total population: {smoker_count+non_smoker_count}')

# Two-row summary frame that feeds the pie chart.
labels = ['Smoker', 'Non-smoker']
sizes = [smoker_count, non_smoker_count]
smoker_population_df = pd.DataFrame({'currentSmoker': labels, 'Count': sizes})
smoker_population_df.plot.pie(y='Count', labels=smoker_population_df['currentSmoker'], autopct='%1.1f%%', legend=False)
plt.title('Distribution of Smoker and Non-Smoker')
plt.show()

In [None]:
# Rows whose 'male' column equals 1; male_df is also read by a later cell.
male_df = heart_analysis_df[heart_analysis_df['male'] == 1]

# Smoker / non-smoker counts within the male subset only.
smoker_count = (male_df['currentSmoker'] == 1).sum()
non_smoker_count = (male_df['currentSmoker'] == 0).sum()

# The printed labels name the quantity that is counted (male smokers); the total is the
# male subtotal, not the whole dataset.
print(f'Male smokers: {smoker_count}')
print(f'Male non-smokers: {non_smoker_count}')
print(f'Total male population: {smoker_count+non_smoker_count}')

# Two-row summary frame that feeds the pie chart.
labels = ['Smoker', 'Non-smoker']
sizes = [smoker_count, non_smoker_count]
smoker_population_df = pd.DataFrame({'currentSmoker': labels, 'Count': sizes})
smoker_population_df.plot.pie(y='Count', labels=smoker_population_df['currentSmoker'], autopct='%1.1f%%', legend=False)
plt.title('Male Distribution of Smoker and Non-Smoker')
plt.show()

In [None]:
# Rows whose 'male' column equals 0; female_df is also read by a later cell.
female_df = heart_analysis_df[heart_analysis_df['male'] == 0]

# Smoker / non-smoker counts within the female subset only.
smoker_count = (female_df['currentSmoker'] == 1).sum()
non_smoker_count = (female_df['currentSmoker'] == 0).sum()

# The printed labels name the quantity that is counted (female smokers); the total is the
# female subtotal, not the whole dataset.
print(f'Female smokers: {smoker_count}')
print(f'Female non-smokers: {non_smoker_count}')
print(f'Total female population: {smoker_count+non_smoker_count}')

# Two-row summary frame that feeds the pie chart.
labels = ['Smoker', 'Non-smoker']
sizes = [smoker_count, non_smoker_count]
smoker_population_df = pd.DataFrame({'currentSmoker': labels, 'Count': sizes})
smoker_population_df.plot.pie(y='Count', labels=smoker_population_df['currentSmoker'], autopct='%1.1f%%', legend=False)
plt.title('Female Distribution of Smoker and Non-Smoker')
plt.show()

In [None]:

# Build the display labels on a standalone Series. Assigning the converted values back
# through an alias of heart_analysis_df would overwrite the shared frame's 'education'
# column with strings for every other cell, and make this cell non-idempotent.
education_labels = (
    heart_analysis_df['education']
    .astype(str)                         # missing values become the string 'nan'
    .replace('nan', 'Not Specified')     # give the missing bucket a readable label
)

# Number of individuals per education label, ordered by label.
education_counts_df = education_labels.value_counts().sort_index()

education_counts_df.plot(kind='bar')
plt.title('Education Level Distribution')
plt.xlabel('Education Level')
plt.ylabel('Number of Individuals')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# Histogram of the 'age' column split into ten bins.
age_histogram = heart_analysis_df

fig, ax = plt.subplots()
ax.hist(age_histogram['age'], bins=10, edgecolor='black')
ax.set_title('Age Histogram')
ax.set_xlabel('Age')
ax.set_ylabel('Number of people')
# Horizontal grid lines only, drawn semi-transparent.
ax.grid(axis='y', alpha=0.75)

plt.show()

In [None]:
# TenYearCHD columns of the per-sex subsets built in the smoker cells above.
male_chd = male_df['TenYearCHD']
female_chd = female_df['TenYearCHD']

# Rows are filtered to TenYearCHD == 1 before summing, so each total equals the
# number of such rows.
number_of_male_TYCHD = male_chd[male_chd == 1].sum()
number_of_female_TYCHD = female_chd[female_chd == 1].sum()

# Share of each subset with TenYearCHD == 1, as a percentage of its non-null rows.
male_percentage = round(number_of_male_TYCHD / male_chd.count() * 100, 2)
female_percentage = round(number_of_female_TYCHD / female_chd.count() * 100, 2)

print(
    f'Male % of heart disease: {male_percentage}%',
    f'Female % of heart disease: {female_percentage}%',
    sep='\n',
)

## Atul's Section

In [None]:
# Placeholder cell: prints a marker line for this teammate's section.
print("This is your teammates Atul's Section")

## Ezrelle's Section

In [None]:
# Demo plot: ten random points plus a red '+' marker at a fixed location.
# A seeded generator makes the figure identical on every Restart & Run All;
# numpy (np) is imported in the notebook's top import cell.
rng = np.random.default_rng(42)
data = {'x': rng.random(10),
        'y': rng.random(10)}
df = pd.DataFrame(data)

# Plot the data
plt.scatter(df['x'], df['y'])

# Coordinates where the plus sign is placed
plus_x, plus_y = 0.5, 0.5 # Adjust these values as needed

# Plotting the plus sign
plt.plot(plus_x, plus_y, marker='+', markersize=20, color='red')

# Display the plot
plt.show()

# BPMeds Analysis
Analysis here is based on certain key variables such as age, blood pressure medication usage (BPMeds), and the incidence of Ten Year CHD.

### Analysis
1. Approximately 15% of the Framingham Dataset has the incidence of Ten Year CHD (Coronary Heart Disease)
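2. Among individuals who reported using Blood Pressure Medications, approximately 33% had the incidence of Ten Year CHD (this percentage is computed within the BPMeds group, not within the CHD group)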

### Age - BPMeds - TenYearCHD Analysis
1. The majority of individuals in this dataset fall within the age range of 30 to 70 years.
2. The distribution of age in the dataset is approximately normal, with a mean age of approximately 49 years.

In [None]:
# Keep only the BPMeds and TenYearCHD columns and discard any row with a missing
# value in either of them.
bpmeds_df = heart_analysis_df.loc[:, ["BPMeds", "TenYearCHD"]].dropna()

# Bare last expression: the notebook renders the filtered frame as the cell output.
bpmeds_df


In [None]:
# Share of rows in bpmeds_df with TenYearCHD == 1, as a percentage.
chd_column = bpmeds_df["TenYearCHD"]
chd_percentage = round(chd_column[chd_column == 1].count() / chd_column.count() * 100, 2)
print(f"Approximately {chd_percentage}% of the dataset has the incidence of Ten Year CHD")

# Row masks shared by the six counts below.
off_meds = bpmeds_df["BPMeds"] == 0
on_meds = bpmeds_df["BPMeds"] == 1
without_chd = bpmeds_df["TenYearCHD"] == 0
with_chd = bpmeds_df["TenYearCHD"] == 1

# Counts for rows with BPMeds == 0: group size, then split by TenYearCHD.
no_bpmeds_count = bpmeds_df.loc[off_meds, "BPMeds"].count()
no_bpmeds_no_tychd_sum = bpmeds_df.loc[off_meds & without_chd, "BPMeds"].count()
no_bpmeds_with_tychd_sum = bpmeds_df.loc[off_meds & with_chd, "BPMeds"].count()

# Counts for rows with BPMeds == 1: group size, then split by TenYearCHD.
bpmeds_count = bpmeds_df.loc[on_meds, "BPMeds"].count()
bpmeds_no_tychd_sum = bpmeds_df.loc[on_meds & without_chd, "BPMeds"].count()
bpmeds_with_tychd_sum = bpmeds_df.loc[on_meds & with_chd, "BPMeds"].count()

# Each percentage is taken within its own BPMeds group (the denominator is the group size).
for description, part, group_size in (
    ("No BPMeds and no CHD in Ten Years", no_bpmeds_no_tychd_sum, no_bpmeds_count),
    ("No BPMeds and with CHD in Ten Years", no_bpmeds_with_tychd_sum, no_bpmeds_count),
    ("BPMeds and no CHD in Ten Years", bpmeds_no_tychd_sum, bpmeds_count),
    ("BPMeds and with CHD in Ten Years", bpmeds_with_tychd_sum, bpmeds_count),
):
    print(f"{description} Percentage: {round(part / group_size * 100 , 2)}%")


In [None]:
# Number of rows for every (BPMeds, TenYearCHD) pair: one row per BPMeds value,
# one column per TenYearCHD value.
bpmeds_chd_counts = bpmeds_df.groupby(['BPMeds', 'TenYearCHD']).size().unstack()
print(bpmeds_chd_counts)

# Create the figure and axes once and pass the axes to pandas. Calling plt.figure()
# and then DataFrame.plot() without ax= makes pandas open a second figure, which
# leaves an empty figure in the output and ignores the requested figsize.
fig, ax = plt.subplots(figsize=(6, 4))
bpmeds_chd_counts.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Relationship between Blood Pressure Medication Usage and Ten Year CHD')
ax.set_xlabel('BPMeds')
ax.set_ylabel('Number of Individuals')
# Replace the raw BPMeds values on the x axis with readable names (two bars expected).
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Medication', 'Medication'])
ax.legend(title='Ten Year CHD', loc='upper right')
plt.show()

In [None]:
# Age - BPMeds - TenYearCHD
bpmeds_age_chd_df = heart_analysis_df[["BPMeds", "TenYearCHD", "age"]]

#Drop null values
bpmeds_age_chd_df = bpmeds_age_chd_df.dropna()
bpmeds_age_chd_df

In [None]:
# Mean of the 'age' column over the rows kept in bpmeds_age_chd_df.
average_age = bpmeds_age_chd_df["age"].mean()
print(average_age)

# Scatter of age against TenYearCHD, one series (and legend entry) per BPMeds value.
fig, ax = plt.subplots(figsize=(10, 6))
for meds_value, group_rows in bpmeds_age_chd_df.groupby('BPMeds'):
    ax.scatter(group_rows['age'], group_rows['TenYearCHD'], label=f'BPMeds: {meds_value!s}')
ax.set_title('Age vs. Ten Year CHD by Blood Pressure Medication Usage')
ax.set_xlabel('Age')
ax.set_ylabel('Ten Year CHD')
ax.legend()
plt.show()

## Tianyue's Section

In [None]:
# Placeholder cell: prints a marker line for this teammate's section.
print("This is your teammates Tianyue's Section")