In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats

# warnings.filterwarnings('ignore')

In [2]:
# Load the dataset.
# The location can be overridden with the HEART_DATA_PATH environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original local path is used unchanged.
DATA_PATH = os.environ.get(
    "HEART_DATA_PATH",
    "/Users/phuonguyennguyen/Documents/Programming languages/EDA_heart/data/Heart_disease_cleveland_new.csv",
)
df = pd.read_csv(DATA_PATH)

In [6]:
# Preview the first rows to sanity-check the columns and their encodings
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
1,67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
2,67,1,3,120,229,0,2,129,1,2.6,1,2,3,1
3,37,1,2,130,250,0,0,187,0,3.5,2,0,1,0
4,41,0,1,130,204,0,2,172,0,1.4,0,0,1,0


In [7]:
# Count nulls per column and report only the columns that actually have some
missing_data = df.isnull().sum()
columns_with_missing = missing_data[missing_data > 0]
if columns_with_missing.empty:
    print("No missing values found!")
else:
    print(columns_with_missing)

No missing values found!


In [8]:
# Count rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()
if duplicates == 0:
    print("No duplicated values found!")
else:
    print(f"{duplicates} duplicate rows found")

No duplicated values found!


In [9]:
# Understand the dataset: per-column non-null counts and dtypes, then dimensions
df.info()
# A bare `df.shape` in the middle of a cell is evaluated and discarded
# (only the last expression of a cell is displayed), so print it explicitly.
print(f"Shape: {df.shape}")
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

Summary of the dataset!
The dataset contains records for 303 individuals across 14 variables. There are no missing values and no duplicated rows. Meaning of each column name:
Age: Patients Age in years (Numeric)
Sex: Gender (Male : 1; Female : 0) (Nominal)
cp: Type of chest pain experienced by the patient, grouped into 4 categories:
0 typical angina, 1 atypical angina, 2 non- anginal pain, 3 asymptomatic (Nominal)
trestbps: Patient's resting blood pressure in mm Hg (Numeric)
chol: Serum cholesterol in mg/dl (Numeric)
fbs: Whether fasting blood sugar is > 120 mg/dl (1 = true, 0 = false) (Nominal)
restecg: Result of electrocardiogram while at rest are represented in 3 distinct values
0: Normal; 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of >
0.05 mV); 2: showing probable or definite left ventricular hypertrophy by Estes' criteria (Nominal)
thalach: Maximum heart rate achieved (Numeric)
exang: Angina induced by exercise 0 depicting NO 1 depicting Yes (Nominal)
oldpeak: Exercise-induced ST depression relative to rest (Numeric)
slope: ST segment measured in terms of slope during peak exercise
0: up sloping; 1: flat; 2: down sloping(Nominal)
ca: The number of major vessels (0–3)(nominal)
thal: A blood disorder called thalassemia
0: NULL; 1: normal blood flow; 2: fixed defect (no blood flow in some part of the heart); 3: reversible defect (blood flow is observed but it is not normal) (Nominal)
target: The target variable to predict: 1 means the patient has heart disease and 0 means the patient is normal.

Insights Found:     
Some features, such as thal, cp, slope, restecg and ca, are categorical but encoded as numbers.

In [22]:
#T-test and chi-squared test
def perform_statistical_tests(data=None, alpha=0.05):
    """Print a t-test on age and a chi-square test on sex, each against `target`.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with `age`, `sex` and `target` columns, where `target` is 0
        (no disease) or 1 (disease). Defaults to the notebook-level `df`.
    alpha : float, optional
        Threshold a p-value must fall below to be reported as significant.
        Defaults to 0.05.

    Returns
    -------
    None
        All results are printed to stdout.
    """
    if data is None:
        # Keep the original zero-argument call working on the notebook's frame.
        data = df

    # Test 1: Age difference between groups
    no_disease = data.loc[data['target'] == 0, 'age']
    disease = data.loc[data['target'] == 1, 'age']

    # ttest_ind with default arguments is the independent two-sample t-test
    # that assumes equal variances; only the p-value is reported.
    _, age_p_value = stats.ttest_ind(no_disease, disease)
    print("Age difference t-test:")
    print(f"  Mean age (No disease): {no_disease.mean():.1f} years")
    print(f"  Mean age (Disease): {disease.mean():.1f} years")
    print(f"  p-value: {age_p_value:.4f}")
    print(f"  Significant: {'Yes' if age_p_value < alpha else 'No'}")

    # Test 2: Gender association
    gender_crosstab = pd.crosstab(data['sex'], data['target'])
    # A separate name for this p-value avoids silently overwriting the t-test's.
    chi2, gender_p_value, _, _ = stats.chi2_contingency(gender_crosstab)
    print("\nGender-Disease association (Chi-square test):")
    print(f"  Chi-square statistic: {chi2:.4f}")
    print(f"  p-value: {gender_p_value:.4f}")
    print(f"  Significant: {'Yes' if gender_p_value < alpha else 'No'}")

# `stats` is imported with the other dependencies in the notebook's first cell
perform_statistical_tests()

Age difference t-test:
  Mean age (No disease): 52.6 years
  Mean age (Disease): 56.6 years
  p-value: 0.0001
  Significant: Yes

Gender-Disease association (Chi-square test):
  Chi-square statistic: 22.0426
  p-value: 0.0000
  Significant: Yes


In [24]:
#key takeaways and clinical recommendations
def generate_insights(data=None):
    """Print disease rates by sex, mean age by disease status, and recommendations.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with `sex` (1 = male, 0 = female), `age` and `target`
        (1 = disease, 0 = no disease) columns. Defaults to the
        notebook-level `df`.

    Returns
    -------
    None
        The summary is printed to stdout.
    """
    if data is None:
        # Keep the original zero-argument call working on the notebook's frame.
        data = df

    # `target` is 0/1, so its mean within a group is that group's disease rate.
    male_disease_rate = data.loc[data['sex'] == 1, 'target'].mean() * 100
    female_disease_rate = data.loc[data['sex'] == 0, 'target'].mean() * 100

    avg_age_disease = data.loc[data['target'] == 1, 'age'].mean()
    avg_age_no_disease = data.loc[data['target'] == 0, 'age'].mean()

    print("DEMOGRAPHIC RISK FACTORS:")
    print(f"• Male heart disease rate: {male_disease_rate:.1f}%")
    print(f"• Female heart disease rate: {female_disease_rate:.1f}%")
    print(f"• Average age with disease: {avg_age_disease:.1f} years")
    print(f"• Average age without disease: {avg_age_no_disease:.1f} years")

    print("\nCLINICAL RECOMMENDATIONS:")
    # Derive the sex-based recommendation from the rates computed above instead
    # of always naming males, so the text cannot contradict the numbers.
    if male_disease_rate > female_disease_rate:
        print("• Focus screening on males (higher risk)")
    elif female_disease_rate > male_disease_rate:
        print("• Focus screening on females (higher risk)")
    else:
        # Also reached when a rate is NaN because one group has no rows.
        print("• Screen males and females equally (similar risk)")
    print("• Implement age-based screening protocols")
    print("• Monitor chest pain symptoms carefully")
    print("• Regular cardiovascular health assessments")

# Print the demographic summary and recommendations for the loaded dataset
generate_insights()

DEMOGRAPHIC RISK FACTORS:
• Male heart disease rate: 55.3%
• Female heart disease rate: 25.8%
• Average age with disease: 56.6 years
• Average age without disease: 52.6 years

CLINICAL RECOMMENDATIONS:
• Focus screening on males (higher risk)
• Implement age-based screening protocols
• Monitor chest pain symptoms carefully
• Regular cardiovascular health assessments
