## Purpose of This Notebook  

This notebook is mainly for **testing and experiments**.  
Here I tried out different ideas and steps during the project to see what works best.  
It’s not a final analysis, but more of a **scratchpad** that shows some of the process behind the work.


In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import shapiro

from src.overview_functions import convert_to_string_to_lower

from scipy.stats import chi2_contingency

In [3]:
data = pd.read_csv('../data/cleaned_data/dataset2_cleaned.csv')
data

Unnamed: 0,age,gender,cholesterol,pressure_high,heart_rate,smoking,alcohol_intake,exercise_hours,family_history,diabetes,obesity,stress_level,blood_sugar,exercise_induced_angina,chest_pain_type,heart_disease
0,75,0,228,119,66,1,2,1,0,0,1,8,119,1,atypical angina,1
1,48,1,204,165,62,1,0,5,0,0,0,9,70,1,typical angina,0
2,53,1,234,91,67,0,2,3,1,0,1,5,196,1,atypical angina,1
3,69,0,192,90,72,1,0,4,0,1,0,7,107,1,non-anginal pain,0
4,62,0,172,163,93,0,0,6,0,1,0,2,183,1,asymptomatic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,56,0,269,111,86,0,2,5,0,1,1,10,120,0,non-anginal pain,1
996,78,0,334,145,76,0,0,6,0,0,0,10,196,1,typical angina,1
997,79,1,151,179,81,0,1,4,1,0,1,8,189,1,asymptomatic,0
998,60,0,326,151,68,2,0,8,1,1,0,5,174,1,atypical angina,1


In [9]:
data.smoking.value_counts()

smoking
0    338
1    336
2    326
Name: count, dtype: int64

In [5]:
data_columns = data.columns.drop('heart_disease')
data_columns

Index(['age', 'gender', 'cholesterol', 'pressure_high', 'heart_rate',
       'smoking', 'alcohol_intake', 'exercise_hours', 'family_history',
       'diabetes', 'obesity', 'stress_level', 'blood_sugar',
       'exercise_induced_angina', 'chest_pain_type'],
      dtype='object')

In [6]:
def multi_chi_square(df, column_name):
    contingency_table = pd.crosstab(df[column_name], df['heart_disease'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    return {
        "Feature": column_name,
        "Chi-square": round(chi2, 3),
        "Degrees of freedom": dof,
        "p-value": round(p, 3)
    }

categorical_features = data.columns.drop(['cholesterol', 'pressure_high', 'heart_rate', 'blood_sugar', 'heart_disease'])


results = [multi_chi_square(data, col) for col in categorical_features]
df_results = pd.DataFrame(results)
print(df_results)

                    Feature  Chi-square  Degrees of freedom  p-value
0                       age     592.649                  54    0.000
1                    gender       0.748                   1    0.387
2                   smoking       8.255                   2    0.016
3            alcohol_intake       4.090                   2    0.129
4            exercise_hours      13.631                   9    0.136
5            family_history       0.797                   1    0.372
6                  diabetes       0.210                   1    0.647
7                   obesity       0.283                   1    0.595
8              stress_level       6.945                   9    0.643
9   exercise_induced_angina       0.004                   1    0.951
10          chest_pain_type       2.428                   3    0.488


In [7]:
data1 = pd.read_csv('../data/cleaned_data/dataset1_cleaned.csv')
data1

Unnamed: 0,age,gender,heart_rate,pressure_high,pressure_low,glucose,kcm,troponin,heart_disease
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1311,44,1,94,122,67,204.0,1.63,0.006,0
1312,66,1,84,125,55,149.0,1.33,0.172,1
1313,45,1,85,168,104,96.0,1.24,4.250,1
1314,54,1,58,117,68,443.0,5.80,0.359,1


In [8]:
categorical_features = data1.columns.drop(['heart_rate', 'pressure_high', 'pressure_low', 'glucose', 'kcm', 'troponin', 'heart_disease'])


results = [multi_chi_square(data, col) for col in categorical_features]
df_results = pd.DataFrame(results)
print(df_results)

  Feature  Chi-square  Degrees of freedom  p-value
0     age     592.649                  54    0.000
1  gender       0.748                   1    0.387
