In [34]:
import pandas as pd
from scipy.stats import t

In [35]:
# Load the CSV at 'output/cleaned_data.csv' (path is relative to the working
# directory) into the DataFrame `df` used by the cells below.
cleaned_data_path = 'output/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

# The cell's value is a 3-tuple: the first rows, the summary statistics of the
# numeric columns, and the column dtypes. It is shown as one plain-text tuple.
df.head(), df.describe(), df.dtypes

(   age     sex    dataset               cp  trestbps   chol    fbs  \
 0   63    Male  Cleveland   typical angina     145.0  233.0   True   
 1   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
 2   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
 3   37    Male  Cleveland      non-anginal     130.0  250.0  False   
 4   41  Female  Cleveland  atypical angina     130.0  204.0  False   
 
           restecg  thalch  exang  oldpeak  num  
 0  lv hypertrophy   150.0  False      2.3    0  
 1  lv hypertrophy   108.0   True      1.5    2  
 2  lv hypertrophy   129.0   True      2.6    1  
 3          normal   187.0  False      3.5    0  
 4  lv hypertrophy   172.0  False      1.4    0  ,
               age   trestbps        chol      thalch     oldpeak         num
 count  694.000000  694.00000  694.000000  694.000000  694.000000  694.000000
 mean    52.907781  132.35879  241.743516  141.347262    0.880403    0.845821
 std      9.488513   17.20061   46.65

In [36]:
# Column means of age, chol and trestbps, kept as the point estimates for
# those three variables; the bare name on the last line displays the result.
point_estimates = df[['age', 'chol', 'trestbps']].mean()
point_estimates

age          52.907781
chol        241.743516
trestbps    132.358790
dtype: float64

In [37]:
def calculate_confidence_intervals(data, variables, confidence_levels=None):
    """Compute two-sided Student-t confidence intervals for column means.

    For each requested confidence level the interval is
    ``mean ± t * (std / sqrt(n))``, where ``std`` is the sample standard
    deviation as computed by pandas, ``n`` is the number of non-missing
    observations in that column, and ``t`` is the Student-t quantile with
    ``n - 1`` degrees of freedom.

    The sample size is taken per column from the non-missing count rather
    than from ``len(data)``: pandas' ``mean`` and ``std`` skip missing values
    by default, so using the raw row count would understate the standard
    error (and overstate the degrees of freedom) for any column that
    contains NaN. When no values are missing the result is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the columns to analyse.
    variables : list of str
        Names of the columns in ``data`` to compute intervals for.
    confidence_levels : iterable of float, optional
        Confidence levels expressed as fractions strictly between 0 and 1.
        Defaults to ``[0.9, 0.95, 0.99]``.

    Returns
    -------
    dict
        Maps each confidence level to a ``(lower, upper)`` tuple. With a list
        of column names, ``lower`` and ``upper`` are Series indexed by
        variable name.

    Raises
    ------
    ValueError
        If a confidence level is not strictly between 0 and 1.
    """
    if confidence_levels is None:
        confidence_levels = [0.9, 0.95, 0.99]
    # Materialise once so a one-shot iterable survives validation + the loop.
    confidence_levels = list(confidence_levels)

    for level in confidence_levels:
        if not 0 < level < 1:
            raise ValueError(
                f"confidence level must be strictly between 0 and 1, got {level!r}"
            )

    selected = data[variables]
    # Per-column number of non-missing observations (matches what mean/std use).
    sample_sizes = selected.count()
    # Named explicitly to avoid shadowing the notebook's DataFrame called `df`.
    degrees_of_freedom = sample_sizes - 1
    standard_errors = selected.std() / sample_sizes ** 0.5
    point_estimates = selected.mean()

    ci_dict = {}

    for level in confidence_levels:
        # Two-sided interval: put half of (1 - level) in each tail.
        t_values = t.ppf(1 - (1 - level) / 2, degrees_of_freedom)
        margin_of_error = standard_errors * t_values

        ci_lower = point_estimates - margin_of_error
        ci_upper = point_estimates + margin_of_error
        ci_dict[level] = (ci_lower, ci_upper)

    return ci_dict


# Columns for which confidence intervals of the mean are reported.
variables = ['age', 'chol', 'trestbps']
confidence_intervals = calculate_confidence_intervals(df, variables)

# One header line per confidence level, then one bracketed interval per column.
for conf_level, intervals in confidence_intervals.items():
    # Each entry is a (lower bounds, upper bounds) pair indexed by column name.
    lower_bounds, upper_bounds = intervals
    print(f"Confidence Level {conf_level * 100}%:")
    for var in variables:
        label = var.capitalize()
        print(f"  {label}: [{lower_bounds[var]:.2f}, {upper_bounds[var]:.2f}]")

Confidence Level 90.0%:
  Age: [52.31, 53.50]
  Chol: [238.83, 244.66]
  Trestbps: [131.28, 133.43]
Confidence Level 95.0%:
  Age: [52.20, 53.61]
  Chol: [238.27, 245.22]
  Trestbps: [131.08, 133.64]
Confidence Level 99.0%:
  Age: [51.98, 53.84]
  Chol: [237.17, 246.32]
  Trestbps: [130.67, 134.05]
