In [1]:
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
# Colab-only: attach Google Drive to the runtime filesystem at /content/drive
# so files stored on Drive can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the Excel workbook under the /content/drive mount point.
file_path = "/content/drive/MyDrive/Colab Notebooks/heart.xlsx"
# Open the workbook once; individual sheets are read from this handle.
xls = pd.ExcelFile(file_path)

In [4]:
# Parse the 'heart' worksheet of the already-opened workbook into a DataFrame.
df = xls.parse(sheet_name='heart')
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


(a) Chi-square Test: Difference in Heart Disease Presence between Males and Females

In [5]:
# Cross-tabulate sex (rows) against target (columns) to get observed counts.
contingency_table = pd.crosstab(index=df['sex'], columns=df['target'])

# Chi-square test of independence on the observed counts.
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

# Report the test statistic and p-value, one item per line.
print(
    "\nChi-square Test Results:",
    f"Chi-square Statistic: {chi2:.4f}",
    f"P-value: {p:.6f}",
    sep="\n",
)



Chi-square Test Results:
Chi-square Statistic: 22.7172
P-value: 0.000002


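Conclusion: With a chi-square statistic of 22.72 and p ≈ 0.000002 (well below 0.05), we reject the null hypothesis that heart disease is independent of sex; heart disease occurrence differs significantly between male and female patients.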

# **(b) Normalization and Encoding of Features**

In [6]:
# Group the predictor columns by measurement type; each group gets its own treatment.
continuous_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_binary_features = ["sex", "fbs", "exang"]
categorical_nominal_features = ["restecg", "thal"]
categorical_ordinal_features = ["cp", "slope", "ca"]

# Make the cell safe to re-run: while `df` still holds the untransformed data
# (the nominal columns are present) take a fresh snapshot of it. On a re-run
# `df` is already encoded, so the earlier snapshot is reused instead of failing
# with a KeyError on the missing 'restecg'/'thal' columns.
if set(categorical_nominal_features).issubset(df.columns):
    df_raw = df.copy()

# Always start from the untransformed snapshot, never from a previous result.
df = df_raw.copy()

# Standardize the continuous features (z-score: zero mean, unit variance).
# NOTE(review): the scaler is fit on every row. If a train/test split follows,
# fitting on the training portion only would avoid leaking test-set statistics
# into the model - confirm against the modelling cell.
scaler = StandardScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])

# One-hot encode the nominal features; drop_first=True drops the first level of
# each so the indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=categorical_nominal_features, drop_first=True)

# Binary and ordinal categorical features are left as their integer codes.

# Show the first rows of the transformed dataset.
print("Dataset after Normalization and Encoding:")
print(df.head())

Dataset after Normalization and Encoding:
        age  sex  cp  trestbps      chol  fbs   thalach  exang   oldpeak  \
0  0.952197    1   3  0.763956 -0.256334    1  0.015443      0  1.087338   
1 -1.915313    1   2 -0.092738  0.072199    0  1.633471      0  2.122573   
2 -1.474158    0   1 -0.092738 -0.816773    0  0.977514      0  0.310912   
3  0.180175    1   1 -0.663867 -0.198357    0  1.239897      0 -0.206705   
4  0.290464    0   0 -0.663867  2.082050    0  0.583939      1 -0.379244   

   slope  ca  target  restecg_1  restecg_2  thal_1  thal_2  thal_3  
0      0   0       1      False      False    True   False   False  
1      0   0       1       True      False   False    True   False  
2      2   0       1      False      False   False    True   False  
3      2   0       1       True      False   False    True   False  
4      2   0       1       True      False   False    True   False  


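Standardization applied: the continuous features (age, trestbps, chol, thalach, oldpeak) now have zero mean and unit variance, so they contribute on a comparable scale to the model. The nominal features (restecg, thal) are one-hot encoded with the first level dropped, while the binary and ordinal features are left unchanged.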

### (c) Logistic Regression Model ###
# Selecting features and target variable