In [24]:
from typing_extensions import dataclass_transform
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier





# Load the dataset
data= pd.read_csv('dataset.csv')

# Display number of rows and columns
num_rows, num_cols = data.shape
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(data.head())


Number of rows: 8763
Number of columns: 12

First few rows of the dataset:
   Age     Sex  Cholesterol Blood Pressure  Diabetes  Family History  Smoking  \
0   67    Male          208         158/88         0               0        1   
1   21    Male          389         165/93         1               1        1   
2   21  Female          324         174/99         1               0        0   
3   84    Male          383        163/100         1               1        1   
4   66    Male          318          91/88         1               1        1   

   Obesity  Alcohol Consumption  Physical Activity Days Per Week        BMI  \
0        0                    0                                0  31.251233   
1        1                    1                                1  27.194973   
2        0                    0                                4  28.176571   
3        0                    1                                3  36.464704   
4        1                    0            

In [2]:





# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Handle missing values
# No missing values found in this dataset, so no action needed

# Handle outliers
# We'll use z-score method to remove outliers for numerical features

# Function to remove outliers using z-score method
def remove_outliers_zscore(df, threshold=3):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include=['float64', 'int64'])))
    filtered_entries = (z_scores < threshold).all(axis=1)
    return df[filtered_entries]

# Remove outliers
data = remove_outliers_zscore(data)

#converting into numerical data
data['Sex'] = data['Sex'].map({'Male': 0, 'Female': 1})


data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True).astype(int)

data.drop(['Blood Pressure'], axis=1, inplace=True)
scaler = StandardScaler()
numerical_cols = ['Age', 'Cholesterol', 'BMI','Physical Activity Days Per Week','Systolic','Diastolic']  # Select numerical columns to normalize
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Print the preprocessed data
print("Preprocessed data:\n", data.head())


Missing values:
 Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Physical Activity Days Per Week    0
BMI                                0
Heart Attack Risk                  0
dtype: int64
Preprocessed data:
         Age  Sex  Cholesterol  Diabetes  Family History  Smoking  Obesity  \
0  0.625557    0    -0.641579         0               0        1        0   
1 -1.539322    0     1.596895         1               1        1        1   
2 -1.539322    1     0.793023         1               0        0        0   
3  1.425621    0     1.522691         1               1        1        0   
4  0.578495    0     0.718820         1               1        1        1   

   Alcohol Consumption  Physical Activity D

In [3]:


# Separate features (X) and target variable (y)
X = data.drop('Heart Attack Risk', axis=1)
y = data['Heart Attack Risk']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [25]:
## svm


svm_classifier = SVC()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
svm_classifier_scores = cross_val_score(svm_classifier, X_train_scaled, y_train, cv=kf, scoring='accuracy')
print(f'SVM Cross-Validation Scores: {svm_classifier_scores}')
print(f'SVM Mean Accuracy: {svm_classifier_scores.mean()}')

svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Initialize the GridSearchCV object
svm_grid_search = GridSearchCV(SVC(), svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
svm_grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
print(f'Best SVM Parameters: {svm_grid_search.best_params_}')
print(f'Best SVM Score: {svm_grid_search.best_score_}')

# Train the model with the best parameters
best_svm_classifier = svm_grid_search.best_estimator_
best_svm_classifier.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred_svm_best = best_svm_classifier.predict(X_test_scaled)

# Print classification report for SVM with the best parameters
print("Classification Report for SVM (Best Params):")
print(classification_report(y_test, y_pred_svm_best,zero_division=1))




SVM Cross-Validation Scores: [0.63694722 0.64550642 0.64336662 0.65049929 0.62910128]
SVM Mean Accuracy: 0.6410841654778888
Best SVM Parameters: {'C': 0.1, 'kernel': 'linear'}
Best SVM Score: 0.6417974322396576
Classification Report for SVM (Best Params):
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       1.00      0.00      0.00       628

    accuracy                           0.64      1753
   macro avg       0.82      0.50      0.39      1753
weighted avg       0.77      0.64      0.50      1753



In [18]:

## decision tree

dt_classifier = DecisionTreeClassifier()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
dt_classifier_scores = cross_val_score(dt_classifier, X_train_scaled, y_train, cv=kf, scoring='accuracy')
print(f'Decision Tree Cross-Validation Scores: {dt_classifier_scores}')
print(f'Decision Tree Mean Accuracy: {dt_classifier_scores.mean()}')

dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
dt_grid_search = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
dt_grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
print(f'Best Decision Tree Parameters: {dt_grid_search.best_params_}')
print(f'Best Decision Tree Score: {dt_grid_search.best_score_}')

# Train the model with the best parameters
best_dt_classifier = dt_grid_search.best_estimator_
best_dt_classifier.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred_dt_best = best_dt_classifier.predict(X_test_scaled)

# Print classification report for Decision Tree with the best parameters
print("Classification Report for Decision Tree (Best Params):")
print(classification_report(y_test, y_pred_dt_best))

Decision Tree Cross-Validation Scores: [0.55634807 0.5235378  0.56062767 0.53495007 0.53138374]
Decision Tree Mean Accuracy: 0.5413694721825962
Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Decision Tree Score: 0.6279600570613411
Classification Report for Decision Tree (Best Params):
              precision    recall  f1-score   support

           0       0.64      0.91      0.75      1125
           1       0.34      0.08      0.13       628

    accuracy                           0.62      1753
   macro avg       0.49      0.50      0.44      1753
weighted avg       0.53      0.62      0.53      1753



In [27]:
# Random Forest

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)


rf_classifier = RandomForestClassifier()
rf_classifier_scores = cross_val_score(rf_classifier, X_train_scaled, y_train, cv=kf, scoring='accuracy')
print(f'Random Forest Cross-Validation Scores: {rf_classifier_scores}')
print(f'Random Forest Mean Accuracy: {rf_classifier_scores.mean()}')


# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
}

# Initialize the GridSearchCV object
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
rf_grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
print(f'Best Random Forest Parameters: {rf_grid_search.best_params_}')
print(f'Best Random Forest Score: {rf_grid_search.best_score_}')

# Train the model with the best parameters
best_rf_classifier = rf_grid_search.best_estimator_
best_rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred_rf_best = best_rf_classifier.predict(X_test_scaled)

# Print classification report for Random Forest with the best parameters
print("Classification Report for Random Forest (Best Params):")
print(classification_report(y_test, y_pred_rf_best))

Random Forest Cross-Validation Scores: [0.62838802 0.62482168 0.62767475 0.64479315 0.61982882]
Random Forest Mean Accuracy: 0.6291012838801711
Best Random Forest Parameters: {'max_depth': 10, 'n_estimators': 200}
Best Random Forest Score: 0.6410841654778888
Classification Report for Random Forest (Best Params):
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1125
           1       0.40      0.00      0.01       628

    accuracy                           0.64      1753
   macro avg       0.52      0.50      0.39      1753
weighted avg       0.56      0.64      0.50      1753



In [15]:
## logistic regressinon


logistic_reg = LogisticRegression(max_iter=1000)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
logistic_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
logistic_reg_scores = cross_val_score(logistic_reg, X_train_scaled, y_train, cv=kf, scoring='accuracy')
print(f'Logistic Regression Cross-Validation Scores: {logistic_reg_scores}')
print(f'Logistic Regression Mean Accuracy: {logistic_reg_scores.mean()}')

# Define the parameter grid for Logistic Regression
logistic_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Initialize the GridSearchCV object
logistic_grid_search = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), logistic_param_grid, cv=5, scoring='accuracy')

# Fit the model
logistic_grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best score
print(f'Best Logistic Regression Parameters: {logistic_grid_search.best_params_}')
print(f'Best Logistic Regression Score: {logistic_grid_search.best_score_}')

# Train the model with the best parameters
best_logistic_reg = logistic_grid_search.best_estimator_
best_logistic_reg.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred_logistic_best = best_logistic_reg.predict(X_test_scaled)

# Print classification report for logistic regression with the best parameters
print("Classification Report for Logistic Regression (Best Params):")
print(classification_report(y_test, y_pred_logistic_best, zero_division=1))


Logistic Regression Cross-Validation Scores: [0.4914408  0.50927247 0.51141227 0.50927247 0.50285307]
Logistic Regression Mean Accuracy: 0.5048502139800285
Best Logistic Regression Parameters: {'C': 0.1, 'solver': 'newton-cg'}
Best Logistic Regression Score: 0.5052781740370899
Classification Report for Logistic Regression (Best Params):
              precision    recall  f1-score   support

           0       0.65      0.50      0.56      1125
           1       0.36      0.52      0.43       628

    accuracy                           0.50      1753
   macro avg       0.51      0.51      0.49      1753
weighted avg       0.55      0.50      0.51      1753

