In [None]:
# age: Age of the patient in years.
# sex: Sex of the patient. (1 = male, 0 = female)
# cp (chest pain type):
# 0: Typical angina
# 1: Atypical angina
# 2: Non-anginal pain
# 3: Asymptomatic
# trestbps (resting blood pressure): Resting blood pressure in mm Hg on admission to the hospital.
# chol (serum cholesterol): Serum cholesterol in mg/dl.
# fbs (fasting blood sugar): Fasting blood sugar > 120 mg/dl. (1 = true, 0 = false)
# restecg (resting electrocardiographic results):
# 0: Normal
# 1: Having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# 2: Showing probable or definite left ventricular hypertrophy by Estes' criteria
# thalach (maximum heart rate achieved): Maximum heart rate achieved during the test.
# exang (exercise-induced angina): Exercise-induced angina. (1 = yes, 0 = no)
# oldpeak : ST depression induced by exercise relative to rest.
# slope (the slope of the peak exercise ST segment):
# 0: Upsloping
# 1: Flat
# 2: Downsloping
# ca (number of major vessels colored by fluoroscopy): Number of major vessels (0-3) colored by fluoroscopy.
# thal (thalassemia):
# 1: Normal
# 2: Fixed defect
# 3: Reversible defect

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the data
data = pd.read_csv('heart.csv')

# Basic Information
print("Basic info")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
data.info()
# Summary statistics (count, mean, std, quartiles, ...) for each column.
print(data.describe())



In [None]:

# Missing Values
# Count the null entries in each column of the frame.
print("Missing values")
null_counts = data.isna().sum()
print(null_counts)



In [None]:
# Univariate Analysis - Categorical Variables
# Draws one count plot per discrete-coded column (the target label included).
print("Univariate analysis categorical variables")
# This list is reused by the later cells of the notebook.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
for col in categorical_columns:
    # Explicit figure/axes so each plot is self-contained and can be released.
    fig, ax = plt.subplots()
    sns.countplot(x=col, data=data, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()
    # Free the figure; otherwise figures pile up when run outside inline mode.
    plt.close(fig)


In [None]:

# Univariate Analysis - Numerical Variables
# For each continuous column: a histogram with a KDE overlay, then a box plot.
print("Univariate analysis numerical variables")
# This list is reused by the later cells of the notebook.
numerical_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for col in numerical_columns:
    # Histogram + kernel density estimate of the column's distribution.
    fig, ax = plt.subplots()
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()
    plt.close(fig)

    # Box plot of the same column to expose spread and extreme values.
    fig, ax = plt.subplots()
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()
    plt.close(fig)


In [None]:

# Bivariate Analysis - Categorical vs Target
# Count plot of every categorical feature, split by the target class.
print("Bivariable analysis categorical variables vs target")
# Exclude the target by name rather than by position, so the loop stays
# correct even if the order of categorical_columns changes.
categorical_features = [c for c in categorical_columns if c != 'target']
for col in categorical_features:
    fig, ax = plt.subplots()
    sns.countplot(x=col, hue='target', data=data, ax=ax)
    ax.set_title(f'{col} vs Target')
    plt.show()
    # Free the figure once it has been rendered.
    plt.close(fig)


In [None]:

# Bivariate Analysis - Numerical vs Target
# Each numeric feature is compared across the target classes twice:
# first as a box plot, then as a violin plot.
print("Bivariable analysis numerical variables vs target")
for feature in numerical_columns:
    for draw in (sns.boxplot, sns.violinplot):
        draw(x='target', y=feature, data=data)
        plt.title(f'{feature} vs Target')
        plt.show()


In [None]:

# Pair Plot
# Grid of pairwise plots over the columns of the frame.
print('Pair plot')
pair_grid = sns.pairplot(data=data)
plt.show()


In [None]:

# Correlation Heatmap
# Annotated matrix of the pairwise column correlations returned by data.corr().
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:

# Groupby Statistics
# Mean of every column, computed separately for each value of the target.
print("Group by stats")
by_target = data.groupby('target')
print(by_target.mean())

# Detecting Outliers using IQR
# A row is reported for a column when its value lies more than 1.5 * IQR
# below the first quartile or above the third quartile of that column.
print("Outliers using IQR")
for col in numerical_columns:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    # Name the fences once instead of recomputing them inside the mask.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    print(f'Outliers in {col}:')
    print(outliers)

In [None]:
# Features engineering
# Score every predictor against the label with a chi-squared test and keep
# the 10 best-scoring columns.
from sklearn.feature_selection import SelectKBest, chi2

# Predictors are every column except the label.
X = data.drop(columns=['target'])
y = data['target']

selector = SelectKBest(score_func=chi2, k=10)
X_new = selector.fit(X, y).transform(X)

# Boolean mask over X's columns marking the ones the selector kept.
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
print(selected_features)


In [None]:
# Splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Label column, and every other column as the predictors.
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train and evaluate
# Fits each classifier on (X_train, y_train), predicts on X_test and records
# accuracy, precision, recall and F1 per model in `results` / `results_df`.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Define the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Fixed seed: without it the forest (and therefore every metric below)
    # changes on each run, so the notebook would not be reproducible.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(kernel='linear'),
}

# Metric label -> scoring function; each is called as metric(y_true, y_pred).
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[model_name] = {
        metric_name: metric(y_test, y_pred)
        for metric_name, metric in metric_functions.items()
    }

# Display the results
# Transposed so that each row is a model and each column a metric.
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# Make a prediction
# Runs every fitted model in `models` on two hand-written example records and
# shows the predicted class per record (rows) and per model (columns).

new_data = pd.DataFrame({
    'age': [63, 45],
    'sex': [1, 0],
    'cp': [3, 2],
    'trestbps': [145, 130],
    'chol': [100, 245],
    'fbs': [1, 0],
    'restecg': [0, 1],
    'thalach': [150, 234],
    'exang': [0, 1],
    'oldpeak': [2.3, 0.5],
    'slope': [1, 2],
    'ca': [0, 1],
    'thal': [3, 2]
})

# Put the columns in the same order as the training matrix, so the prediction
# does not depend on the order in which the dict above happens to be typed.
new_data = new_data[X_train.columns]

# Stored under its own name so the `results` metrics dict from the evaluation
# cell is not overwritten with a different kind of data.
predictions = {
    model_name: model.predict(new_data)
    for model_name, model in models.items()
}

# One row per input record, one column per model, one scalar per cell
# (instead of a single row whose cells each hold a whole array).
pred_df = pd.DataFrame(predictions, index=new_data.index)

pred_df