In [33]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split



In [34]:
# Load the heart-disease dataset into the working DataFrame.
heart_csv_path = '/content/heart.csv'
df = pd.read_csv(heart_csv_path)

In [35]:
# NOTE(review): `categorical` is not referenced in any visible cell — confirm it is needed.
from pandas.core.arrays import categorical

# Split the raw frame into its numerical and its categorical feature columns.
numerical_features_df = df.loc[:, ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']]
categorical_features_df = df.loc[
    :, ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
]


In [36]:
# Remove exact duplicate rows from `df` in place and report the shape change.
shape_before = df.shape
df.drop_duplicates(keep='first', inplace=True)
shape_after = df.shape
print("Shape of the DataFrame before dropping duplicates:", shape_before)
print("Shape of the DataFrame after dropping duplicates:", shape_after)

Shape of the DataFrame before dropping duplicates: (303, 14)
Shape of the DataFrame after dropping duplicates: (302, 14)


In [37]:
#replacing outliers
def replace_outliers_with_nulls(data, columns, iqr_factor=1.5):
    """Return a copy of ``data`` with IQR outliers in ``columns`` set to null.

    For each column, a value is an outlier when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``, where Q1 and
    Q3 are the column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str
        Column name(s) to check. A single name is accepted as a convenience.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to set the outlier fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which only the outlier cells are replaced by nulls.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    result = data.copy()
    for col in columns:
        values = result[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_limit = q1 - iqr_factor * iqr
        upper_limit = q3 + iqr_factor * iqr

        is_outlier = (values < lower_limit) | (values > upper_limit)
        # Series.mask upcasts integer columns as needed, whereas writing NaN
        # into an integer column through .loc is a deprecated upcasting setitem.
        result[col] = values.mask(is_outlier)
    return result

# Replace the outliers in the numerical columns with nulls.
# Pass the column names explicitly instead of the DataFrame itself, so the call
# does not rely on DataFrame iteration happening to yield column labels.
df_clean = replace_outliers_with_nulls(df, list(numerical_features_df.columns))

In [38]:
# Fill the nulled-out outliers with their column mean.
# Only the outlier-treated numerical columns are imputed, so categorical codes
# and the target are never filled with a fractional mean.
outlier_cols = list(numerical_features_df.columns)
df_filled = df_clean.fillna(df_clean[outlier_cols].mean())


In [None]:
# Count the nulls left in each column after mean filling.
print(df_filled.isna().sum())

In [39]:


# Column groups: numerical features to scale, and categorical features.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Fit a min-max scaler on the numerical columns, then apply it to those same columns.
scaler = MinMaxScaler()
scaler.fit(df_filled[numerical_cols])
scaled_values = scaler.transform(df_filled[numerical_cols])

# Wrap the scaled array in a frame that keeps df_filled's index and column names.
numerical_features_df = pd.DataFrame(
    data=scaled_values,
    index=df_filled.index,
    columns=numerical_cols,
)

# Overwrite the numerical columns of df_filled with their scaled values.
df_filled[numerical_cols] = numerical_features_df[numerical_cols]

display(df_filled.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.708333,1,3,0.671053,0.457265,1,0,0.54386,0,0.575,0,0,1,1
1,0.166667,1,2,0.473684,0.529915,0,1,0.868421,0,0.875,0,0,2,1
2,0.25,0,1,0.473684,0.333333,0,0,0.736842,0,0.35,2,0,2,1
3,0.5625,1,1,0.342105,0.470085,0,1,0.789474,0,0.2,2,0,2,1
4,0.583333,0,0,0.342105,0.974359,0,1,0.657895,1,0.15,2,0,2,1


In [55]:
# Features are every column except the label; the label is the "target" column.
# NOTE(review): this reads `df`, not `df_filled`, so the outlier replacement and
# mean filling done earlier do not reach the model — confirm this is intended.
X, y = df.drop(columns="target"), df["target"]

In [75]:
# Preprocessing: standardize the numerical columns, one-hot encode the categorical ones.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')  # drop first to avoid dummy trap

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols)
    ],
    # OneHotEncoder produces sparse output by default, and ColumnTransformer
    # returns a sparse matrix when the combined density is below its threshold.
    # GaussianNB needs dense input, so force dense output here.
    sparse_threshold=0,
)

In [76]:
# Chain the preprocessing step and a Gaussian Naive Bayes classifier into one estimator.
pipeline_steps = [('preprocess', preprocessor), ('nb', GaussianNB())]
model = Pipeline(steps=pipeline_steps)

In [84]:
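# NOTE: SMOTE oversampling is disabled (commented out); the shapes printed below
# this cell are left over from an earlier run in which it was enabled.
# from imblearn.over_sampling import SMOTE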

# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# print("Original dataset shape:", X.shape, y.shape)
# print("Resampled dataset shape:", X_resampled.shape, y_resampled.shape)


Original dataset shape: (302, 13) (302,)
Resampled dataset shape: (328, 13) (328,)


In [85]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [86]:
# Fit the pipeline on the training rows, then predict labels for the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [87]:
# Evaluate the model: share of held-out rows whose predicted label matches the true one.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.803030303030303


In [88]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the held-out rows.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.81      0.78      0.79        32
           1       0.80      0.82      0.81        34

    accuracy                           0.80        66
   macro avg       0.80      0.80      0.80        66
weighted avg       0.80      0.80      0.80        66

