In [3]:
# eda
import pandas as p
import matplotlib.pyplot as g
import seaborn as s

# pipeline 
from sklearn.pipeline import Pipeline as pipe

# imputation 
from sklearn.impute import SimpleImputer as si

# scaling 
from sklearn.preprocessing import StandardScaler as ss


# transformation
from sklearn.compose import ColumnTransformer as ct

# model selection 
from sklearn.model_selection import train_test_split as tt , cross_val_score as cv , LeaveOneOut as loo , KFold as kf , RandomizedSearchCV as rc



# models 
from xgboost import XGBClassifier as xg  
from sklearn.ensemble import AdaBoostClassifier as ad , RandomForestClassifier as rf


# metrics
from sklearn.metrics import accuracy_score as acc  

In [2]:
na_values = ['na' , 'N/A' , '??', '--']

df = p.read_csv('heart.csv', na_values=na_values)

In [None]:
print(df.shape) 
print(df.columns) 
print(df.head().T) 
print(df.info()) 

In [None]:
print(df.describe().T)
print(df.describe(include=['object', 'category']).T)

In [4]:
df = df.drop_duplicates()

In [None]:
## univariate analysis

# Select numeric columns
num = df.select_dtypes(include=['int64', 'float64']).columns

# Plot histograms and boxplots for each numeric column
for col in num:
    g.figure(figsize=(10, 3))

    # Histogram
    g.subplot(1, 2, 1)
    s.histplot(df[col].dropna(), kde=True)
    g.title(f"Histogram of {col}")

    # Boxplot
    g.subplot(1, 2, 2)
    s.boxplot(x=df[col].dropna())
    g.title(f"Boxplot of {col}")

    g.tight_layout()
    g.show()


In [5]:
X = df.drop(columns=['target'] , axis =1)
y = df['target']

In [6]:
cat = X.select_dtypes(include=['object','category']).columns.tolist()

cat_pipe = pipe(steps=[
    
    ('impute', si(strategy='most_frequent'))
    
])

In [7]:
num = X.select_dtypes(include=['int','float']).columns.tolist()


num_pipe = pipe(steps=[
    ('impute' , si(strategy='mean')),
    ('scaler' , ss())
    
    
    
])

In [8]:
transform = ct(transformers =[
    ('numbers' ,num_pipe , num) , 
    ('objects' , cat_pipe , cat),
])

In [9]:
pro = pipe(steps =[
    ('transfrom' ,transform ) ,
    ('classifer ' , ad())
])

In [10]:
kf = kf(n_splits= len(df), shuffle=True, random_state=42)

In [11]:
cv = cv(pro , X , y , cv = kf , scoring = 'accuracy' )



In [12]:
print(f"score are \n\n {cv}")
print(f"mean is  {cv.mean()}")

score are 

 [1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.
 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1.
 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0.
 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1.
 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.]
mean is  0.8079470198675497


In [13]:
pro.fit(X,y)



In [14]:
y_pred = pro.predict(X)
print(acc(y, y_pred))


0.9105960264900662
