In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# --- a. Data Cleaning ---
# Load the raw CSV, turn '?' placeholders into NaN, drop incomplete rows, and
# then coerce every column to a numeric dtype (unparseable cells become NaN).
df = (
    pd.read_csv("/Users/akshay/Desktop/dsbda_practical/newdata/heart.csv")
    .replace('?', np.nan)
    .dropna()
    .apply(pd.to_numeric, errors='coerce')
)

# Keep only rows in which every value is >= 0. NaN compares as False, so rows
# holding a cell that failed numeric coercion are discarded here as well.
df = df.loc[df.ge(0).all(axis=1)]

print("✅ Cleaned Data:")
print(df.head())

# --- b. Error Correcting (Outlier Detection & Removal) ---
# Z-score method for outlier detection
from scipy import stats

# A row is dropped when any feature lies 3 or more standard deviations from
# that feature's mean.
Z_THRESHOLD = 3

# Score the feature columns only. The class label is the quantity being
# predicted, not a measurement: z-scoring it could delete an entire (rare)
# class as "outliers".
feature_values = df.drop(columns='target', errors='ignore').to_numpy(dtype=float)
z_scores = np.abs(stats.zscore(feature_values, axis=0))

# A constant column has zero standard deviation, so its z-scores are NaN.
# `NaN < threshold` is False, which would silently discard every row; treat
# such cells as in-range instead.
within_range = np.isnan(z_scores) | (z_scores < Z_THRESHOLD)
df = df[within_range.all(axis=1)]

print("\n✅ After Removing Outliers:")
print(df.describe())

# --- c. Data Transformation ---
# Separate the predictors from the class label.
X = df.drop('target', axis=1)
y = df['target']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the reported accuracies optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features: learn mean/std from the training rows only, then apply
# that same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the whole feature matrix (using the training statistics),
# kept under its original name for display and for any later cell that uses it.
X_scaled = scaler.transform(X)

print("\n✅ Transformed Data (Scaled Features):")
print(pd.DataFrame(X_scaled, columns=X.columns).head())

# --- d. Build Data Model using Regression and kNN ---

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# k-Nearest Neighbors
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_preds = knn_model.predict(X_test)
knn_acc = accuracy_score(y_test, knn_preds)

# --- Results ---
print("\n✅ Accuracy Comparison:")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"kNN Accuracy (k=5): {knn_acc * 100:.2f}%")

✅ Cleaned Data:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  

✅ After Removing Outliers:
              age         sex          cp   trestbps        chol         fbs  \
count  969.000000  969.000000  969.000000  969.00000  969.000000  969.000000   
mean    54.417957    0.701754    0.948400  130.98968  244.467492    0.143447   
std      9.074174    0.457724    1.035237   16.94101   46.125807    0.350709   
min     29.000000