In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import pandas as pd

# URL for the Heart Disease dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Load the dataset into a pandas DataFrame
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

ds = pd.read_csv(url, names=column_names)

print(ds.head())

    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope   ca thal  num  
0    3.0  0.0  6.0    0  
1    2.0  3.0  3.0    2  
2    2.0  2.0  7.0    1  
3    3.0  0.0  3.0    0  
4    1.0  0.0  3.0    0  


In [None]:
ds.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [None]:
# Create a binary feature 'high_chol' where 1 indicates chol > 240, and 0 indicates chol <= 200
ds['high_chol'] = ds['chol'].apply(lambda x: 1 if x > 240 else 0)

# Inspect the updated DataFrame to check the new feature
print(ds[['chol', 'high_chol']].head(10))

    chol  high_chol
0  233.0          0
1  286.0          1
2  229.0          0
3  250.0          1
4  204.0          0
5  236.0          0
6  268.0          1
7  354.0          1
8  254.0          1
9  203.0          0


In [None]:
#Convert categorical features to numerical using Label Encoding
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in ds.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    ds[column] = le.fit_transform(ds[column])
    label_encoders[column] = le

print(ds.head())

    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   

   slope  ca  thal  num  high_chol  
0    3.0   0     1    0          0  
1    2.0   3     0    2          1  
2    2.0   2     2    1          0  
3    3.0   0     0    0          1  
4    1.0   0     0    0          0  


In [None]:
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = ds.drop('num', axis=1)  # Assuming 'num' is the target column
y = ds['num']

# Apply Chi-Square test
chi_scores, p_values = chi2(X, y)

# Create a DataFrame with the feature names and their corresponding Chi-Square scores and p-values
chi2_results = pd.DataFrame({
    'Feature': X.columns,
    'Chi-Square Score': chi_scores,
    'P-Value': p_values
})

# Sort the features by Chi-Square score
chi2_results.sort_values(by='Chi-Square Score', ascending=False, inplace=True)

print(chi2_results)


      Feature  Chi-Square Score       P-Value
7     thalach        215.713336  1.567831e-45
9     oldpeak        101.997523  3.694131e-21
12       thal         93.798751  2.052059e-19
11         ca         89.692947  1.530136e-18
8       exang         41.534482  2.083076e-08
4        chol         37.700089  1.292085e-07
0         age         27.922884  1.292993e-05
3    trestbps         18.870256  8.334390e-04
2          cp         16.881183  2.038435e-03
6     restecg         14.057550  7.113628e-03
10      slope         10.542204  3.222058e-02
1         sex          7.499223  1.117436e-01
5         fbs          6.658223  1.550901e-01
13  high_chol          3.202896  5.244634e-01


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

X = ds.drop('num', axis=1)
y = ds['num']   #target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = LogisticRegression(max_iter=2000, solver='liblinear')

# Apply Recursive Feature Elimination (RFE) to select features
rfe = RFE(model, n_features_to_select=5)
rfe = rfe.fit(X_train, y_train)

# Check which features were selected
selected_features = X.columns[rfe.support_]
print("Selected features: ", selected_features)

print("Feature ranking: ", rfe.ranking_)

# Evaluate model with the selected features
score = rfe.score(X_test, y_test)
print("Model accuracy with selected features: ", score)


Selected features:  Index(['sex', 'fbs', 'exang', 'ca', 'thal'], dtype='object')
Feature ranking:  [ 7  1  5  9 10  1  6  8  1  4  3  1  1  2]
Model accuracy with selected features:  0.5714285714285714


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# StandardScaler: Standardizes the features
scaler_standard = StandardScaler()
heart_disease_standardized = ds.copy()
heart_disease_standardized[numerical_cols] = scaler_standard.fit_transform(ds[numerical_cols])

# MinMaxScaler: Normalizes the features
scaler_minmax = MinMaxScaler()
heart_disease_normalized = ds.copy()
heart_disease_normalized[numerical_cols] = scaler_minmax.fit_transform(ds[numerical_cols])

print("Standardized Data:\n", heart_disease_standardized[numerical_cols].head())
print("\nNormalized Data:\n", heart_disease_normalized[numerical_cols].head())


Standardized Data:
         age  trestbps      chol   thalach   oldpeak
0  0.948726  0.757525 -0.264900  0.017197  1.087338
1  1.392002  1.611220  0.760415 -1.821905  0.397182
2  1.392002 -0.665300 -0.342283 -0.902354  1.346147
3 -1.932564 -0.096170  0.063974  1.637359  2.122573
4 -1.489288 -0.096170 -0.825922  0.980537  0.310912

Normalized Data:
         age  trestbps      chol   thalach   oldpeak
0  0.708333  0.481132  0.244292  0.603053  0.370968
1  0.791667  0.622642  0.365297  0.282443  0.241935
2  0.791667  0.245283  0.235160  0.442748  0.419355
3  0.166667  0.339623  0.283105  0.885496  0.564516
4  0.250000  0.339623  0.178082  0.770992  0.225806
