In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import display

# Data preprocessing

In [None]:
# Load the dataset; the relative path means heart.csv must sit in the notebook's working directory.
data=pd.read_csv('heart.csv')

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called on
# its own; passing it to display() would render a stray "None" in the output.
data.info()
display(data.describe(),
        data.shape,
        data.size,
        data.head()
        )

In [None]:
# Report only the columns that contain at least one missing value.
# isna() and isnull() are aliases in pandas, so both sections list the same columns.
nan_counts = data.isna().sum()
print("NAN", nan_counts[nan_counts > 0], sep="\n")

null_counts = data.isnull().sum()
print("\nNULL", null_counts[null_counts > 0], sep="\n")

In [None]:
from sklearn.preprocessing import StandardScaler
def scalling(data):
    """Standardize the float64/int64 columns of ``data``.

    Columns of any other dtype are left out of the result. Each retained
    column is transformed with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names and the same row index as the
        numeric part of ``data``.
    """
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    # Carry the original index over: without it the result gets a fresh
    # RangeIndex and no longer aligns row-by-row with `data` when the input
    # has a non-default index (e.g. after filtering or sampling).
    scaled_df = pd.DataFrame(
        scaled_data,
        columns=numeric_data.columns,
        index=numeric_data.index,
    )
    return scaled_df

In [None]:
# Standardized copy of every float64/int64 column of `data` (this includes the last column).
scalled_data=scalling(data)

In [None]:
# Every column except the last goes to x; the last column goes to y.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Same positional split on the standardized frame.
# Note that scalled_y is the *standardized* last column, not the raw one held in y.
scalled_x, scalled_y = scalled_data.iloc[:, :-1], scalled_data.iloc[:, -1]

# Feature selection

## Filter Method
- it considers each feature independently
- less computationally expensive

Variance Threshold
- removes features with low variance
- assumes that high variance = more info

In [None]:
from sklearn.feature_selection import VarianceThreshold
def varience(data, threshold=0.2):
    """Drop numeric features whose variance does not exceed ``threshold``.

    Only float64/int64 columns are considered. A short summary of the kept and
    removed features is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.2
        Variance cut-off passed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the selected numeric columns.
    """
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    selector = VarianceThreshold(threshold=threshold)

    # Fit on the numeric subset only. The support mask is applied to
    # numeric_data.columns below, so fitting on the full frame would either
    # fail on non-numeric columns or produce a mask of the wrong length.
    selected_data = selector.fit_transform(numeric_data)

    support = selector.get_support()
    selected_features = numeric_data.columns[support]
    features_removed = numeric_data.columns[~support].tolist()

    print(f"Original features: {numeric_data.shape[1]}")
    print(f"Features after variance thresholding: {selected_data.shape[1]}")
    print(f"Features removed: {numeric_data.shape[1] - selected_data.shape[1]}")
    print("\nSelected features:")
    print(selected_features.tolist())
    print("\nremoved features:")
    print(features_removed)
    final_data = data[selected_features]
    return final_data

In [None]:
# `scaled` is still built here because the correlation cell below uses it.
scaled=scalling(data)
# Variance thresholding is applied to the raw data: after standardization every
# non-constant column has variance 1, so a 0.2 threshold could never remove anything.
varience(data)

Correlation-based selection
- removes highly correlated features, as they likely provide redundant info

In [None]:
def corr(data, threshold=0.8):
    """Drop one feature from every highly correlated pair of numeric columns.

    Absolute pairwise correlations are computed over the float64/int64
    columns. For each pair whose absolute correlation exceeds ``threshold``,
    the column that appears later in the frame is dropped. The list of dropped
    column names is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.8
        Absolute-correlation cut-off above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the dropped columns; non-numeric columns are kept.
    """
    numeric_data = data.select_dtypes(include=['float64', 'int64'])

    corr_matrix = numeric_data.corr().abs()

    # Keep only the strict upper triangle: each pair is inspected once and the
    # diagonal (self-correlation) can never trigger a drop.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    print(to_drop)
    data_selected = data.drop(columns=to_drop)

    return data_selected

In [None]:
# Drop the later column of every pair whose absolute correlation exceeds 0.8; the returned frame is the cell output.
corr(scaled)

Statistical Tests
- Uses statistical tests to select features that have the strongest relationship with the output variable.

In [None]:
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

def chi2test(data):
    """Return the chi-squared p-value of every feature against the target.

    The last column of ``data`` is the target and all preceding columns are
    the features. The test is computed on the training portion of an 80/20
    split (``random_state=1``).

    NOTE: scikit-learn's ``chi2`` expects non-negative feature values, so this
    is meant for the raw data rather than the standardized frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is the target.

    Returns
    -------
    pandas.Series
        p-values indexed by feature name, sorted ascending so the features
        with the strongest evidence of dependence come first.
    """
    x = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    xtrain, _, ytrain, _ = train_test_split(x, y, test_size=0.2, random_state=1)
    # chi2 returns (statistics, p-values); only the p-values are reported.
    _, raw_p_values = chi2(xtrain, ytrain)
    p_values = pd.Series(raw_p_values, index=xtrain.columns)
    # Sort by p-value, not by feature name: sort_index ordered the result
    # reverse-alphabetically, which says nothing about feature relevance.
    return p_values.sort_values()

In [None]:
# Chi-squared p-value per feature, using the last column of `data` as the target.
chi2test(data)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
def anovatest(data):
    """Print the ten features with the highest ANOVA F-scores.

    The last column of ``data`` is used as the target and every preceding
    column as a feature. Scores and p-values come from ``f_classif`` evaluated
    on the training portion of an 80/20 split (``random_state=1``).
    Nothing is returned.
    """
    features, target = data.iloc[:, :-1], data.iloc[:, -1]
    xtrain, _, ytrain, _ = train_test_split(
        features, target, test_size=0.2, random_state=1
    )

    # Only the fitted scores are needed; the transformed matrix is not used.
    kbest = SelectKBest(f_classif, k=10)
    kbest.fit(xtrain, ytrain)

    ranking = pd.DataFrame({
        'Feature': xtrain.columns,
        'F Score': kbest.scores_,
        'P Value': kbest.pvalues_,
    }).sort_values('F Score', ascending=False)

    print(ranking.head(10))

In [None]:
# Print the ten highest ANOVA F-scores, using the last column of `data` as the target.
anovatest(data)

In [None]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest
def mutualinfotest(data):
    """Print the ten features with the highest mutual information with the target.

    The last column of ``data`` is used as the target and every preceding
    column as a feature. Scores are estimated on the training portion of an
    80/20 split (``random_state=1``). Nothing is returned.
    """
    from functools import partial

    x = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    xtrain, _, ytrain, _ = train_test_split(x, y, test_size=0.2, random_state=1)

    # mutual_info_classif uses a randomized estimator; pinning its seed makes
    # the ranking identical on every run, like the seeded split above.
    seeded_mutual_info = partial(mutual_info_classif, random_state=1)
    selector = SelectKBest(seeded_mutual_info, k=10)
    # Only the fitted scores are needed; the transformed matrix is not used.
    selector.fit(xtrain, ytrain)

    # These are mutual-information estimates, not F statistics, so the column
    # is labelled accordingly.
    feature_scores = pd.DataFrame({
        'Feature': xtrain.columns,
        'MI Score': selector.scores_,
    })

    feature_scores = feature_scores.sort_values('MI Score', ascending=False)
    print(feature_scores.head(10))

In [None]:
# Print the ten highest mutual-information scores, using the last column of `data` as the target.
mutualinfotest(data)

## Wrapper Method
- use a predictive model to score feature subsets
- train a new model on each feature subset and measure its performance to select the best features

Recursive Feature Elimination (RFE)
- Recursively removes the weakest feature(s) until the desired number of features is reached.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def reftest(scalled_x,y):
    """Run recursive feature elimination and print the outcome.

    A ``LogisticRegression`` (``max_iter=1000``) is wrapped in ``RFE``, which
    removes one feature per step until ten remain. The selected feature names
    and the full ranking table are printed. Nothing is returned.

    Parameters
    ----------
    scalled_x : pandas.DataFrame
        Feature matrix; its column names are used in the printed report.
    y : array-like
        Target passed to the estimator.
    """
    rfe = RFE(
        estimator=LogisticRegression(max_iter=1000),
        n_features_to_select=10,
        step=1,
    )
    # Only the fitted support mask and ranking are needed; the transformed matrix is not used.
    rfe.fit(scalled_x, y)

    kept = scalled_x.columns[rfe.support_]
    print("Selected features:", kept.tolist())

    ranking_table = pd.DataFrame(
        {'Feature': scalled_x.columns, 'Ranking': rfe.ranking_}
    ).sort_values('Ranking')
    print("\nFeature ranking (1 = selected, higher = eliminated earlier):")
    print(ranking_table)

In [None]:
# RFE on the standardized features against `y`, the unscaled last column of `data`.
reftest(scalled_x,y)

Forward/Backward Selection
- Forward selection starts with no features and adds them one by one, while backward selection starts with all features and removes them one by one.

In [93]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# NOTE(review): other cells in this notebook treat `y` as a class label
# (f_classif, LogisticRegression), while here it is fitted with LinearRegression
# and scored with r2 as if it were continuous — confirm this is intended.

# Forward selection: start with no features and add one per step up to 10.
sfs_forward = SFS(LinearRegression(),
                  k_features=10,
                  forward=True,
                  verbose=2,
                  scoring='r2')
sfs_forward.fit(scalled_x, y)
X_selected_forward = sfs_forward.transform(scalled_x)

# Backward selection: start with all features and remove one per step down to 10.
sfs_backward = SFS(LinearRegression(),
                   k_features=10,
                   forward=False,
                   verbose=2,
                   scoring='r2')
sfs_backward.fit(scalled_x, y)
X_selected_backward = sfs_backward.transform(scalled_x)

# Each direction now keeps its own result instead of the backward run silently
# overwriting the forward one. X_selected still ends up holding the backward
# result, as it did before, for any later cell that reads it.
X_selected = X_selected_backward


[2025-03-20 23:42:29] Features: 1/10 -- score: 0.18991694170523726
[2025-03-20 23:42:29] Features: 2/10 -- score: 0.3202729761019654
[2025-03-20 23:42:30] Features: 3/10 -- score: 0.37901379189669787
[2025-03-20 23:42:30] Features: 4/10 -- score: 0.4231779108022288
[2025-03-20 23:42:30] Features: 5/10 -- score: 0.4541934600545228
[2025-03-20 23:42:30] Features: 6/10 -- score: 0.47210549907312005
[2025-03-20 23:42:30] Features: 7/10 -- score: 0.4879756394058109
[2025-03-20 23:42:30] Features: 8/10 -- score: 0.4932556597799903
[2025-03-20 23:42:30] Features: 9/10 -- score: 0.49719930134170864
[2025-03-20 23:42:30] Features: 10/10 -- score: 0.4998121001307898
[2025-03-20 23:42:30] Features: 12/10 -- score: 0.5003891746727683
[2025-03-20 23:42:30] Features: 11/10 -- score: 0.5004238310924107
[2025-03-20 23:42:30] Features: 10/10 -- score: 0.4998121001307898