# Task 5

Feature Engineering and Selection
Description:
Engineer new features and select relevant features for model training.
Responsibility:
1. Generate meaningful features from existing data.
2. Use techniques like PCA or feature importance to select the most important features.
3. Optimize feature sets for improved model performance.

In [1]:
# Library import
import pandas as pd

# Load the CSV file
data = pd.read_csv('D:\\Study\\Parul University\\PUStudy\\4th sem\\#Data Science With Python\\#Internship\\Task\\5\\heart.csv')

In [2]:
# Display the first five rows of the dataset
print(data.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [3]:
# Information function
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None


In [4]:
# Describe function
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
# Create age groups column
data['age_group'] = pd.cut(data['age'], bins=[0, 40, 50, 60, 100], labels=['<40', '40-50', '50-60', '60+'])

In [6]:
# Create cholesterol level categories column
data['chol_category'] = pd.cut(data['chol'], bins=[0, 200, 239, 1000], labels=['normal', 'borderline', 'high'])

In [7]:
# Create interaction terms
data['age_chol'] = data['age'] * data['chol']
data['thalach_age'] = data['thalach'] * data['age']

In [8]:
# Display the first few rows of the dataset with the new features
print(data.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target age_group chol_category  age_chol  thalach_age  
0   2     3       0     50-60    borderline     11024         8736  
1   0     3       0     50-60    borderline     10759         8215  
2   0     3       0       60+        normal     12180         8750  
3   1     3       0       60+    borderline     12383         9821  
4   3     2       0       60+          high     18228         6572  


In [9]:
# Library import
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Selecting relevant features for PCA (Principal Component Analysis)
features = data.drop(columns=['target', 'age_group', 'chol_category'])

In [10]:
# Encode categorical features
features = pd.get_dummies(features, drop_first=True)

In [11]:
# Standardize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [12]:
# Apply PCA
pca = PCA(n_components=0.95)  # retain 95% of the variance
features_pca = pca.fit_transform(features_scaled)

In [13]:
# Check the explained variance ratio to determine the number of components
explained_variance = pca.explained_variance_ratio_
n_components = pca.n_components_

In [14]:
print(features_pca.shape)

(1025, 12)


In [15]:
print(explained_variance)

[0.20889427 0.16830688 0.08840103 0.0812662  0.06882297 0.06670356
 0.0589318  0.05530184 0.05083943 0.0478213  0.0436604  0.03479448]


In [16]:
print(n_components)

12


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Prepare the data for the Random Forest model
X = pd.get_dummies(data.drop(columns=['target', 'age_group', 'chol_category']), drop_first=True)
y = data['target']

In [18]:
# Train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

In [19]:
# Get feature importances
feature_importances = rf_model.feature_importances_
important_features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)

In [20]:
# Display the most important features
print(important_features.head(15))

cp             0.120862
ca             0.112540
thal           0.110047
oldpeak        0.090816
thalach        0.089553
age            0.071684
thalach_age    0.071066
age_chol       0.067411
exang          0.060699
chol           0.055705
trestbps       0.051452
slope          0.041440
sex            0.033022
restecg        0.016684
fbs            0.007018
dtype: float64


In [21]:
# Select the top 15 important features
top_features = important_features.head(15).index
X_top = X[top_features]

In [22]:
# Train a new Random Forest model using only the top features
rf_model_top = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_top.fit(X_top, y)

In [23]:
# Evaluate the new model
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_model_top, X_top, y, cv=5)
print(f'Cross-validated scores: {scores}')
print(f'Mean score: {scores.mean()}')

Cross-validated scores: [1.         1.         1.         1.         0.98536585]
Mean score: 0.9970731707317073
