## Feature Engineering
- Age Category
- BMI Category - Underweight, Normal Weight, Overweight, Obesity
- Pollution Risk - Location and Air Pollution level
- Smoking Status Encoding
- Interaction Features
- Location Encoding

In [1]:
# import the libraries needed
import pandas as pd
import numpy as np

In [2]:
# Read the synthetic COPD dataset from CSV into `df` and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this drive layout exists.
path = r'M:\Data Science\COPD_Prediction\Data\synthetic_COPD_data.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0


In [3]:
# Age categories: decade-wide bands. pd.cut builds right-closed intervals by
# default, so these edges give (29, 39], (39, 49], ... which for integer ages
# means 30-39, 40-49, ... Ages outside 30-79 fall in no bin and become NaN.
df['Age_Category'] = pd.cut(
    df['Age'],
    bins=[29, 39, 49, 59, 69, 79],
    labels=['30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79'],
)

# BMI categories using the conventional cut-offs:
#   Underweight < 18.5 <= Normal Weight < 25 <= Overweight < 30 <= Obesity
# right=False makes each interval left-closed, so a BMI of exactly 18.5, 25 or
# 30 lands in the higher category, and the open-ended last bin keeps BMI values
# above 35 from silently turning into NaN.
df['BMI_Categories'] = pd.cut(
    df['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal Weight', 'Overweight', 'Obesity'],
    right=False,
)

In [4]:
# Pollution Risk Score: binary flag, 1 when Air_Pollution_Level is above 150, else 0.
# NOTE(review): the 150 cut-off is not explained in this notebook; confirm the
# intended threshold.
df['Pollution_Risk_Score'] = np.where(df['Air_Pollution_Level'] > 150, 1, 0)

# Encode Smoking Status numerically: Never -> 0, Former -> 0.5, Current -> 1.
# Any other label maps to NaN.
df['Smoking_Status_encoded'] = df['Smoking_Status'].map({'Current': 1, 'Former': 0.5, 'Never': 0})

# Encode Gender in place: Male -> 1, Female -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without
# them a second execution would map the already-encoded integers to NaN and
# wipe out the column.
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0, 1: 1, 0: 0})

# Interaction Features - Smoking Status and Pollution level:
# the encoded smoking weight multiplied by the raw air-pollution level.
df['Smoking_Pollution_interaction'] = df['Smoking_Status_encoded'] * df['Air_Pollution_Level']

In [5]:
# One-hot encode Location into indicator columns. drop_first=True omits the
# indicator for the first category, which then serves as the implicit baseline.
df = pd.get_dummies(data=df, columns=['Location'], drop_first=True)

In [6]:
# Preview the first rows after the encoding steps above
df.head()

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,...,Smoking_Pollution_interaction,Location_Biratnagar,Location_Butwal,Location_Chitwan,Location_Dharan,Location_Hetauda,Location_Kathmandu,Location_Lalitpur,Location_Nepalgunj,Location_Pokhara
0,31,1,Former,1,1,1,27.56,84,0,0,...,42.0,False,False,False,False,False,False,True,False,False
1,60,1,Never,1,0,0,30.3,131,1,0,...,0.0,False,False,False,False,False,False,False,False,True
2,33,1,Former,0,0,1,28.45,123,1,0,...,61.5,False,False,False,False,False,False,False,False,True
3,36,0,Current,1,0,0,27.49,253,0,1,...,253.0,False,False,False,False,False,True,False,False,False
4,58,1,Never,0,0,0,25.49,117,1,0,...,0.0,False,False,False,False,False,False,False,False,True


## Machine Learning

In [7]:
# Remove the raw Smoking_Status text column together with the derived
# category, risk-score and interaction columns.
# NOTE(review): this discards most of the features engineered above, before
# the data is saved; confirm that is intended.
df = df.drop(
    [
        'Smoking_Status',
        'Age_Category',
        'BMI_Categories',
        'Pollution_Risk_Score',
        'Smoking_Pollution_interaction',
    ],
    axis='columns',
)

In [8]:
# Summarise the remaining columns: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Gender                            1000 non-null   int64  
 2   Biomass_Fuel_Exposure             1000 non-null   int64  
 3   Occupational_Exposure             1000 non-null   int64  
 4   Family_History_COPD               1000 non-null   int64  
 5   BMI                               1000 non-null   float64
 6   Air_Pollution_Level               1000 non-null   int64  
 7   Respiratory_Infections_Childhood  1000 non-null   int64  
 8   COPD_Diagnosis                    1000 non-null   int64  
 9   Smoking_Status_encoded            1000 non-null   float64
 10  Location_Biratnagar               1000 non-null   bool   
 11  Location_Butwal                   1000 non-null   bool   
 12  Locatio

In [9]:
# Write the engineered DataFrame to a CSV in the current working directory,
# leaving out the row index.
df.to_csv(path_or_buf='engineered_COPD_data.csv', index=False)