# Data Preprocessing and Feature Engineering in Machine Learning

In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [6]:
# Location of the adult_with_headers CSV.
# NOTE(review): this is an absolute path on one machine; point it at your own copy.
DATA_PATH = r"C:\Users\ms104154\Downloads\adult_with_headers.csv"

# Read the CSV into a DataFrame, then preview its first rows.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [10]:
# Show each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [12]:
# Drop, in place, every row that contains a missing (NaN/None) value.
# NOTE(review): only real NaN values are removed; placeholder strings such as '?'
# would not be caught -- TODO confirm the categorical columns contain none.
data.dropna(inplace=True)

In [14]:
# One scaler object per technique: standardisation and min-max rescaling.
scaler_standard, scaler_minmax = StandardScaler(), MinMaxScaler()

In [27]:
# Scale the numerical 'age' column with both techniques.
# Each scaler is fitted on 'age' alone and its result stored as a new column.
age_frame = data[['age']]
data['age_scaled_standard'] = scaler_standard.fit_transform(age_frame)
data['age_scaled_minmax'] = scaler_minmax.fit_transform(age_frame)

# Task 2: Encoding Techniques

In [30]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [34]:
# One-hot encode the candidate categorical columns that have fewer than 5 distinct
# values. Candidates with 5 or more distinct values are left untouched by this cell.
onehot_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'race']
encoded_frames = []
for col in onehot_cols:
    if len(data[col].unique()) < 5:
        # drop='first' omits the first category so the indicator columns are not collinear.
        onehot_encoder = OneHotEncoder(drop='first')
        encoded_values = onehot_encoder.fit_transform(data[[col]]).toarray()
        # One name per encoded column: <col>_1 ... <col>_{k-1} for k categories.
        encoded_names = [col + '_' + str(i) for i in range(1, encoded_values.shape[1] + 1)]
        # Reuse data's index so the concat below aligns row-for-row rather than
        # against a fresh 0..n-1 index (which would misalign after rows are dropped).
        encoded_frames.append(
            pd.DataFrame(encoded_values, columns=encoded_names, index=data.index)
        )

if encoded_frames:
    encoded_cols = pd.concat(encoded_frames, axis=1)
    # Remove same-named columns left by an earlier run so re-running this cell
    # does not create duplicate columns.
    data = pd.concat(
        [data.drop(columns=encoded_cols.columns, errors='ignore'), encoded_cols],
        axis=1,
    )
onehot_cols

['workclass', 'marital_status', 'occupation', 'relationship', 'race']

# Task 3: Feature Engineering

In [41]:
# Derive two extra features from existing numeric columns.
# Net capital change: gains minus losses.
data['capital-gain-minus-loss'] = data['capital_gain'].sub(data['capital_loss'])
# Square of age; the 'age_sqaured' spelling of the column name is kept unchanged.
data['age_sqaured'] = data['age'].pow(2)

In [43]:
 # Log-transform the skewed 'capital_gain' feature; log1p computes log(1 + x), so zeros map to 0.
skewed_values = data.loc[:, 'capital_gain']
data['capital-gain_log'] = np.log1p(skewed_values)

# Task 4: Feature Selection

In [60]:
# Using the Isolation Forest Algorithm to identify and remove outliers
from sklearn.ensemble import IsolationForest
# ppscore is not used in this cell; it is needed by the PPS cell further down.
# It is imported before any data is modified, so a missing package fails the cell
# without leaving `data` partially processed. Install it first (`%pip install ppscore`).
import ppscore as pps

# contamination=0.05 flags roughly 5% of the rows as outliers; a fixed random_state
# makes the flagged set reproducible from run to run.
iso = IsolationForest(contamination=0.05, random_state=42)
# fit_predict returns 1 for inliers and -1 for outliers.
outliers = iso.fit_predict(data[['age_scaled_standard', 'age_scaled_minmax']])
# Filter with a boolean mask and take an explicit copy, instead of adding a temporary
# 'outlier' column and dropping it in place from a filtered slice.
data = data.loc[outliers == 1].copy()  # keep only non-outlier entries


ModuleNotFoundError: No module named 'ppscore'

In [None]:
# Using the Isolation Forest Algorithm to identify and remove outliers
# NOTE(review): this cell repeats the outlier-removal cell directly above it; running
# both filters the data twice. Keep only one of them in the final notebook.
from sklearn.ensemble import IsolationForest
# ppscore is not used in this cell; it is needed by the PPS cell further down.
# It is imported before any data is modified, so a missing package fails the cell
# without leaving `data` partially processed. Install it first (`%pip install ppscore`).
import ppscore as pps

# contamination=0.05 flags roughly 5% of the rows as outliers; a fixed random_state
# makes the flagged set reproducible from run to run.
iso = IsolationForest(contamination=0.05, random_state=42)
# fit_predict returns 1 for inliers and -1 for outliers.
outliers = iso.fit_predict(data[['age_scaled_standard', 'age_scaled_minmax']])
# Filter with a boolean mask and take an explicit copy, instead of adding a temporary
# 'outlier' column and dropping it in place from a filtered slice.
data = data.loc[outliers == 1].copy()  # keep only non-outlier entries


In [None]:
# Calculate the PPS matrix
pps_matrix = pps.matrix(data)

# Plotting the heatmap
import seaborn as sns
import matplotlib.pyplot as plt

# Recent ppscore releases return one row per (x, y) pair (long format), which must be
# pivoted into a square matrix before it can be drawn as a heatmap. If the result has
# no x / y / ppscore columns it is assumed to be a wide matrix already and used as is.
if {'x', 'y', 'ppscore'}.issubset(pps_matrix.columns):
    pps_heatmap_data = pps_matrix[['x', 'y', 'ppscore']].pivot(
        columns='x', index='y', values='ppscore'
    )
else:
    pps_heatmap_data = pps_matrix

plt.figure(figsize=(12, 10))
sns.heatmap(pps_heatmap_data, annot=True, fmt=".2f")
plt.title('PPS Matrix')
plt.show()

# Comparing with correlation matrix
# Pearson correlation is only defined for numeric columns; the object (string) columns
# are excluded explicitly because newer pandas raises instead of silently skipping them.
corr_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
pip install ppscore