# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

# 1. Data Exploration and Preprocessing

conduct basic data exploration (summary statistics, missing values, data types)

In [2]:
import pandas as pd
df = pd.read_csv('C:\\db\\adult_with_headers.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Printing  the summary statistics of the dataset
print(df.describe())

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [4]:
# Print the data types of each column in the dataset
print(df.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [5]:
numerical_data = df.select_dtypes(include=['int64', 'float64'])
categorical_data = df.select_dtypes(include=['object'])
print ('numarical data :',numerical_data)
print ('chatagorical data :',categorical_data)

numarical data :        age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week
0       39   77516             13          2174             0              40
1       50   83311             13             0             0              13
2       38  215646              9             0             0              40
3       53  234721              7             0             0              40
4       28  338409             13             0             0              40
...    ...     ...            ...           ...           ...             ...
32556   27  257302             12             0             0              38
32557   40  154374              9             0             0              40
32558   58  151910              9             0             0              40
32559   22  201490              9             0             0              20
32560   52  287927              9         15024             0              40

[32561 rows x 6 columns]
chatagorical data :  

# Handling missing values (imputation, removal, etc.).

In [6]:
# Check for missing values
missing_values = df.isnull().sum()

# Impute missing values for numerical columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Impute missing values for categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

# Check for missing values again
print(df.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


# Applying scaling techniques to numerical features:
# Standard Scaling
# Min-Max Scaling

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standard Scaling
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical_columns])
df_scaled = pd.DataFrame(scaled_data, columns=numerical_columns)

# Min-Max Scaling
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[numerical_columns])
df_minmax = pd.DataFrame(scaled_data, columns=numerical_columns)
print('scaling techniques applyed sucessfully')

scaling techniques applyed sucessfully


# Discuss the scenarios where each scaling technique is preferred and why
# 2. Encoding Techniques:
# Applying One-Hot Encoding to categorical variables with less than 5 categories

In [8]:
import pandas as pd
for col in categorical_columns:
    if df[col].nunique() < 5:
        df_dummies = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, df_dummies], axis=1)
        df.drop(col, axis=1, inplace=True)

# Print the updated DataFrame
print(df.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race  \
0        Never-married        Adm-clerical   Not-in-family   White   
1   Married-civ-spouse     Exec-managerial         Husband   White   
2             Divorced   Handlers-cleaners   Not-in-family   White   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black   
4   Married-civ-spouse      Prof-specialty            Wife   Black   

   capital_gain  capital_loss  hours_per_week  native_country  sex_ Female  \
0          2174             0              40   United-States            0   
1             0             0         

# Using Label Encoding for categorical variables with more than 5 categories

In [9]:
# applying Label encoding for categorical variables with more than 5 categories
from sklearn.preprocessing import LabelEncoder

categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if df[col].nunique() > 5:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Print the updated DataFrame
print(df.head())

   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship    race  capital_gain  capital_loss  \
0           1             1   White          2174             0   
1           4             0   White             0             0   
2           6             1   White             0             0   
3           6             0   Black             0             0   
4          10             5   Black             0             0   

   hours_per_week  native_country  sex_ Female  sex_ Male  income_ <=50K  \
0              40              39            0          1              1   
1              13 

# Discuss the pros and cons of One-Hot Encoding and Label Encoding
# 3. Feature Engineering

In [10]:
import pandas as pd

# Calculate the correlation matrix of the numerical data
correlation_matrix = numerical_data.corr()

# Print the correlation matrix
print(correlation_matrix)

                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000 -0.076646       0.036527      0.077674      0.057775   
fnlwgt         -0.076646  1.000000      -0.043195      0.000432     -0.010252   
education_num   0.036527 -0.043195       1.000000      0.122630      0.079923   
capital_gain    0.077674  0.000432       0.122630      1.000000     -0.031615   
capital_loss    0.057775 -0.010252       0.079923     -0.031615      1.000000   
hours_per_week  0.068756 -0.018768       0.148123      0.078409      0.054256   

                hours_per_week  
age                   0.068756  
fnlwgt               -0.018768  
education_num         0.148123  
capital_gain          0.078409  
capital_loss          0.054256  
hours_per_week        1.000000  


# Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice

In [11]:
import numpy as np
# Select a skewed numerical feature
skewed_feature = df['fnlwgt']

# Apply log transformation
df['fnlwgt'] = np.log(df['fnlwgt'])

# Justify the choice
print("Log transformation was chosen for the 'Flight_weight' feature because it is skewed to the right.")
print("Log transformation compresses the range of the data and reduces the influence of outliers.")

Log transformation was chosen for the 'Flight_weight' feature because it is skewed to the right.
Log transformation compresses the range of the data and reduces the influence of outliers.


# 4. Feature Selection:

# Using the Isolation Forest algorithm to identify and remove outliers.

In [12]:
from sklearn.ensemble import IsolationForest

# Creating an instance of the Isolation Forest algorithm
iso = IsolationForest(contamination='auto')

# Fitting the model to the data
iso.fit(numerical_data)

# Predict outliers using the Isolation Forest model
outlier_preds = iso.predict(numerical_data)

# Identify the outlier samples (outliers are marked as -1 by the Isolation Forest)
outlier_index = outlier_preds == -1

# Remove the outlier samples from the data
df_clean = numerical_data[~outlier_index]

# Print the number of outlier samples
print(f"Number of outlier samples: {sum(outlier_index)}")

Number of outlier samples: 3611


# Discuss how outliers can affect model performance

In [13]:
# Calculating  the PPS matrix

import ppscore as pps
pps_matrix = df.corr()

# Print the PPS matrix
print(pps_matrix)

                     age  workclass    fnlwgt  education  education_num  \
age             1.000000   0.003787 -0.060188  -0.010508       0.036527   
workclass       0.003787   1.000000 -0.014909   0.023513       0.052085   
fnlwgt         -0.060188  -0.014909  1.000000  -0.028731      -0.037585   
education      -0.010508   0.023513 -0.028731   1.000000       0.359153   
education_num   0.036527   0.052085 -0.037585   0.359153       1.000000   
marital_status -0.266288  -0.064731  0.025822  -0.038407      -0.069304   
occupation     -0.020947   0.254892  0.003256  -0.021260       0.109697   
relationship   -0.263698  -0.090461  0.011122  -0.010876      -0.094153   
capital_gain    0.077674   0.033835  0.003564   0.030046       0.122630   
capital_loss    0.057775   0.012216 -0.005085   0.016746       0.079923   
hours_per_week  0.068756   0.138962 -0.024333   0.055510       0.148123   
native_country -0.001151  -0.007690 -0.059978   0.064288       0.050840   
sex_ Female    -0.088832 

  pps_matrix = df.corr()


In [14]:
# Compare the PPS matrix with the correlation matrix
corr_matrix = df.corr()
print(corr_matrix)

                     age  workclass    fnlwgt  education  education_num  \
age             1.000000   0.003787 -0.060188  -0.010508       0.036527   
workclass       0.003787   1.000000 -0.014909   0.023513       0.052085   
fnlwgt         -0.060188  -0.014909  1.000000  -0.028731      -0.037585   
education      -0.010508   0.023513 -0.028731   1.000000       0.359153   
education_num   0.036527   0.052085 -0.037585   0.359153       1.000000   
marital_status -0.266288  -0.064731  0.025822  -0.038407      -0.069304   
occupation     -0.020947   0.254892  0.003256  -0.021260       0.109697   
relationship   -0.263698  -0.090461  0.011122  -0.010876      -0.094153   
capital_gain    0.077674   0.033835  0.003564   0.030046       0.122630   
capital_loss    0.057775   0.012216 -0.005085   0.016746       0.079923   
hours_per_week  0.068756   0.138962 -0.024333   0.055510       0.148123   
native_country -0.001151  -0.007690 -0.059978   0.064288       0.050840   
sex_ Female    -0.088832 

  corr_matrix = df.corr()


In [15]:
# Discuss the relationships between features based on the PPS matrix
print("Based on the PPS matrix, the following features have the strongest relationships:")
for i in range(len(pps_matrix.columns)):
    for j in range(i+1, len(pps_matrix.columns)):
        if abs(pps_matrix.iloc[i, j]) > 0.7:
            print(f"* {pps_matrix.columns[i]} and {pps_matrix.columns[j]}")

Based on the PPS matrix, the following features have the strongest relationships:
* sex_ Female and sex_ Male
* income_ <=50K and income_ >50K


In [None]:

.



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































