# Step 3 - Feature Engineering

Normalize features to ensure they contribute equally to the model. This process can help improve the performance and convergence of various machine learning algorithms.

In [29]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## 3.0 - Load the Data

In [2]:
# Read both input CSVs in one pass; the order of the paths matches the
# order of the target names on the left-hand side.
previous_years_df, present_year_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/02_previous_years_not_null.csv',
        '../data/04_present_year_not_null.csv',
    )
)


## 3.1 - Scaling the Data

`MinMaxScaler(feature_range=(0, 1))` linearly rescales each column so that its values fall within the range [0, 1]. We choose this scaler because a linear rescaling preserves the shape of each feature's distribution (no distortion).

In addition, we encode the target column: class 'neg' becomes 0 and class 'pos' becomes 1, for later use.


In [25]:
# Function to scale the data
def scaling(data, scaler=None):
    """Min-max scale the feature columns and encode the 'class' target as 0/1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'class' column whose values are 'neg' or 'pos';
        every other column is treated as a feature and passed to the scaler.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, it is applied with
        ``transform`` and is NOT refitted, so several datasets can share the
        feature ranges learned from one of them. When omitted (the default),
        a new ``MinMaxScaler(feature_range=(0, 1))`` is fitted on ``data``
        itself.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns (original names and order) followed by
        'class' encoded as 0 for 'neg' and 1 for 'pos'. The result carries a
        fresh 0..n-1 index; the index of ``data`` is not kept.

    Raises
    ------
    ValueError
        If 'class' contains any value other than 'neg' or 'pos'.
    """
    # Separate the target and feature columns
    X = data.drop('class', axis=1)  # Features
    y = data['class']               # Target

    if scaler is None:
        # Default path: learn the min/max of every feature from `data`.
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_scaled = scaler.fit_transform(X)
    else:
        # Reuse the statistics of the supplied scaler; do not refit.
        X_scaled = scaler.transform(X)

    # Convert the scaled features back to a DataFrame. The scaler output has
    # no index, so the target is re-indexed to 0..n-1 to line up row by row.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    scaled_data = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

    # Series.map yields NaN for labels missing from the mapping; fail loudly
    # instead of returning a target column that silently contains NaN.
    encoded = scaled_data['class'].map({'neg': 0, 'pos': 1})
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = scaled_data.loc[unmapped, 'class'].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'class' column (expected 'neg' or 'pos'): {unexpected}"
        )
    scaled_data['class'] = encoded

    return scaled_data

**Previous Years**

In [26]:
# Scale the previous-years frame, then preview its first five rows
# (head() returns five rows by default).
previous_scaled = scaling(data=previous_years_df)
previous_scaled.head()

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000,class
0,0.027925,0.0,0.9999998,3.261769e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.013067,0.007421784,0.008179508,0.01073013,0.001321,0.0038,0.0,0.0,0.0,0
1,0.012036,0.0,0.0,1.467796e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004716,0.003019031,0.004272917,0.004228511,0.000679,0.005064,0.000394,0.0,0.0,0
2,0.014942,0.0,1.070067e-07,1.164918e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004232,0.004364196,0.007130884,0.01014768,0.001321,0.004937,0.000135,0.0,0.0,0
3,4e-06,0.0,3.285295e-08,7.688457e-09,0.0,0.000498,0.0,0.0,0.0,5e-06,...,1e-06,5.970003e-07,7.660803e-07,3.163775e-07,0.0,0.0,0.0,0.008299,0.027923,0
4,0.022164,0.0,6.420405e-07,5.335323e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006086,0.004171776,0.006044861,0.009078578,0.002605,0.022523,0.00032,0.0,0.0,0


**Present Year**

In [27]:
# Scale the present-year frame, then preview its first five rows
# (head() returns five rows by default).
present_scaled = scaling(data=present_year_df)
present_scaled.head()

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000,class
0,1e-06,0.0,9.386558e-09,0.000138,0.0,0.0,0.0,0.0,0.0,3.7e-05,...,6e-06,8e-06,1.2e-05,2e-06,2e-06,0.0,0.0,0.0,0.0,0
1,2e-06,0.0,3.19143e-08,0.00046,0.0,0.0,0.0,0.0,0.0,0.0,...,1.3e-05,3.3e-05,2e-06,2e-06,1e-05,0.0,0.0,0.0,0.0,0
2,0.001537,0.034483,9.949751e-08,0.001287,0.0,0.0,0.0,0.0,0.0,0.002731,...,0.017227,0.009057,0.00481,0.031192,0.003237,3e-05,0.0,0.0,0.0,0
3,0.001393,0.0,4.740212e-07,0.010754,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011018,0.009945,0.00866,0.010232,0.004425,0.016389,0.000707,0.0,0.0,0
4,4.2e-05,0.0,7.321515e-08,0.001608,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000188,0.00038,0.000888,7.5e-05,1e-05,4e-06,0.0,0.0,0.0,0


## 3.2 - Saving

In [28]:
# Write both scaled frames to CSV; index=False keeps the row index out of
# the files.
previous_scaled.to_csv(
    path_or_buf='../data/05_previous_years_scaled.csv',
    index=False,
)
present_scaled.to_csv(
    path_or_buf='../data/06_present_year_scaled.csv',
    index=False,
)