# Step 3 - Feature Engineering

Normalize features to ensure they contribute equally to the model. This process can help improve the performance and convergence of various machine learning algorithms.

After this, perform data augmentation using SMOTE technique.

In [34]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## 3.0 - Load the Data

In [23]:
previous_years_df = pd.read_csv('../data/02_previous_years_not_null.csv')
present_year_df = pd.read_csv('../data/04_present_year_not_null.csv')


## 3.1 - Scaling the Data

Since our features do not have the same units or scale, it's generally better to use StandardScaler in this case. StandardScaler standardizes features to have zero mean and unit variance, which is often more suitable when dealing with features of varying scales or units.

Besides, we will also encode the class 'neg' as 0 and class 'pos' as 1 for further use.


In [24]:
# Function to scale the data
def scaling(data):
    # Separate the target and feature columns
    X = data.drop('class', axis=1)  # Features
    y = data['class']               # Target

    # Initialize the StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Convert the scaled features back to a DataFrame
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
    scaled_data = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

    # Map target values to numeric
    scaled_data['class'] = scaled_data['class'].map({'neg': 0, 'pos': 1})

    return scaled_data

**Previous Years**

In [25]:
previous_scaled = scaling(previous_years_df)
previous_scaled.head(5)

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000,class
0,0.119381,-0.096307,2.310224,-0.004085,-0.041322,-0.051358,-0.010762,-0.02837,-0.056929,-0.115643,...,0.524393,0.239087,0.070072,0.008264,-0.107586,-0.143103,-0.175699,-0.020257,-0.02354,0
1,-0.180697,-0.096307,-0.432859,-0.004089,-0.041322,-0.051358,-0.010762,-0.02837,-0.056929,-0.115643,...,-0.059135,-0.129021,-0.131171,-0.184975,-0.152281,-0.08865,-0.143927,-0.020257,-0.02354,0
2,-0.125811,-0.096307,-0.432859,-0.00409,-0.041322,-0.051358,-0.010762,-0.02837,-0.056929,-0.115643,...,-0.092912,-0.016553,0.016053,-0.009047,-0.107547,-0.094124,-0.164812,-0.020257,-0.02354,0
3,-0.407928,-0.096307,-0.432859,-0.004091,-0.041322,-0.002669,-0.010762,-0.02837,-0.056929,-0.115223,...,-0.388574,-0.381387,-0.351244,-0.310645,-0.199493,-0.306838,-0.175699,0.916833,3.685328,0
4,0.010572,-0.096307,-0.432857,-0.00408,-0.041322,-0.051358,-0.010762,-0.02837,-0.056929,-0.115643,...,0.036588,-0.032641,-0.039892,-0.040823,-0.018211,0.663519,-0.1499,-0.020257,-0.02354,0


**Present Year:**

In [26]:
present_scaled = scaling(present_year_df)
present_scaled.head(5)

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000,class
0,-0.134924,-0.137538,-0.433693,-0.259405,-0.053204,-0.059627,-0.018614,-0.02299,-0.058136,-0.106506,...,-0.379295,-0.369641,-0.337213,-0.295763,-0.215305,-0.26713,-0.159622,-0.018788,-0.016517,0
1,-0.134881,-0.137538,-0.433693,-0.238681,-0.053204,-0.059627,-0.018614,-0.02299,-0.058136,-0.109136,...,-0.379057,-0.368683,-0.337643,-0.295757,-0.215074,-0.26713,-0.159622,-0.018788,-0.016517,0
2,-0.003962,1.429849,-0.433693,-0.185393,-0.053204,-0.059627,-0.018614,-0.02299,-0.058136,0.086476,...,0.276886,-0.020698,-0.122468,0.806803,-0.120439,-0.266164,-0.159622,-0.018788,-0.016517,0
3,-0.016248,-0.137538,-0.433692,0.424463,-0.053204,-0.059627,-0.018614,-0.02299,-0.058136,-0.109136,...,0.040289,0.013559,0.049851,0.065875,-0.085608,0.260531,-0.098624,-0.018788,-0.016517,0
4,-0.131441,-0.137538,-0.433693,-0.16467,-0.053204,-0.059627,-0.018614,-0.02299,-0.058136,-0.109136,...,-0.372382,-0.355315,-0.29798,-0.293166,-0.215062,-0.266988,-0.159622,-0.018788,-0.016517,0


## 3.2 - Saving Original Data Scaled

In [28]:
previous_scaled.to_csv('../data/05_previous_years_scaled.csv', index=False)
present_scaled.to_csv('../data/06_present_year_scaled.csv', index=False)

## 3.3 - Data Augmentation

Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the class distribution by generating synthetic samples for the minority class.

It should be applied only to the training data to prevent data leakage and ensure that the test data remains representative of the real-world scenario. It will be explain in detaild in the 5th notebook that the training data here is the Previous Years data.

In [29]:
# Separate target and features for training data
y_train = previous_scaled['class']
X_train = previous_scaled.drop('class', axis=1)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Count the occurrences of each class in the resampled target variable
class_counts = pd.Series(y_train_resampled).value_counts()

print("Training Resampled features shape:", X_train_resampled.shape)
print("Training Resampled target shape:", y_train_resampled.shape)
print(f"\nClass Count: {class_counts}")

Training Resampled features shape: (118000, 170)
Training Resampled target shape: (118000,)

Class Count: class
0    59000
1    59000
Name: count, dtype: int64


## 3.4 - Saving Resampled (Augmented) Data

In [30]:
# Create a DataFrame from the resampled data
X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train.columns)
y_train_resampled_df = pd.DataFrame(y_train_resampled, columns=['class'])

# Combine features and target into a single DataFrame
SMOTEd_df = pd.concat([X_train_resampled_df, y_train_resampled_df], axis=1)

In [31]:
SMOTEd_df.to_csv('../data/07_previous_years_SMOTEd.csv', index=False)