# Data Augmentation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, f1_score, r2_score, mean_squared_error, mean_absolute_percentage_error
import os
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
# Suppress the convergence warning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
data_directory = r"C:\Users\sb013698\Desktop\github\Machine Learning in Finance\Datasets"

## Detecting Fraudulent Transactions with MLP Models
- In this problem, we will focus on predicting whether a transaction is a fraud or not. 
- All transactions are provided in "transactions.csv." The file contains only numerical input variables resulting from a PCA transformation.
- Due to confidentiality issues, original features cannot be provided. Features {V1, V2, …, V28} are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'.
- Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset.
- The feature 'Amount' is the transaction Amount. This feature can be used for example-dependent cost-sensitive learning.
- Feature 'Class' is the response variable, and it takes value one in case of fraud and zero otherwise.
- Since the target feature is imbalanced, we will address this issue using a data augmentation technique known as the Synthetic Minority Oversampling Technique (SMOTE).

In [4]:
transaction_data = pd.read_csv(os.path.join(data_directory, "transactions.csv"))
print(transaction_data.shape)

(284807, 31)


In [5]:
transaction_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# Check the class distribution in the target variable
n_not_frauds = transaction_data["Class"].value_counts().values[0]
n_frauds = transaction_data["Class"].value_counts().values[1]
n_data = transaction_data.shape[0]

print(f"Number of observations: Not Fraud = {n_not_frauds} & Fraud = {n_frauds}")
print(f"Class ratios (%): Not Fraud = {(100*n_not_frauds/n_data):.2f} & Fraud = {(100*n_frauds/n_data):.2f}")

Number of observations: Not Fraud = 284315 & Fraud = 492
Class ratios (%): Not Fraud = 99.83 & Fraud = 0.17


**Note:** Only 0.17% of the entire data is labeled as one (fraud), while the rest of the data is labeled as zero (not fraud). Such an imbalanced distribution complicates learning the patterns that make an observation fraud.
- We will now evalute two different MLP architectures with and without applying SMOTE.

In [7]:
# Store results
result_list = list()

In [8]:
# Define MLP model with different architectures
# Model 1: MLP with 1 hidden layer of 16 units
mlp1 = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1000, random_state=42)

# Model 2: MLP with 2 hidden layers, 16 units in first layer and 8 units in second layer
mlp2 = MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=1000, random_state=42)

### Without SMOTE

In [9]:
# Define features and target variable
X = transaction_data.drop('Class', axis=1)
y = transaction_data['Class']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42,
)

# Standardize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test_scaled.shape}")

# Print class distribution in train and test sets
print("Class distributions")
print(f"Train set: {y_train.value_counts(normalize=True)}")
print(f"Test set: {y_test.value_counts(normalize=True)}")

Train shape: (256326, 30)
Test shape: (28481, 30)
Class distributions
Train set: Class
0    0.998272
1    0.001728
Name: proportion, dtype: float64
Test set: Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


In [10]:
# MLP1: Perform 5-fold cross-validation and calculate the F1 score
mlp1_results = cross_val_score(
    mlp1, X_train_scaled, y_train, 
    cv=5, scoring=make_scorer(f1_score),
)
# Store the result in result_list
result_list.append({
    "Model Name": "MLP-1",
    "IsSMOTEApplied": "No",
    "Hidden Layers": 1,
    "Hidden Units": (16,),
    "Average F1-Score": mlp1_results.mean()
})

# MLP2: Perform 5-fold cross-validation and calculate the F1 score
mlp2_results = cross_val_score(
    mlp2, X_train_scaled, y_train, 
    cv=5, scoring=make_scorer(f1_score),
)
# Store the result in result_list
result_list.append({
    "Model Name": "MLP-2",
    "IsSMOTEApplied": "No",
    "Hidden Layers": 2,
    "Hidden Units": (16, 8),
    "Average F1-Score": mlp2_results.mean()
})

### With SMOTE

In [11]:
# Define features and target variable
X = transaction_data.drop('Class', axis=1)
y = transaction_data['Class']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42,
)

# Standardize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Before SMOTE")
print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test_scaled.shape}")
# Print class distribution in train and test sets
print("Class distributions")
print(f"Train set: {y_train.value_counts(normalize=True)}")
print(f"Test set: {y_test.value_counts(normalize=True)}")

# Apply SMOTE to balance class distribution (only on training data)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("="*50)
print("After SMOTE")
print(f"Train shape: {X_train_resampled.shape}")
print(f"Test shape: {X_test_scaled.shape}")
# Print class distribution in train and test sets
print("Class distributions")
print(f"Train set: {y_train_resampled.value_counts(normalize=True)}")
print(f"Test set: {y_test.value_counts(normalize=True)}")

Before SMOTE
Train shape: (256326, 30)
Test shape: (28481, 30)
Class distributions
Train set: Class
0    0.998272
1    0.001728
Name: proportion, dtype: float64
Test set: Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64
After SMOTE
Train shape: (511766, 30)
Test shape: (28481, 30)
Class distributions
Train set: Class
0    0.5
1    0.5
Name: proportion, dtype: float64
Test set: Class
0    0.99828
1    0.00172
Name: proportion, dtype: float64


In [12]:
# MLP1: Perform 5-fold cross-validation and calculate the F1 score with the augmented data
mlp1_smote_results = cross_val_score(
    mlp1, X_train_resampled, y_train_resampled, 
    cv=5, scoring=make_scorer(f1_score),
)
# Store the result in result_list
result_list.append({
    "Model Name": "MLP-1",
    "IsSMOTEApplied": "Yes",
    "Hidden Layers": 1,
    "Hidden Units": (16,),
    "Average F1-Score": mlp1_smote_results.mean()
})

# MLP2: Perform 5-fold cross-validation and calculate the F1 score with the augmented data
mlp2_smote_results = cross_val_score(
    mlp2, X_train_resampled, y_train_resampled, 
    cv=5, scoring=make_scorer(f1_score),
)
# Store the result in result_list
result_list.append({
    "Model Name": "MLP-2",
    "IsSMOTEApplied": "Yes",
    "Hidden Layers": 2,
    "Hidden Units": (16, 8),
    "Average F1-Score": mlp2_smote_results.mean()
})

In [13]:
# Print overall results
mlp_df = pd.DataFrame(result_list)
mlp_df.sort_values(by="Average F1-Score", ascending=False)

Unnamed: 0,Model Name,IsSMOTEApplied,Hidden Layers,Hidden Units,Average F1-Score
3,MLP-2,Yes,2,"(16, 8)",0.99932
2,MLP-1,Yes,1,"(16,)",0.999096
0,MLP-1,No,1,"(16,)",0.83886
1,MLP-2,No,2,"(16, 8)",0.824226


**Discussion:** The results show that F1 scores are much lower when SMOTE is not used. When SMOTE is applied, the imbalanced target distribution is improved. Hence, the classification is performed correctly for both "Fraud" and "Not Fraud" classes thanks to increasing the sample size with SMOTE. For example, applying SMOTE increased the F1 score of the MLP-2 model by around 23%.

## MLP for House Price Prediction

Let us now focus on the Real State Price dataset. In this problem, we will implement four MLP models with distinct architectures:

    1) MLP with 1 hidden layer with 8 units in hidden layer,

    2) MLP with 1 hidden layer with 4 units in hidden layer,

    3) MLP with 2 hidden layers with 16 units in first layer and 8 units in the second layer.

    4) MLP with 2 hidden layers with 8 units in first layer and 4 units in the second layer.

- We will predict house sale price (last column) by using the following attributes:
      \
    ["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea","Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]

- During training, we will tune learning rate and number of epochs.
- We will evalute the model's performance by applying 5-fold cross-validation and using R2 score and RMSE for the test evaluation.

In [14]:
house_data = pd.read_csv(os.path.join(data_directory, "kaggle_house.csv"))
print(house_data.shape)

(1460, 81)


In [15]:
required_columns = [
    "SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
    "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF",
]
house_data = house_data[required_columns].copy()
print(house_data.shape)

(1460, 10)


In [16]:
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SalePrice    1460 non-null   int64  
 1   MSSubClass   1460 non-null   int64  
 2   MSZoning     1460 non-null   object 
 3   LotFrontage  1201 non-null   float64
 4   LotArea      1460 non-null   int64  
 5   Street       1460 non-null   object 
 6   YearBuilt    1460 non-null   int64  
 7   LotShape     1460 non-null   object 
 8   1stFlrSF     1460 non-null   int64  
 9   2ndFlrSF     1460 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 114.2+ KB


In [17]:
# Check for missing entries
house_data.isnull().sum()

SalePrice        0
MSSubClass       0
MSZoning         0
LotFrontage    259
LotArea          0
Street           0
YearBuilt        0
LotShape         0
1stFlrSF         0
2ndFlrSF         0
dtype: int64

In [18]:
# Remove missing instances in the "LotShape" feature
house_data = house_data.dropna()

# Check for missing entries once more
house_data.isnull().sum()

SalePrice      0
MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
YearBuilt      0
LotShape       0
1stFlrSF       0
2ndFlrSF       0
dtype: int64

In [33]:
# Define features and target variable
X = house_data.drop(columns="SalePrice", axis=1)
y = house_data["SalePrice"].values

# Apply one-hot-encoding to the categorical features
X = pd.get_dummies(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Standardize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (1080, 17)
Test shape: (121, 17)


In [19]:
# Define a function for hyperparameter tuning
def find_best(X_train, y_train, X_test, y_test, model_name):

    # Define the base models
    models = {
        "model1": MLPRegressor(hidden_layer_sizes=(8,), random_state=42),
        "model2": MLPRegressor(hidden_layer_sizes=(4,), random_state=42),
        "model3": MLPRegressor(hidden_layer_sizes=(16, 8), random_state=42),
        "model4": MLPRegressor(hidden_layer_sizes=(8, 4), random_state=42),
    }

    # Ensure the specified model is valid
    if model_name not in models:
        raise ValueError(f"Invalid model name '{model_name}'. Choose from 'model1', 'model2', 'model3', 'model4'.")

    # Get the specified model
    model = models[model_name]

    # Define the hyperparameter grid
    param_grid = {
        "learning_rate_init": [1e-3, 5e-3, 1e-2],
        "max_iter": [250, 500, 750, 1000, 2000],
    }

    # Define scoring metric (negative MSE)
    scoring = make_scorer(mean_squared_error, greater_is_better=False)

    # Perform grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=scoring, verbose=0)
    grid_search.fit(X_train, y_train)

    # Get the optimal hyperparameters
    best_lr = grid_search.best_params_["learning_rate_init"]
    best_epoch = grid_search.best_params_["max_iter"]

    # Re-train the model using optimal hyperparameters
    optimized_model = MLPRegressor(
        hidden_layer_sizes=model.hidden_layer_sizes,
        learning_rate_init=best_lr,
        max_iter=best_epoch,
        random_state=42,
    )
    optimized_model.fit(X_train, y_train)

    # Evaluate the best model's performance on the test set
    test_preds = optimized_model.predict(X_test)
    r2 = r2_score(y_test, test_preds)
    rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    mape = mean_absolute_percentage_error(y_test, test_preds)

    # Create a dictionary for the model's test results
    model_results = {
        "Model Name": model_name,
        "Hidden Layer Sizes": model.hidden_layer_sizes,
        "Learning Rate": best_lr,
        "Number of Epochs": best_epoch,
        "Test R2-Score": r2,
        "Test RMSE": rmse,
        "Test MAPE": mape,
    }

    return optimized_model, model_results

In [21]:
# Store model results
result_list = list()

# Model-1
best_model1, model1_results = find_best(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    model_name="model1",
)
result_list.append(model1_results)

# Model-2
best_model2, model2_results = find_best(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    model_name="model2",
)
result_list.append(model2_results)

# Model-3
best_model3, model3_results = find_best(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    model_name="model3",
)
result_list.append(model3_results)

# Model-4
best_model4, model4_results = find_best(
    X_train_scaled, y_train,
    X_test_scaled, y_test,
    model_name="model4",
)
result_list.append(model4_results)

In [22]:
regression_df = pd.DataFrame(result_list)
regression_df.sort_values(by="Test RMSE", ascending=True)

Unnamed: 0,Model Name,Hidden Layer Sizes,Learning Rate,Number of Epochs,Test R2-Score,Test RMSE,Test MAPE
3,model4,"(8, 4)",0.01,2000,0.793355,46737.749886,0.175086
2,model3,"(16, 8)",0.005,2000,0.789654,47154.469023,0.189969
0,model1,"(8,)",0.01,2000,0.562443,68009.936014,0.227756
1,model2,"(4,)",0.01,2000,0.007847,102410.52567,0.428317


In [32]:
# Define the best model with optimal hp values
final_model = MLPRegressor(
    hidden_layer_sizes=(8, 4),
    learning_rate_init=1e-2,
    max_iter=2000,
    random_state=42,
)

final_model

# END