In [1]:
import pandas as pd
import numpy as np
from pyautomlib.config import RegressionModel
from pyautomlib.regression import Regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from helper import get_outliers
from sklearn.metrics import r2_score

In [2]:
# Load the PGA dataset; the relative path is resolved against the kernel's working directory.
df_init = pd.read_csv("datasets/PGA.csv")
# Preview the first rows to confirm the expected columns were read.
df_init.head()

Unnamed: 0,MW,FaultType,Rrup,VS30,PGA
0,3.5,Ters,77.26,323.0,0.000208
1,3.5,Ters,190.15,317.0,8.9e-05
2,3.5,Ters,99.67,429.01,0.00023
3,3.5,Ters,101.31,480.79,5.9e-05
4,3.5,Ters,108.75,396.89,9.3e-05


In [3]:
# Numeric columns that the later preprocessing cells standardise.
# Note that this list includes "PGA", the column create_model uses as the regression target.
num_cols = ["MW", "Rrup", "VS30", "PGA"]
# Categorical column that is one-hot encoded before modelling.
cat_cols = ["FaultType"]

In [4]:
# One-hot encode the columns listed in cat_cols; drop_first=True omits one indicator
# per encoded column, so the omitted level acts as the baseline.
# NOTE(review): this rebinds df_init, so the cell is not idempotent — re-running it
# without first re-running the load cell will presumably fail because the columns
# named in cat_cols no longer exist in the frame.
df_init = pd.get_dummies(df_init, columns=cat_cols, drop_first=True)
df_init.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,3.5,77.26,323.0,0.000208,1,0
1,3.5,190.15,317.0,8.9e-05,1,0
2,3.5,99.67,429.01,0.00023,1,0
3,3.5,101.31,480.79,5.9e-05,1,0
4,3.5,108.75,396.89,9.3e-05,1,0


In [5]:
def create_model(df, target="PGA", test_size=0.2, random_state=42):
    """Train an AutoML regressor on ``df`` and print its hold-out R² score.

    The frame is split into train and test partitions, the training partition
    is handed to pyautomlib's ``Regression`` (which, judging by its log output,
    compares base models and tunes the best ones), and the model it returns is
    scored on the held-out rows with ``r2_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target : str, default "PGA"
        Name of the column to predict; every other column is used as a feature.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        The test score is printed rather than returned, so a notebook cell
        ending in a call to this function shows only the printed log.
    """
    features = df.drop(columns=target)
    labels = df[target]

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Only the training partition is given to the AutoML search; the test
    # partition is touched solely for the final score below.
    regressor = Regression(x_train, y_train, RegressionModel())
    model = regressor.create_model()

    test_score = r2_score(y_test, model.predict(x_test))
    print("test_score = ", test_score)

### Model 1 - Baseline (One-Hot Encoding Only, No Further Preprocessing)

In [7]:
# Baseline: fit and score on the one-hot-encoded frame with no rows removed.
create_model(df_init)

Base Models....
Best 3 models: ['LightGBM', 'GBM', 'RF']
Hyperparameter Optimization....
LightGBM best score : 0.7096

GBM best score : 0.6901

RF best score : 0.7051

R2: 0.711026328613363
test_score =  0.6349027620334562


### Model 2 - Drop Outliers

In [6]:
# Ask the project helper which rows of the encoded frame it considers outliers;
# the result is used below as row labels.
outlier_indexes = get_outliers(df_init)
# Build the outlier-free frame as a new object; df_init itself is left untouched.
df_2 = df_init.drop(index=outlier_indexes)
df_2.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,3.5,77.26,323.0,0.000208,1,0
1,3.5,190.15,317.0,8.9e-05,1,0
2,3.5,99.67,429.01,0.00023,1,0
3,3.5,101.31,480.79,5.9e-05,1,0
4,3.5,108.75,396.89,9.3e-05,1,0


In [7]:
# Fit and score on the frame with the rows flagged by get_outliers removed.
create_model(df_2)

Base Models....
Best 3 models: ['LightGBM', 'GBM', 'RF']
Hyperparameter Optimization....
LightGBM best score : 0.6925

GBM best score : 0.6697

RF best score : 0.6852

R2: 0.6907369889995142
test_score =  0.7234278532814377


### Model 3 - Drop outliers + Standard Scaler

In [8]:
# Model 3 input: the outlier-free frame with every column in num_cols standardised.
# NOTE(review): the scaler is fitted on all rows, and on the target as well (it is
# listed in num_cols), before create_model makes its train/test split.
df_3 = df_2.copy()
scaler = StandardScaler()
df_3[num_cols] = scaler.fit(df_3[num_cols]).transform(df_3[num_cols])
df_3.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,-1.111858,-0.384764,-0.749199,-0.377425,1,0
1,-1.111858,0.961211,-0.780206,-0.379165,1,0
2,-1.111858,-0.117572,-0.201348,-0.3771,1,0
3,-1.111858,-0.098018,0.066247,-0.379599,1,0
4,-1.111858,-0.009312,-0.367342,-0.37911,1,0


In [9]:
# Fit and score on the outlier-free, standardised frame.
create_model(df_3)

Base Models....
Best 3 models: ['LightGBM', 'GBM', 'XGBoost']
Hyperparameter Optimization....
LightGBM best score : 0.6911

GBM best score : 0.6698

XGBoost best score : 0.6761

R2: 0.6896343543373569
test_score =  0.7197445345864533


### Model 4 - Drop outliers + Standard Scaler + Drop FaultType

In [10]:
# Model 4 input: the standardised frame without the one-hot fault-type indicators.
df_4 = df_3.drop(columns=["FaultType_Ters", "FaultType_Yanal"])
df_4.head()

Unnamed: 0,MW,Rrup,VS30,PGA
0,-1.111858,-0.384764,-0.749199,-0.377425
1,-1.111858,0.961211,-0.780206,-0.379165
2,-1.111858,-0.117572,-0.201348,-0.3771
3,-1.111858,-0.098018,0.066247,-0.379599
4,-1.111858,-0.009312,-0.367342,-0.37911


In [11]:
# Fit and score on the standardised frame with the fault-type indicators removed.
create_model(df_4)

Base Models....
Best 3 models: ['LightGBM', 'GBM', 'RF']
Hyperparameter Optimization....
LightGBM best score : 0.6864

GBM best score : 0.6695

RF best score : 0.6768

R2: 0.686871761054754
test_score =  0.7243765373368254


### Model 5 - Log Scale PGA + Standard Scaler + Drop Outliers

In [12]:
# Model 5 input: start from the outlier-free frame, take log10 of the target,
# then standardise every column in num_cols.
# NOTE(review): the scaler is fitted on all rows (target included) before
# create_model makes its train/test split.
df_5 = df_2.copy()
log_cols = ["PGA"]
df_5[log_cols] = df_5[log_cols].apply(np.log10)
scaler = StandardScaler()
df_5[num_cols] = scaler.fit(df_5[num_cols]).transform(df_5[num_cols])
df_5.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,-1.111858,-0.384764,-0.749199,-0.926539,1,0
1,-1.111858,0.961211,-0.780206,-1.253588,1,0
2,-1.111858,-0.117572,-0.201348,-0.887414,1,0
3,-1.111858,-0.098018,0.066247,-1.409739,1,0
4,-1.111858,-0.009312,-0.367342,-1.237756,1,0


In [15]:
# Fit and score against the log10-transformed target. R² is therefore measured in
# log space and is not directly comparable with the scores of the models above,
# whose target was not log-transformed.
create_model(df_5)

Base Models....
Best 3 models: ['XGBoost', 'LightGBM', 'RF']
Hyperparameter Optimization....
XGBoost best score : 0.9292

LightGBM best score : 0.9312

RF best score : 0.9139

R2: 0.9292912883488091
test_score =  0.9327402103891709


### Model 6 - Log Scale [PGA,Rrup] + Standard Scaler + Drop Outliers

In [14]:
# Model 6 input: as Model 5, but Rrup is log10-transformed together with the target
# before every column in num_cols is standardised.
# NOTE(review): the scaler is fitted on all rows (target included) before
# create_model makes its train/test split.
df_6 = df_2.copy()
log_cols = ["PGA", "Rrup"]
df_6[log_cols] = df_6[log_cols].apply(np.log10)
scaler = StandardScaler()
df_6[num_cols] = scaler.fit(df_6[num_cols]).transform(df_6[num_cols])
df_6.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,-1.111858,-0.027347,-0.749199,-0.926539,1,0
1,-1.111858,1.003065,-0.780206,-1.253588,1,0
2,-1.111858,0.26404,-0.201348,-0.887414,1,0
3,-1.111858,0.282712,0.066247,-1.409739,1,0
4,-1.111858,0.36379,-0.367342,-1.237756,1,0


In [16]:
# Fit and score with both the target and Rrup log10-transformed; R² is measured in log space.
create_model(df_6)

Base Models....
Best 3 models: ['XGBoost', 'LightGBM', 'RF']
Hyperparameter Optimization....
XGBoost best score : 0.9292

LightGBM best score : 0.9313

RF best score : 0.914

R2: 0.9293433318557845
test_score =  0.9326510734326602


### Model 7 - Log Scale [PGA,Rrup,VS30] + Standard Scaler + Drop Outliers

In [17]:
# Model 7 input: as Model 6, with VS30 added to the log10-transformed columns
# before every column in num_cols is standardised.
# NOTE(review): the scaler is fitted on all rows (target included) before
# create_model makes its train/test split.
df_7 = df_2.copy()
log_cols = ["PGA", "Rrup", "VS30"]
df_7[log_cols] = df_7[log_cols].apply(np.log10)
scaler = StandardScaler()
df_7[num_cols] = scaler.fit(df_7[num_cols]).transform(df_7[num_cols])
df_7.head()

Unnamed: 0,MW,Rrup,VS30,PGA,FaultType_Ters,FaultType_Yanal
0,-1.111858,-0.027347,-0.754776,-0.926539,1,0
1,-1.111858,1.003065,-0.802942,-1.253588,1,0
2,-1.111858,0.26404,-0.025689,-0.887414,1,0
3,-1.111858,0.282712,0.267023,-1.409739,1,0
4,-1.111858,0.36379,-0.225593,-1.237756,1,0


In [18]:
# Fit and score with the target, Rrup and VS30 log10-transformed; R² is measured in log space.
create_model(df_7)

Base Models....
Best 3 models: ['XGBoost', 'LightGBM', 'RF']
Hyperparameter Optimization....
XGBoost best score : 0.9292

LightGBM best score : 0.9313

RF best score : 0.9139

R2: 0.9292463389605435
test_score =  0.9328259325530598
