In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer , r2_score

In [2]:
df = pd.read_excel(r"D:\Earthquake-prediction-using-Machine-learning-models-main\Dataset\Processed_data\Earthquake_data_processed.xlsx")

# read_csv argument to load date_time column ---> ''', parse_dates=['Date(YYYY/MM/DD)']'''

# df.sample(10)

Train Test Split for Magnitude Prediction


In [3]:
from sklearn.model_selection import train_test_split

# Select relevant columns
# X = df[['Latitude(deg)', 'Longitude(deg)', 'Depth(km)', 'No_of_Stations']]

# taking all the columns
X = df[['Latitude(deg)', 'Longitude(deg)', 'Depth(km)', 'No_of_Stations','Gap','Close','RMS']]
y = df['Magnitude(ergs)']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
print(X_train.shape,y_train.shape)

(14424, 7) (14424,)


# Searching for Best Result Algorithm through Grid Search CV

**Defining the Parameter Grid**

Polynomial Regressor Parameter Grid

In [6]:
poly_regressor_params_grid = {
    'poly_feature__degree' : [2,3,4,5],
    'lin_reg__fit_intercept' : [True,False]
}

Random Forest Regressor Parameter Grid

In [7]:
rf_regressor_params_grid = {
    'rfr__n_estimators' : [100,150,200],
    'rfr__max_depth' : [None,10,20],
    'rfr__min_samples_split' : [2,5]
}

XGBoost Regressor Parameter Grid

In [8]:
xgb_regressor_params_grid = {
    'xgbr__n_estimators' : [100,150,200],
    'xgbr__max_depth' : [3,6],
    'xgbr__learning_rate' : [0.001,0.01],
    'xgbr__subsample' : [0.8,1.0]
}

**Setting Up Pipelines for all Models

Polynomial Regressor Pipeline

In [9]:
poly_regressor_pipeline = Pipeline([
    ('poly_feature', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('lin_reg', LinearRegression())
])

Random Forest Regressor Pipeline

In [10]:
rf_regressor_pipeline = Pipeline([
    # ('scaler', StandardScaler()),  # Scaling might not be necessary for Random Forest but keeping it for consistency
    ('rfr', RandomForestRegressor(random_state=42))
])

XGBoost Regressor Pipeline

In [11]:
xgb_regressor_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaling might not be necessary for XGBoost but keeping it for consistency
    ('xgbr', XGBRegressor(random_state=42, objective='reg:squarederror'))
])

**Using GridSearchCV to Find Best Model

In [12]:
poly_regressor_grid_search = GridSearchCV(poly_regressor_pipeline,poly_regressor_params_grid,cv=5,scoring='r2')

In [13]:
rf_regressor__grid_search = GridSearchCV(rf_regressor_pipeline,rf_regressor_params_grid,cv=5,scoring='r2')

In [14]:
xgb_regressor_grid_search = GridSearchCV(xgb_regressor_pipeline,xgb_regressor_params_grid,cv=5,scoring='r2')

In [15]:
# Fit the Grid Searches

poly_regressor_grid_search.fit(X_train,y_train)
rf_regressor__grid_search.fit(X_train,y_train)
xgb_regressor_grid_search.fit(X_train,y_train)

In [16]:
#get best models from Grid Search
polynomial_regressor_model = poly_regressor_grid_search.best_estimator_
rf_regressor_model = rf_regressor__grid_search.best_estimator_
xgb_regressor_model = xgb_regressor_grid_search.best_estimator_

Evaluate the Best Model on the test set

In [17]:
y_pred_poly = polynomial_regressor_model.predict(X_test)
y_pred_rf = rf_regressor_model.predict(X_test)
y_pred_xgb = xgb_regressor_model.predict(X_test)

In [18]:

r2_poly = r2_score(y_test, y_pred_poly)
r2_rf = r2_score(y_test, y_pred_rf)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"Best Polynomial Regression R2: {r2_poly}")
print(f"Best Random Forest R2: {r2_rf}")
print(f"Best XGBoost R2: {r2_xgb}")

Best Polynomial Regression R2: 0.07712804415083652
Best Random Forest R2: 0.17979081846117184
Best XGBoost R2: 0.1300505838762568


In [19]:
# Optionally, print the best parameters
print(f"Best parameters for Polynomial Regression: {poly_regressor_grid_search.best_params_}")
print(f"Best parameters for Random Forest: {rf_regressor__grid_search.best_params_}")
print(f"Best parameters for XGBoost: {xgb_regressor_grid_search.best_params_}")

Best parameters for Polynomial Regression: {'lin_reg__fit_intercept': True, 'poly_feature__degree': 2}
Best parameters for Random Forest: {'rfr__max_depth': 20, 'rfr__min_samples_split': 5, 'rfr__n_estimators': 200}
Best parameters for XGBoost: {'xgbr__learning_rate': 0.01, 'xgbr__max_depth': 6, 'xgbr__n_estimators': 200, 'xgbr__subsample': 0.8}


*By Above R2 Scores, We can say that Random Forest is working good among all other. So We'll further tune It's parameters for best Performence.*

# Hyper Parameter Tuning of Random Forest Regressor through Grid Search CV

In [41]:
from sklearn.model_selection import GridSearchCV

In [73]:
rf_params_grid = {
    'n_estimators' : [100,150,200],
    'criterion' : ['squared_error','absolute_error','poisson'],
    'min_samples_split' : [3,5,6],
    'min_samples_leaf' : [3,5,7,10],
    'oob_score' : [True],
    'n_jobs' : [30]
}

In [74]:
gs = GridSearchCV(estimator=RandomForestRegressor(),param_grid=rf_params_grid,cv=5,verbose=True)

Searching with more combinations would take upto hours

In [75]:
gs.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [80]:
rf = gs.best_estimator_

In [90]:
gs.best_params_

{'criterion': 'squared_error',
 'min_samples_leaf': 5,
 'min_samples_split': 6,
 'n_estimators': 200,
 'n_jobs': 30,
 'oob_score': True}

In [85]:
y_pred = rf.predict(X_train)

# Evaluate the loaded model
from sklearn.metrics import r2_score
r2_loaded_model = r2_score(y_train, y_pred)
print(f"R2 score of the loaded model: {r2_loaded_model}")

mse_rf = mean_squared_error(y_true=y_train,y_pred=y_pred)
print(f"Mean Squared Error of RF : {mse_rf}")

R2 score of the loaded model: 0.5982352202671812
Mean Squared Error of RF : 0.0779917118978595


In [86]:
rf.predict([[ 4.059180e+01, -1.257567e+02,  5.000000e+00,  1.600000e+02,
         2.730000e+02,  1.210000e+02,  2.700000e-01]])



array([5.13553684])

In [91]:
import joblib

joblib.dump(rf,"random_forest_regressor_200_estimators_minSampLeaf_5_minSampleSplit6_oob_True.pkl")

['random_forest_regressor_200_estimators_minSampLeaf_5_minSampleSplit6_oob_True.pkl']