<h1>Regression</h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv('option_train.csv')
test_data = pd.read_csv('option_test_nolabel.csv')

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,Value,S,K,tau,r,BS
0,1,348.5,1394.46,1050,0.128767,0.0116,Under
1,2,149.375,1432.25,1400,0.679452,0.0113,Under
2,3,294.5,1478.9,1225,0.443836,0.0112,Under
3,4,3.375,1369.89,1500,0.117808,0.0119,Over
4,5,84.0,1366.42,1350,0.29863,0.0119,Under


In [None]:
data.isnull().sum()

Unnamed: 0    0
Value         0
S             0
K             0
tau           0
r             0
BS            0
dtype: int64

In [None]:
X = data[['S','K','tau','r']]

In [None]:
y = data['Value']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

<h2>Linear Regression</h2>

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

X = data[['S', 'K', 'tau', 'r']]
y = data['Value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred_li = linear_model.predict(X_test)

r2_li = r2_score(y_test, y_pred_li)
mse_li = mean_squared_error(y_test, y_pred_li)
rmse_li = np.sqrt(mse_li)

print(f"R-squared on Linear Regression: {r2_li:.3f}")
print(f"Root Mean Squared Error for Linear Regression: {rmse_li:.2f}")


R-squared on Linear Regression: 0.928
Root Mean Squared Error for Linear Regression: 33.01


<h3>CV - Linear Regression</h3>

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

model = LinearRegression()

r2_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print("R-squared scores:", r2_scores)

def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)
rmse_scores = cross_val_score(model, X, y, cv=5, scoring=rmse_scorer)
rmse_scores = -rmse_scores  # Negate to get positive RMSE scores
print("Root Mean Squared Error scores:", rmse_scores)

average_r2 = np.mean(r2_scores)
average_rmse = np.mean(rmse_scores)
print("Average R-squared:", average_r2)
print("Average Root Mean Squared Error:", average_rmse)


R-squared scores: [0.92664342 0.9321327  0.9353855  0.89892416 0.92669229]
Root Mean Squared Error scores: [33.78343926 32.60504772 32.39294422 37.85971882 34.86862054]
Average R-squared: 0.9239556126998967
Average Root Mean Squared Error: 34.301954114529266


<h2>Decision Tree - Regression</h2>

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

tree_model = DecisionTreeRegressor(random_state=101)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

r2 = r2_score(y_test, y_pred_tree)
mse = mean_squared_error(y_test, y_pred_tree)
rmse = np.sqrt(mse)

print(f'R-squared on DT: {r2:.3f}')
print(f'Root Mean Squared Error DT: {rmse:.2f}')


R-squared on DT: 0.991
Root Mean Squared Error DT: 11.40


<h3>CV for Decision Tree</h3>

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

param_grid = {'max_depth': range(1, 20)}
tree_model = DecisionTreeRegressor(random_state=101)
grid_search = GridSearchCV(tree_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best max_depth: {best_params["max_depth"]}')
print(f'Best cross-validation R²: {best_score}')

final_model = DecisionTreeRegressor(max_depth=best_params['max_depth'], random_state=101)
final_model.fit(X_train, y_train)
y_pred_final = final_model.predict(X_test)

final_r2_score = r2_score(y_test, y_pred_final)
final_mse = mean_squared_error(y_test, y_pred_final)
final_rmse = np.sqrt(final_mse)

print(f'Final R-squared on Test Set: {final_r2_score}')
print(f'Final Root Mean Squared Error (RMSE) on Test Set: {final_rmse}')

Best max_depth: 15
Best cross-validation R²: 0.9902609322205503
Final R-squared on Test Set: 0.9913386134247938
Final Root Mean Squared Error (RMSE) on Test Set: 11.427669180155151


<h2>Random Forest - Regression</h2>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

rf_model = RandomForestRegressor(n_estimators=100, random_state=101)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

r2_rf = r2_score(y_test, y_pred_rf)
print(f'R² Score for Random Forest Regression: {r2_rf:.3f}')

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(f'Root Mean Squared Error for Random Forest Regression: {rmse_rf:.2f}')

R² Score for Random Forest Regression: 0.996
Root Mean Squared Error for Random Forest Regression: 7.69


<h3>CV for Random Forest</h3>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=101)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_mse = grid_search.best_score_
print(f'Best parameters found: {best_params}')
print(f'Best cross-validation MSE: {best_mse}')

final_model = RandomForestRegressor(**best_params, random_state=101)
final_model.fit(X_train, y_train)
y_pred_test = final_model.predict(X_test)

final_r2 = r2_score(y_test, y_pred_test)
final_mse = mean_squared_error(y_test, y_pred_test)
final_rmse = np.sqrt(final_mse)

print(f'Final R-squared on Test Set: {final_r2}')
print(f'Final RMSE on Test Set: {final_rmse}')

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation MSE: -69.89546108517042
Final R-squared on Test Set: 0.996108406811331
Final RMSE on Test Set: 7.659980526431714


<h2>Gradient Boosting - Regression</h2>

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

X = data[['S', 'K', 'tau', 'r']]
y = data['Value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

gb_reg = GradientBoostingRegressor(random_state=101)
gb_reg.fit(X_train, y_train)

y_pred_gb = gb_reg.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f'R-squared for Gradient Boosting Regressor: {r2_gb:.3f}')
print(f'Root Mean Squared Error for Gradient Boosting Regressor: {rmse_gb:.2f}')

R-squared for Gradient Boosting Regressor: 0.994
Root Mean Squared Error for Gradient Boosting Regressor: 9.22


<h3> CV for Graudent Boosting</h3>

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X = data[['S', 'K', 'tau', 'r']]
y = data['Value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'loss': ['ls', 'lad', 'huber']
}
gb_reg = GradientBoostingRegressor(random_state=101)

grid_search = GridSearchCV(gb_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
try:
    grid_search.fit(X_train, y_train)
except ValueError as e:
    print("Error during grid search:", e)
    raise

best_model = grid_search.best_estimator_
y_pred_gb = best_model.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Mean Squared Error for Gradient Boosting Regressor: {mse_gb}')
print(f'Best R-Squared for Gradient Boosting Regressor: {r2_gb}')
print(f'Root Mean Squared Error for Gradient Boosting Regressor: {rmse_gb}')

270 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/minjoosung/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/minjoosung/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/minjoosung/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/minjoosung/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, i

Best Parameters: {'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 4, 'n_estimators': 300}
Best Mean Squared Error for Gradient Boosting Regressor: 43.86658052942052
Best R-Squared for Gradient Boosting Regressor: 0.9970905835819603
Root Mean Squared Error for Gradient Boosting Regressor: 6.623185074374754


<h2>XGBoosting - Regression</h2>

In [None]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                         learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2_best_xg = r2_score(y_test, y_pred)
print(f'R-squared: {r2_best_xg:.3f}')

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Men Squared Error: {rmse:.2f}')

R-squared: 0.953
Root Mean Squared Error: 26.72


<h3> CV for XGBoosting </h3>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
lowest_cv_rmse = np.sqrt(-grid_search.best_score_)
print(f"Best parameters found: {best_params}")
print(f"Lowest RMSE found in CV: {lowest_cv_rmse:.2f}")

best_model = grid_search.best_estimator_
y_pred_best_xg = best_model.predict(X_test)

mse_test = mean_squared_error(y_test, y_pred_best_xg)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_best_xg)

print(f"Best R-squared: {r2_test:.3f}")
print(f"Mean Squared Error: {mse_test:.2f}")
print(f"Root Mean Squared Error: {rmse_test:.2f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits


Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: { "loss" } are not used.

Parameters: 

Best parameters found: {'learning_rate': 0.2, 'loss': 'ls', 'max_depth': 3, 'n_estimators': 300}
Lowest RMSE found in CV: 17.31
Best R-squared: 0.983
Mean Squared Error: 259.08
Root Mean Squared Error: 16.10


Parameters: { "loss" } are not used.



<h2>KNN Regression</h2>

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)

y_pred_knn = knn_regressor.predict(X_test)

mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("Root Mean Squared Error (RMSE) for KNN Regression: {:.2f}".format(rmse_knn))
print("R-squared for KNN Regression: {:.3f}".format(r2_knn))

Root Mean Squared Error (RMSE) for KNN Regression: 28.38
R-squared for KNN Regression: 0.947
