In [None]:
# Fetch the project repository. The data-loading cell further down reads its CSV
# files from /content/Semester20221_IntroToDataScience_CapstoneProject/data, so this
# presumably has to run with /content as the working directory (e.g. Colab) — confirm.
!git clone https://github.com/tranvuongquocdat/Semester20221_IntroToDataScience_CapstoneProject.git

In [None]:
!pip install optuna

## Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor

## Load the training and testing set

In [None]:
# Read the pre-split training and testing tables from the cloned repository.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Semester20221_IntroToDataScience_CapstoneProject/data/training_set.csv",
        "/content/Semester20221_IntroToDataScience_CapstoneProject/data/testing_set.csv",
    )
)

In [None]:
# Remove the "Unnamed: 0" column (presumably the row index that was saved into the
# CSV files — confirm). Rebinding a new frame instead of dropping in place, together
# with errors="ignore", keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df_train = df_train.drop(columns=["Unnamed: 0"], errors="ignore")
df_test = df_test.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Separate the feature columns from the "price" target for both (already split) sets.
# The target is removed by name rather than by position so that it can never leak
# into the feature matrix if "price" is not the last column of the CSV.
X_train = df_train.drop(columns=["price"])
y_train = df_train["price"]
X_test = df_test.drop(columns=["price"])
y_test = df_test["price"]

## SGDRegressor

In [None]:
# Linear model trained with stochastic gradient descent and a fixed step size.
# random_state pins the shuffling of the training samples so the coefficients and
# metrics printed below are reproducible across runs (same seed value the Optuna
# objectives in this notebook use).
# NOTE(review): a constant eta0=0.1 can diverge when the features are not
# standardized — confirm that the CSV features were scaled upstream.
reg = SGDRegressor(
    max_iter=1000,
    tol=1e-3,
    learning_rate='constant',
    eta0=0.1,
    random_state=1,
)

# Fit the model to the training data
reg.fit(X_train, y_train)

# Print the learned parameters
print("Intercept: ", reg.intercept_)
print("Coefficients: ", reg.coef_)

# Generate predictions for the held-out test set
y_pred = reg.predict(X_test)

# Calculate the evaluation metrics on the test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Absolute Error: ", mae)
print("Mean Squared Error: ", mse)
print("R-Squared: ", r2)

## Gradient Boosting

In [None]:
# Hyperparameters of the boosted ensemble; kept in `params` because the plotting
# cell below reads params["n_estimators"].
params = dict(
    n_estimators=500,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss="squared_error",
)

# fit() returns the estimator itself, so construction and training can be chained.
reg = GradientBoostingRegressor(**params).fit(X_train, y_train)

# Score the fully boosted model on the held-out test set.
gb_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, gb_test_pred)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

In [None]:
# One x position per boosting stage (1-based).
n_stages = params["n_estimators"]
boosting_iterations = np.arange(n_stages) + 1

# Test-set MSE after each boosting stage, using the staged (partial-ensemble) predictions.
test_score = np.zeros((n_stages,), dtype=np.float64)
for stage, staged_pred in enumerate(reg.staged_predict(X_test)):
    test_score[stage] = mean_squared_error(y_test, staged_pred)

# Training loss recorded by the estimator vs. the test MSE computed above.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title("Deviance")
ax.plot(boosting_iterations, reg.train_score_, "b-", label="Training Set Deviance")
ax.plot(boosting_iterations, test_score, "r-", label="Test Set Deviance")
ax.legend(loc="upper right")
ax.set_xlabel("Boosting Iterations")
ax.set_ylabel("Deviance")
fig.tight_layout()
plt.show()

## Linear Regression

In [None]:
print("Linear Regression")

# Ordinary least squares fitted on the training set; fit() returns the estimator.
regressor = LinearRegression().fit(X_train, y_train)

# Predict the held-out test set and report MSE and R².
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# Use the magnitude of each coefficient as that feature's importance.
importances = np.abs(regressor.coef_)

# (feature name, importance) pairs, largest importance first.
feature_importances = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Unzip the sorted pairs into parallel sequences for the bar chart.
features, importances = zip(*feature_importances)
plt.bar(features, importances, align='center')
plt.title("Feature Importances")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.xticks(rotation=90)
plt.show()


## Decision Tree

In [None]:
print("Decision Tree Regression")

# Base estimator. The seed makes the grid search reproducible: without it, ties
# between equally good splits may be broken differently on every run.
dtr = DecisionTreeRegressor(random_state=1)

# Hyperparameter grid to search exhaustively
param_grid = {'max_depth': [1, 2, 3, 4, 5],
              'min_samples_split': [2, 4, 6, 8],
              'min_samples_leaf': [1, 2, 3, 4]
              }

# 5-fold cross-validated grid search (scored with the estimator's default R²)
grid_search = GridSearchCV(dtr, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameter: ", grid_search.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best configuration
# on the whole training set, so best_estimator_ is used as-is without a second fit.
dtr = grid_search.best_estimator_

# Make predictions on the testing set
y_pred = dtr.predict(X_test)

# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

# Evaluate the performance of the model using R-squared
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# Impurity-based feature importances of the fitted tree
importances = dtr.feature_importances_

# (feature name, importance) pairs, largest importance first
feature_importances = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Plot the feature importances
features, importances = zip(*feature_importances)
plt.bar(features, importances, align='center')
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature Importances")
plt.xticks(rotation=90)
plt.show()

In [None]:
def objective(trial):
    """Score one Optuna trial for a decision-tree regressor.

    Samples `max_depth`, `min_samples_split` and `min_samples_leaf` from the
    trial (the latter two as fractions of the training set), builds a
    DecisionTreeRegressor seeded with random_state=1 and returns its mean
    R² over 5-fold cross-validation on (X_train, y_train).
    """
    depth = trial.suggest_int('max_depth', 1, 5)
    split_fraction = trial.suggest_float("min_samples_split", 0.01, 1)
    leaf_fraction = trial.suggest_float("min_samples_leaf", 0.01, 0.5)

    candidate = DecisionTreeRegressor(
        max_depth=depth,
        min_samples_split=split_fraction,
        min_samples_leaf=leaf_fraction,
        random_state=1,
    )

    # Mean of the per-fold R² scores is the value Optuna maximizes.
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='r2')
    return fold_scores.mean()

# Maximize the cross-validated R² returned by `objective`. The sampler is seeded
# so that the sequence of trials (and hence the best trial) is reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=280)
trial = study.best_trial

# trial.params only holds the sampled hyperparameters; the fixed random_state=1
# used inside the objective must be passed again so the final model is the same
# kind of model that was scored during the search.
tuned_model = DecisionTreeRegressor(**trial.params, random_state=1)
tuned_model.fit(X_train, y_train)

# Compare test and train R² to gauge over-/under-fitting.
y_preds = tuned_model.predict(X_test)
r2_train = r2_score(y_train, tuned_model.predict(X_train))
r2_test = r2_score(y_test, y_preds)
print("R2 score for DecisionTreeRegressor model (test):", r2_test)
print("R2 score for DecisionTreeRegressor model (train):", r2_train)

## Random Forest

In [None]:
# Random Forest Regression
print("Random Forest Regressor")

# Base estimator. The seed fixes the bootstrap samples and feature sub-sampling so
# that the cross-validation scores (and the selected configuration) are reproducible.
regressor = RandomForestRegressor(random_state=1)

# Hyperparameter grid to search exhaustively
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
                'max_depth': [5, 10, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_leaf_nodes': [None, 10, 20],
                }

# 5-fold cross-validated grid search on R², parallelised over all cores
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    scoring = 'r2',
    n_jobs = -1,
    cv = 5,
    verbose=True
)
grid_search.fit(X_train, y_train)
print("Best parameter: ", grid_search.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best configuration
# on the whole training set, so best_estimator_ is used as-is without a second fit.
regressor = grid_search.best_estimator_

# Make predictions on the testing data
y_pred = regressor.predict(X_test)

# Evaluate the performance of the model using mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Evaluate the performance of the model using R-squared
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# Impurity-based feature importances of the fitted forest
importances = regressor.feature_importances_

# (feature name, importance) pairs, largest importance first
feature_importances = sorted(
    zip(X_train.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Plot the feature importances
features, importances = zip(*feature_importances)
plt.bar(features, importances, align='center')
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature Importances")
plt.xticks(rotation=90)
plt.show()

### Bayesian Optimization

In [None]:
def objective(trial):
    """Score one Optuna trial for a random-forest regressor.

    Samples `n_estimators`, `min_samples_split` and `min_samples_leaf`
    (the latter two as log-uniform fractions of the training set), builds a
    RandomForestRegressor seeded with random_state=1 and returns its mean
    R² over 5-fold cross-validation on (X_train, y_train).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.01, 1, log=True),
        # Sampled under its own name: reusing 'min_samples_split' here would tie
        # the two values together and leave min_samples_leaf out of trial.params.
        # The upper bound is 0.5 because a leaf cannot be required to hold more
        # than half of the samples if any split is to remain possible.
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.01, 0.5, log=True),
        'random_state': 1
    }

    # Mean of the per-fold R² scores is the value Optuna maximizes.
    model = RandomForestRegressor(**params)
    r2 = cross_val_score(model, X_train, y_train, cv=5, scoring = 'r2').mean()
    return r2

# Maximize the cross-validated R² returned by `objective`. The sampler is seeded
# so that the sequence of trials (and hence the best trial) is reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=280)

trial = study.best_trial

# trial.params only holds the sampled hyperparameters; the fixed random_state=1
# used inside the objective must be passed again, otherwise the final forest is
# grown from different bootstrap samples than the one scored during the search.
tuned_model = RandomForestRegressor(**trial.params, random_state=1)
tuned_model.fit(X_train, y_train)

# Compare test and train R² to gauge over-/under-fitting.
y_preds = tuned_model.predict(X_test)
r2_train = r2_score(y_train, tuned_model.predict(X_train))
r2_test = r2_score(y_test, y_preds)
print("R2 score for RandomForestRegressor model (test):", r2_test)
print("R2 score for RandomForestRegressor model (train):", r2_train)

## XGBoost

In [None]:
# Baseline: XGBoost regressor with the library's default hyperparameters.
# `xgb` is the xgboost package, imported in the notebook's import cell.
default_model = xgb.XGBRegressor()
default_model.fit(X_train, y_train)
y_preds = default_model.predict(X_test)
r2_train = r2_score(y_train, default_model.predict(X_train))
r2_test = r2_score(y_test, y_preds)
print("R2 score for XGBoost model (test):", r2_test)
print("R2 score for XGBoost model (train):", r2_train)

# Hyperparameter grid to search exhaustively. Named `param_grid`, like the other
# grid-search cells, so the `params` dict of the Gradient Boosting section is not
# overwritten by this cell.
param_grid = {
        'gamma': [0, 0.5, 1, 5],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 6, 10],
        'n_estimators':[50,100,200],
        }

model = xgb.XGBRegressor()

# 5-fold cross-validated grid search on R², parallelised over all cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring = 'r2',
    n_jobs = -1,
    cv = 5,
    verbose=True
)

grid_search.fit(X_train, y_train)

print('Best Score: %s' % grid_search.best_score_)
print('Best Hyperparameters: %s' % grid_search.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best configuration
# on the whole training set, so best_estimator_ is used instead of fitting again.
tuned_model = grid_search.best_estimator_
y_preds = tuned_model.predict(X_test)
r2_train = r2_score(y_train, tuned_model.predict(X_train))
r2_test = r2_score(y_test, y_preds)
print("R2 score for XGBoost model (test):", r2_test)
print("R2 score for XGBoost model (train):", r2_train)

In [None]:
def objective(trial):
    """Score one Optuna trial for an XGBoost regressor.

    Samples the tree, learning-rate, sub-sampling and regularisation
    hyperparameters listed below, builds an XGBRegressor seeded with
    random_state=1 and returns its mean R² over 5-fold cross-validation on
    (X_train, y_train).
    """
    params = {
        'max_depth': trial.suggest_categorical('max_depth', [5, 6, 8, 11, 15, 18]),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6, 0.7, 0.8, 0.9, 1.0]),
        # L1 / L2 regularisation are sampled once each, under the scikit-learn API
        # names. 'alpha' and 'lambda' are aliases of the same two settings, so
        # sampling them as well would hand the model two conflicting values; the
        # single range used here spans both of the previously separate ranges.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 1
    }

    # Mean of the per-fold R² scores is the value Optuna maximizes.
    model = xgb.XGBRegressor(**params)
    r2 = cross_val_score(model, X_train, y_train, cv=5, scoring = 'r2').mean()
    return r2

# Maximize the cross-validated R² returned by `objective`. The sampler is seeded
# so that the sequence of trials (and hence the best trial) is reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=280)

# Summary of the search
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
trial = study.best_trial
print('Best Value: {}'.format(trial.value))
print('Best Params: ')

for key, value in trial.params.items():
    print('{}: {}'.format(key, value))

# trial.params only holds the sampled hyperparameters; the fixed random_state=1
# used inside the objective is passed again so the final model matches the one
# that was scored during the search.
tuned_model = xgb.XGBRegressor(**trial.params, random_state=1)
tuned_model.fit(X_train, y_train)

# Compare test and train R² to gauge over-/under-fitting.
y_preds = tuned_model.predict(X_test)
r2_train = r2_score(y_train, tuned_model.predict(X_train))
r2_test = r2_score(y_test, y_preds)
print("R2 score for XGBoost model (test):", r2_test)
print("R2 score for XGBoost model (train):", r2_train)

## ANN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
def create_model(learning_rate = 1e-4, activation="relu"):
  """Build and compile the fully connected regression network.

  The network is five hidden Dense layers of 20 units each, all using
  `activation`, followed by a single linear output unit. It is compiled
  with the Adam optimizer at `learning_rate` and mean-squared-error loss.

  Returns the compiled (untrained) Sequential model.
  """
  # Five identical hidden layers followed by the scalar output layer.
  hidden_stack = [Dense(20, activation=activation) for _ in range(5)]
  model = Sequential(hidden_stack + [Dense(1)])

  model.compile(optimizer=Adam(learning_rate), loss="mse")
  return model

### Grid Search

In [None]:
# Candidate values for each tunable setting of the network / training loop.
batches = [10, 100, 32, 64]
activationFunction = ['relu','selu']
epochs = [100, 1000, 2000]
learning_rate = [1e-2, 1e-3, 1e-4]
param_grid = {
    "batch_size": batches,
    "activation": activationFunction,
    "epochs": epochs,
    "learning_rate": learning_rate,
}

# scikit-learn compatible wrapper around create_model, evaluated with shuffled 5-fold CV.
model = KerasRegressor(build_fn=create_model, verbose=1)
cv = KFold(n_splits=5, shuffle=True, random_state=1)

search = GridSearchCV(
    model,
    param_grid=param_grid,
    return_train_score=True,
    cv=cv,
    n_jobs = -1,
)
# fit() returns the fitted search object itself, so grid_result and search are the same object.
grid_result = search.fit(X_train.to_numpy(), y_train.to_numpy())
# Result noted from a previous run: batch = 10, activation = relu, epoch = 1000, learning_rate = 1e-3

In [None]:
print("===========RESULTS============")
# best_score_ is the mean cross-validated score of the best candidate. This is a
# regression task, so the value is a score of the regressor, not an accuracy.
print(f'Best score {grid_result.best_score_:.4} using {grid_result.best_params_}')

# Per-candidate mean / standard deviation of the cross-validated test score.
# The columns are read through a local alias so that this cell does not overwrite
# the notebook-level `params` dict used by the Gradient Boosting cells.
cv_results = grid_result.cv_results_
for mean_score, std_score, candidate in zip(
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
):
    print(f'mean={mean_score:.4}, std={std_score:.4} using {candidate}')

### Bayesian Optimization

In [None]:
def ann_objective(trial):
    """Score one Optuna trial for the Keras network built by create_model.

    Samples batch size, activation, number of epochs and learning rate (the
    same settings the grid search above explores), wraps create_model in a
    KerasRegressor and returns its mean R² over shuffled 5-fold
    cross-validation on (X_train, y_train).
    """
    params = {
        'batch_size': trial.suggest_categorical('batch_size', [10, 32, 64, 100]),
        'activation': trial.suggest_categorical('activation', ['relu', 'selu']),
        'epochs': trial.suggest_int('epochs', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
    }
    candidate = KerasRegressor(build_fn=create_model, verbose=0, **params)
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        candidate, X_train.to_numpy(), y_train.to_numpy(), cv=folds, scoring='r2'
    )
    return fold_scores.mean()

# A dedicated study for the network: reusing the `study` left over from the XGBoost
# section would feed XGBoost hyperparameters into KerasRegressor.
# n_trials is kept small because every trial trains five networks.
study = optuna.create_study(direction='maximize')
study.optimize(ann_objective, n_trials=30)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
trial = study.best_trial
print('Best Value: {}'.format(trial.value))
print('Best Params: ')

for key, value in trial.params.items():
    print('{}: {}'.format(key, value))

# Retrain with the best settings. validation_split holds out the last 20% of the
# training rows so that the returned history also contains 'val_loss', which the
# loss-curve cell below plots.
tuned_model = KerasRegressor(build_fn=create_model, verbose=0, **trial.params)
hist = tuned_model.fit(X_train.to_numpy(), y_train.to_numpy(), validation_split=0.2)

# Compare test and train R² to gauge over-/under-fitting.
y_preds = tuned_model.predict(X_test.to_numpy())
r2_train = r2_score(y_train, tuned_model.predict(X_train.to_numpy()))
r2_test = r2_score(y_test, y_preds)
print("R2 score for ANN model (test):", r2_test)
print("R2 score for ANN model (train):", r2_train)

In [None]:
# Test-set R² of the grid-searched network. `model` is only the unfitted template
# handed to GridSearchCV (which clones it), so the prediction has to come from the
# refitted best estimator. r2_score expects (y_true, y_pred) in that order, and no
# hard-coded reshape is needed to compare the two 1-D arrays.
pred = grid_result.best_estimator_.predict(X_test.to_numpy())
print(r2_score(y_test.to_numpy(), pred))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss recorded while fitting the tuned network.
history = hist.history

plt.plot(history['loss'])
legend_labels = ['Train']

# 'val_loss' is only recorded when the fit was given validation data; plot it
# when available instead of failing with a KeyError.
if 'val_loss' in history:
    plt.plot(history['val_loss'])
    legend_labels.append('Val')

plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(legend_labels, loc='upper right')
plt.show()