Import libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# model-fitting cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

import tensorflow as tf
# Seed TensorFlow and NumPy's global generators with the same literal (42).
# A later cell repeats this seeding through the `random_seed` variable.
tf.random.set_seed(42)
np.random.seed(42)


In [None]:
from google.colab import drive
import sys  # NOTE(review): not referenced in the visible cells

# Mount Google Drive at /content/drive so files stored there are readable
drive.mount('/content/drive')

# Hard-coded project folder on the mounted Drive (not computed at runtime).
# The data-loading cell prepends this prefix to its relative CSV paths, so the
# trailing slash is required.
abspath_curr = '/content/drive/My Drive/Colab Notebooks/final_project/'

In [2]:
import warnings

# Ignore all warnings for the rest of the session.
# The first cell of the notebook already installs this same filter, so this
# cell only re-applies it.
warnings.filterwarnings('ignore')

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline 

# Set matplotlib sizes
plt.rc('font', size=20)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('legend', fontsize=20)
plt.rc('figure', titlesize=20)

In [None]:
# The magic below allows us to use tensorflow version 2.x
# NOTE(review): `%tensorflow_version` is a line magic, not Python, so this cell
# only runs in an environment that provides it (presumably Google Colab, given
# the drive mount above) - confirm it is still needed on the current runtime.
%tensorflow_version 2.x 
import tensorflow as tf
# `keras` is imported here but not referenced in the visible cells.
from tensorflow import keras

In [None]:
# The random seed
random_seed = 42

# Set random seed in tensorflow
tf.random.set_seed(random_seed)

# Set random seed in numpy
import numpy as np
np.random.seed(random_seed)

In [None]:
# Load the raw training data. 'date' is parsed to datetime64 on load so the
# time-series plot and any date-derived features get real timestamps.
df_raw_train = pd.read_csv(abspath_curr + 'data/DailyDelhiClimateTrain.csv',
                           header=0,
                           parse_dates=['date'])

# Work on a deep copy so the raw frame stays untouched for reference
df_train = df_raw_train.copy(deep=True)

# Load the raw test data the same way
df_raw_test = pd.read_csv(abspath_curr + 'data/DailyDelhiClimateTest.csv',
                          header=0,
                          parse_dates=['date'])

# Work on a deep copy of the raw test data
df_test = df_raw_test.copy(deep=True)

# Name of the column to predict.
# The previous value, "trip_duration", is not a column of the Daily Delhi
# Climate files and made every `df[target]` lookup raise KeyError.
# NOTE(review): assumes the standard schema of this dataset
# (date, meantemp, humidity, wind_speed, meanpressure) - confirm with
# `df_train.columns` and adjust if a different variable should be predicted.
target = "meantemp"

## Exploratory Data Analysis (EDA)

In [None]:
# Check the dimensions of the training and test datasets
print("Training data shape:", df_train.shape)
print("Test data shape:", df_test.shape)

# Get the summary statistics of the training dataset
print(df_train.describe())

# Check for missing values in the training and test datasets
print("Missing values in training data:\n", df_train.isnull().sum())
print("Missing values in test data:\n", df_test.isnull().sum())

# Numeric predictors = every numeric column except the target. They are read
# from the frame itself so the plots below do not depend on hard-coded
# placeholder column names.
feature_cols = [col for col in df_train.select_dtypes(include='number').columns
                if col != target]

# Visualize the distribution of the target variable (on its own figure)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_train, x=target, bins=30, ax=ax)
ax.set_title(f"Distribution of {target}")
plt.show()

# Visualize the relationship between each numeric feature and the target.
# pairplot creates its own figure, so no axes are passed in.
if feature_cols:
    sns.pairplot(data=df_train, x_vars=feature_cols, y_vars=[target], kind='scatter')
    plt.show()

# Plot the time series of the target variable.
# A date-indexed copy is used instead of `set_index(..., inplace=True)`, so
# df_train keeps its 'date' column and this cell can be re-run safely.
# pd.to_datetime is a no-op when the column is already datetime64.
target_by_date = (
    df_train
    .assign(date=pd.to_datetime(df_train['date']))
    .set_index('date')[target]
)
fig, ax = plt.subplots(figsize=(15, 5))
target_by_date.plot(ax=ax)
ax.set_title(f"{target} over time")
ax.set_ylabel(target)
plt.show()

# Identify outliers using a box plot of the target and a scatter plot against
# the first numeric feature. Each plot gets its own axes so they are not drawn
# on top of the time series.
fig, (ax_box, ax_scatter) = plt.subplots(1, 2, figsize=(20, 6))
sns.boxplot(data=df_train, x=target, ax=ax_box)
ax_box.set_title(f"Box plot of {target}")
if feature_cols:
    sns.scatterplot(data=df_train, x=feature_cols[0], y=target, ax=ax_scatter)
    ax_scatter.set_title(f"{target} vs {feature_cols[0]}")
plt.show()


Data Preprocessing

In [None]:


# Name of the timestamp column in the raw frames
DATE_COLUMN = 'date'


def build_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a model-ready copy of ``frame``; the input is not modified.

    Steps, applied identically to the training and test frames:
      * if an earlier cell moved the date into the index, restore it as a column;
      * feature engineering: derive ``month`` and ``day_of_year`` from the date
        (calendar position as a seasonal signal), then drop the raw date, which
        cannot be scaled;
      * one-hot encode any remaining object/category columns.
    """
    out = frame.reset_index() if frame.index.name == DATE_COLUMN else frame.copy()
    if DATE_COLUMN in out.columns:
        dates = pd.to_datetime(out[DATE_COLUMN])
        out['month'] = dates.dt.month
        out['day_of_year'] = dates.dt.dayofyear
        out = out.drop(columns=[DATE_COLUMN])
    # With no columns given, get_dummies encodes only object/category columns
    # and passes numeric ones through unchanged.
    return pd.get_dummies(out)


# Build the feature tables BEFORE extracting X, so engineered columns are
# actually part of the model input.
train_prepared = build_features(df_train)
# Align test columns to train: dummy levels unseen in test become all-zero
# columns, and levels unseen in train are dropped.
test_prepared = build_features(df_test).reindex(columns=train_prepared.columns,
                                                fill_value=0)

# Impute missing values with column means computed on the training data only,
# so no test-set statistics leak into preprocessing.
fill_values = train_prepared.mean()
train_prepared = train_prepared.fillna(fill_values)
test_prepared = test_prepared.fillna(fill_values)

# Separate predictors and target
X_all = train_prepared.drop(columns=[target])
y_all = train_prepared[target]
y_test = test_prepared[target]

# Split the data into training and validation sets.
# NOTE(review): rows are split at random; if the goal is forecasting, a
# chronological split (shuffle=False) may be more appropriate - confirm.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=random_seed
)

# Standardize the features. The scaler is fitted on the training portion only
# and then reused for validation and test, so neither influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(test_prepared.drop(columns=[target]))


Hyperparameter Tuning

In [None]:
# Search space for the random forest: 3 values for each of 4 hyperparameters,
# i.e. 81 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
)

# Exhaustive 5-fold cross-validated search over that space, scored by negated
# MSE and run on all available cores.
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=random_seed),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training data
rf_grid.fit(X_train, y_train)

# Winning combination and its mean cross-validated score
# (a negated MSE, so values closer to zero are better)
best_params, best_score = rf_grid.best_params_, rf_grid.best_score_


Model Selection

In [None]:
def _alphas():
    """Fresh list of regularization strengths shared by the linear models."""
    return [0.1, 1.0, 10.0]


def _boosting_space():
    """Fresh search space shared by the three boosting regressors."""
    return {'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.1, 0.01, 0.001]}


# Candidate regressors, keyed by a short name. Every estimator that accepts a
# random_state receives the notebook-wide seed.
models = dict(
    linear_regression=LinearRegression(),
    ridge_regression=Ridge(),
    lasso_regression=Lasso(),
    elastic_net=ElasticNet(),
    random_forest=RandomForestRegressor(random_state=random_seed),
    gradient_boosting=GradientBoostingRegressor(random_state=random_seed),
    xgboost=XGBRegressor(random_state=random_seed),
    lightgbm=LGBMRegressor(random_state=random_seed),
)

# Search space per model, under the same keys as `models`.
# An empty dict means "fit the estimator once with its defaults".
param_grids = dict(
    linear_regression={},
    ridge_regression={'alpha': _alphas()},
    lasso_regression={'alpha': _alphas()},
    elastic_net={'alpha': _alphas(), 'l1_ratio': [0.1, 0.5, 0.9]},
    random_forest={'n_estimators': [100, 200, 300],
                   'max_depth': [3, 4, 5],
                   'min_samples_split': [2, 3, 4],
                   'min_samples_leaf': [1, 2, 3]},
    gradient_boosting=_boosting_space(),
    xgboost=_boosting_space(),
    lightgbm=_boosting_space(),
)

# Per-model outcome: best params, CV score, and validation RMSE / R^2
results = {}

# Tune each model with 5-fold CV on the training split, then score the
# refitted best estimator on the held-out validation split.
for model_name, model in models.items():
    print("Training", model_name)

    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Best combination and its mean CV score (negated MSE)
    best_params, best_score = grid.best_params_, grid.best_score_

    # Validation-set metrics
    y_pred = grid.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)

    results[model_name] = dict(best_params=best_params,
                               best_score=best_score,
                               rmse=rmse,
                               r2=r2)

# Show everything that was collected
print(results)
