**Do we need more bikes?**

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# dir_path = "/content/drive/MyDrive/SML-project/"

In [2]:
# Directory holding the CSV files. It is joined to file names by plain string
# concatenation further down, so the trailing slash is required.
dir_path = "Data/"

# Hyperparameter Tuning

In [3]:
import numpy as np

# Cross-validation / reproducibility settings.
num_folds = 5  # number of folds in cross-validation
seed = 42      # note: rf_params below pins its own random_state and does not read this

# Random forest hyperparameters, passed as keyword arguments to the classifier.
# Trailing comments list the candidate values considered for each setting.
rf_params = dict(
    bootstrap=True,             # [True, False]
    class_weight='balanced',    # [balanced, balanced_subsample, None]
    criterion='entropy',        # [gini, entropy, log_loss]
    max_depth=None,
    max_features=None,
    min_samples_leaf=3,         # [3, 4, 5]
    min_samples_split=2,        # [2, 5]
    n_estimators=100,           # [50, 100, 150]
    random_state=0,
    verbose=0,
)

# Load Data

## Load Train Dataset

In [4]:
import pandas as pd
# Name of the target column in the training CSV.
output_name = 'increase_stock'

# Read the labelled training set from the data directory and preview the first rows.
dataset_path = f"{dir_path}training_data.csv"
train_data = pd.read_csv(dataset_path, sep=',')
train_data.head()

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snow,snowdepth,windspeed,cloudcover,visibility,increase_stock
0,5,5,1,0,0,0,-7.2,-15.0,53.68,0.0,0,0.0,16.3,31.6,16.0,low_bike_demand
1,21,4,1,0,1,0,-1.3,-12.8,40.97,0.0,0,0.0,23.9,85.7,16.0,low_bike_demand
2,21,3,8,0,1,1,26.9,21.8,73.39,0.0,0,0.0,0.0,81.1,16.0,low_bike_demand
3,1,6,1,0,0,0,3.1,-4.0,59.74,0.0,0,0.0,19.2,0.0,16.0,low_bike_demand
4,17,0,3,0,1,0,11.7,-11.4,18.71,0.0,0,0.0,10.5,44.6,16.0,low_bike_demand


## Load Test Dataset

In [5]:
import pandas as pd
# Read the test set (same feature columns, no target column) and preview the first rows.
dataset_path = f"{dir_path}test_data.csv"
test_data = pd.read_csv(dataset_path, sep=',')
test_data.head()

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snow,snowdepth,windspeed,cloudcover,visibility
0,14,0,1,0,1,0,-1.7,-1.9,98.86,2.434,0,2.96,33.0,100.0,3.3
1,14,5,3,0,0,0,14.3,2.2,43.93,0.0,0,0.0,16.4,44.6,16.0
2,18,3,1,0,1,0,11.1,7.8,80.07,0.0,0,0.0,7.7,99.2,16.0
3,2,2,1,1,1,0,1.3,-3.2,71.95,0.0,0,0.0,0.0,94.3,16.0
4,15,0,5,0,1,1,16.1,1.6,37.47,0.0,0,0.0,33.7,86.8,16.0


# Data Preprocessing

In [6]:
# Target label -> integer code. The dict's insertion order also defines class_names.
label_mapping = {"low_bike_demand": 0, 'high_bike_demand': 1}
# Select the feature names by excluding the target column *by name*, the same way
# the feature matrix X is built, instead of assuming the target is the last column.
feature_names = train_data.columns[train_data.columns != output_name]
class_names = list(label_mapping)

In [7]:
# Encode the target labels as integers using label_mapping.
# The guard makes this cell safe to re-run: Series.map would turn already-encoded
# 0/1 values into NaN, because they are not keys of label_mapping.
if not pd.api.types.is_numeric_dtype(train_data[output_name]):
    # Fail loudly on labels missing from the mapping instead of silently producing NaN.
    if not train_data[output_name].isin(list(label_mapping)).all():
        raise ValueError(
            f"Unexpected label(s) in column {output_name!r}; expected only {list(label_mapping)}"
        )
    train_data[output_name] = train_data[output_name].map(label_mapping)
train_data.head(3)

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snow,snowdepth,windspeed,cloudcover,visibility,increase_stock
0,5,5,1,0,0,0,-7.2,-15.0,53.68,0.0,0,0.0,16.3,31.6,16.0,0
1,21,4,1,0,1,0,-1.3,-12.8,40.97,0.0,0,0.0,23.9,85.7,16.0,0
2,21,3,8,0,1,1,26.9,21.8,73.39,0.0,0,0.0,0.0,81.1,16.0,0


In [8]:
# Separate the training frame into the feature matrix X (every column except
# the target) and the target vector y, both as NumPy arrays.
X = train_data.loc[:, train_data.columns != output_name].to_numpy()
y = train_data[output_name].to_numpy()
(X.shape, y.shape)

((1600, 15), (1600,))

In [9]:
# test dataset
# Every column of test_data is used as a feature; the previewed column order
# matches the training features, which the positional column drops below rely on.
test_X = test_data.to_numpy()

# Feature Selection

In [10]:
# train dataset: remove the snow and snowdepth columns (positions 10 and 11 of X)
dropped_col = [10, 11]
X_dropped = np.delete(X, dropped_col, axis=1)
# Keep the feature names aligned with the remaining columns.
feature_names_dropped = np.array(
    [name for idx, name in enumerate(feature_names) if idx not in dropped_col]
)

X_dropped.shape

(1600, 13)

In [11]:
# test dataset: drop the same column positions (snow, snowdepth) as for the training set
dropped_col = [10, 11]
test_X_dropped = np.delete(test_X, dropped_col, axis=1)
test_X_dropped.shape

(400, 13)

# Feature Scaling

In [12]:
# training dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardise the training features. MinMaxScaler is imported as well but not used here.
# The fitted scaler is kept so the test set can be transformed with the same statistics.
scaler = StandardScaler()
scaler.fit(X_dropped)
X_normalized = scaler.transform(X_dropped)


In [13]:
# test dataset
# Reuse the scaler fitted on the training features (transform only, no refit).
test_X_normalized = scaler.transform(test_X_dropped)

# Final Models

## 4. Tree-based methods

In [14]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import plot_tree


# plt.ion()


class RandomForest:
    """Thin wrapper around ``RandomForestClassifier``.

    Every input passes through ``preprocessing`` before reaching the underlying
    estimator; in its current form that hook returns the data unchanged.
    """

    def __init__(self, params, verbose = True):
        """Create the underlying classifier.

        Args:
            params: dict of keyword arguments forwarded to ``RandomForestClassifier``.
            verbose: stored on the instance; not otherwise used by this class.
        """
        self.verbose = verbose
        self.model = RandomForestClassifier(**params)

    def fit(self, X_train, y_train):
        """Fit the classifier on the preprocessed training data.

        Returns:
            Whatever the underlying estimator's ``fit`` returns.
        """
        X_train_preprocessed = self.preprocessing(X_train, y_train)
        return self.model.fit(X_train_preprocessed, y_train)

    def predict(self, X_valid):
        """Return the predicted class labels for ``X_valid``."""
        X_test_preprocessed = self.preprocessing(X_valid)
        return self.model.predict(X_test_preprocessed)

    def predict_proba(self, X_test):
        """Return the per-class probability estimates for ``X_test``."""
        X_test_preprocessed = self.preprocessing(X_test)
        return self.model.predict_proba(X_test_preprocessed)

    def preprocessing(self, X, y=None):
        """Preprocessing hook applied before fit/predict; currently the identity.

        ``y`` is accepted so that ``fit`` can pass the labels, but it is ignored.
        """
        return X

    def get_params(self):
        """Return the parameters of the underlying estimator."""
        return self.model.get_params()

    def plot_most_important_features(self, feature_names, save_path='most_important.pdf'):
        """Draw a horizontal bar chart of feature importances, smallest first.

        Args:
            feature_names: names aligned with the columns the model was fitted
                on. Any sequence works (list, NumPy array, pandas Index).
            save_path: file the figure is written to before it is shown.
        """
        feature_importance = self.model.feature_importances_
        sorted_idx = feature_importance.argsort()
        # A plain Python list cannot be indexed with an integer array, so coerce first.
        sorted_names = np.asarray(feature_names)[sorted_idx]
        custom_palette = sns.color_palette("Purples", n_colors=len(feature_importance))
        plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], color = custom_palette)
        plt.yticks(range(len(sorted_idx)), sorted_names)
        plt.ylabel('Features')
        plt.xlabel('Feature Importance')
        plt.savefig(save_path, bbox_inches='tight')
        plt.show()

# Train

In [15]:
# Random Forest
# Build the wrapper from the rf_params dict and fit it on the standardized training features.
rf_model = RandomForest(rf_params, verbose=True)
rf_model.fit(X_normalized, y)

# Extracting probabilities of the positive class
# (column 1 of predict_proba), computed on the training samples themselves.
rf_probe_y = rf_model.predict_proba(X_normalized)[:, 1]

# Test

In [16]:
# Positive-class probabilities for the test set, thresholded at 0.5 into 0/1 labels.
test_y_probs = rf_model.predict_proba(test_X_normalized)[:, 1]
test_y_pred = (test_y_probs >= 0.5).astype(int).tolist()
len(test_y_pred)

400

In [17]:
# Write the 0/1 test predictions to predictions.csv. Wrapping the list in another
# list makes the input 2-D with a single row, so all values land on one
# comma-separated line.
np.savetxt('predictions.csv',[test_y_pred],delimiter=',',fmt='%d') 

# Model Comparison

In [18]:
from sklearn.metrics import roc_curve, auc
def evaluate(y_true_list, y_probs_list, verbose = False, threshold = 0.5):
  """Score one or more sets of predicted probabilities against their true labels.

  Each (y_true, y_probs) pair (e.g. one cross-validation fold) is thresholded
  into hard 0/1 predictions and scored; the per-pair metrics are then averaged.

  Args:
    y_true_list: iterable of true-label sequences, one per fold.
    y_probs_list: iterable of positive-class probability sequences, paired
      element-wise with ``y_true_list``.
    verbose: when true, print the per-fold metrics and, once at the end, the means.
    threshold: probabilities >= this value are predicted as class 1.

  Returns:
    Tuple ``(mean_cm, mean_accuracy, mean_precision, mean_recall, mean_f1, fpr, tpr)``.
    ``fpr`` and ``tpr`` are the ROC curve points of the LAST pair only.

  Raises:
    ValueError: if no (y_true, y_probs) pair is supplied.
  """
  folds = list(zip(y_true_list, y_probs_list))
  if not folds:
    raise ValueError("evaluate() needs at least one (y_true, y_probs) pair")

  accuracy = []
  precision = []
  recall = []
  f1 = []
  cm = []
  for k, (y_true, y_probs) in enumerate(folds):
    y_pred = [1 if prob >= threshold else 0 for prob in y_probs]
    cm.append(confusion_matrix(y_true, y_pred))
    accuracy.append(accuracy_score(y_true, y_pred))
    precision.append(precision_score(y_true, y_pred))
    recall.append(recall_score(y_true, y_pred))
    f1.append(f1_score(y_true, y_pred))
    # The ROC curve uses the raw probabilities, not the thresholded labels.
    fpr, tpr, _ = roc_curve(y_true, y_probs)
    if verbose:
      print('')
      print(f"for {k}th iteration:")
      print("Confusion Matrix:")
      print(cm[-1])
      print(f"> Accuracy: {accuracy[-1]}")
      print(f"> Precision: {precision[-1]}")
      print(f"> Recall: {recall[-1]}")
      print(f"> F1 Score: {f1[-1]}")

  # Calculate the mean of each metric once, after all folds have been scored.
  mean_cm = np.mean(cm, axis = 0)
  mean_accuracy = np.mean(accuracy)
  mean_precision = np.mean(precision)
  mean_recall = np.mean(recall)
  mean_f1 = np.mean(f1)

  if verbose:
    # Print the mean values as overall performance metrics
    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Mean Precision: {mean_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}")
    print(f"Mean F1 Score: {mean_f1:.4f}")
  return mean_cm, mean_accuracy, mean_precision, mean_recall, mean_f1, fpr, tpr

In [19]:
# Score the random forest on the same data it was fitted on (y / rf_probe_y), so the
# printed numbers are training-set metrics rather than held-out performance.
rf_results = evaluate([y], [rf_probe_y], True)


for 0th iteration:
Confusion Matrix:
[[1275   37]
 [   3  285]]
> Accuracy: 0.975
> Precision: 0.8850931677018633
> Recall: 0.9895833333333334
> F1 Score: 0.9344262295081968
Mean Accuracy: 0.9750
Mean Precision: 0.8851
Mean Recall: 0.9896
Mean F1 Score: 0.9344
