# Predict Bike Sharing Demand with AutoGluon Template

1. Notebook should be using a `ml.t3.medium` instance (2 vCPU + 4 GiB)
2. Notebook should be using kernal: `Python 3 (MXNet 1.8 Python 3.7 CPU Optimized)`

### Install packages

In [2]:
#!pip install -U pip
#!pip install -U setuptools wheel
#!pip install -U "mxnet<2.0.0" bokeh==2.0.1
#!pip install autogluon --no-cache-dir
#!pip install kaggle
# Without --no-cache-dir, smaller aws instances may have trouble installing

### Setup Kaggle API Key

In [3]:
# create the .kaggle directory and an empty kaggle.json file
!mkdir -p /root/.kaggle
!touch /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
# Fill in your user name and key from creating the kaggle account and API token file
import json
kaggle_username = "masinde"
kaggle_key = "26ba6a9b0bd0c324054f61b2158b9a7a"

# Save API token the kaggle.json file
with open("/root/.kaggle/kaggle.json", "w") as f:
    f.write(json.dumps({"username": kaggle_username, "key": kaggle_key}))

### Download and explore dataset

In [5]:
# Download the dataset, it will be in a .zip file so you'll need to unzip it as well.
!kaggle competitions download -c bike-sharing-demand
# If you already downloaded it you can use the -o command to overwrite the file
!unzip -o bike-sharing-demand.zip

bike-sharing-demand.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  bike-sharing-demand.zip
  inflating: sampleSubmission.csv    
  inflating: test.csv                
  inflating: train.csv               


In [6]:
import pandas as pd
from autogluon.tabular import TabularPredictor  
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import calendar
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Create the train dataset in pandas by reading the csv
# Set the parsing of the datetime column so you can use some of the `dt` features in pandas later
train = pd.read_csv('train.csv', parse_dates=['datetime'])
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [9]:
# Simple output of the train dataset to view some of the min/max/varition of the dataset features.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,10886.0,2.506614,1.116174,1.0,2.0,3.0,4.0,4.0
holiday,10886.0,0.028569,0.166599,0.0,0.0,0.0,0.0,1.0
workingday,10886.0,0.680875,0.466159,0.0,0.0,1.0,1.0,1.0
weather,10886.0,1.418427,0.633839,1.0,1.0,1.0,2.0,4.0
temp,10886.0,20.23086,7.79159,0.82,13.94,20.5,26.24,41.0
atemp,10886.0,23.655084,8.474601,0.76,16.665,24.24,31.06,45.455
humidity,10886.0,61.88646,19.245033,0.0,47.0,62.0,77.0,100.0
windspeed,10886.0,12.799395,8.164537,0.0,7.0015,12.998,16.9979,56.9969
casual,10886.0,36.021955,49.960477,0.0,4.0,17.0,49.0,367.0
registered,10886.0,155.552177,151.039033,0.0,36.0,118.0,222.0,886.0


In [10]:
# Create the test pandas dataframe in pandas by reading the csv, remember to parse the datetime!
test = pd.read_csv('test.csv',  parse_dates=['datetime'])
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [11]:
# Same thing as train and test dataset
submission = pd.read_csv('sampleSubmission.csv')
submission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,0
1,2011-01-20 01:00:00,0
2,2011-01-20 02:00:00,0
3,2011-01-20 03:00:00,0
4,2011-01-20 04:00:00,0


## Step 3: Train a model using AutoGluon’s Tabular Prediction

Requirements:
* We are predicting `count`, so it is the label we are setting.
* Ignore `casual` and `registered` columns as they are also not present in the test dataset. 
* Use the `root_mean_squared_error` as the metric to use for evaluation.
* Set a time limit of 10 minutes (600 seconds).
* Use the preset `best_quality` to focus on creating the best model.

In [None]:
predictor = TabularPredictor(label="count", 
                             eval_metric='root_mean_squared_error',
                             problem_type='regression',
                             learner_kwargs={'ignored_columns': ['casual','registered']}
                            ).fit(train_data=train,
                                  time_limit=600,
                                  presets="best_quality"
)

No path specified. Models will be saved in: "AutogluonModels/ag-20230106_104653/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20230106_104653/"
AutoGluon Version:  0.6.1
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Oct 26 20:36:53 UTC 2022
Train Data Rows:    10886
Train Data Columns: 11
Label Column: count
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Dropping user-specified ignored columns: ['casual', 'registered']
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3105.01 MB
	Train Data (Original)  Memory Usage: 0.78 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features

### Review AutoGluon's training run with ranking of models that did the best.

In [None]:
predictor.fit_summary(show_plot=True)

### Create predictions from test dataset

In [None]:
predictions = predictor.predict(test)
predictions.head()

#### NOTE: Kaggle will reject the submission if we don't set everything to be > 0.

In [None]:
# Describe the `predictions` series to see if there are any negative values
predictions.describe()

In [None]:
# How many negative values do we have?
predictions.lt(0).value_counts()

In [None]:
# Set them to zero
predictions.iloc[predictions<0] = 0
predictions.lt(0).value_counts()

### Set predictions to submission dataframe, save, and submit

In [None]:
submission["count"] = predictions
submission.to_csv("submission.csv", index=False)

In [None]:
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "first raw submission"

#### View submission via the command line or in the web browser under the competition's page - `My Submissions`

In [None]:
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 6

#### Initial score of 1.80370

## Step 4: Exploratory Data Analysis and Creating an additional feature
* Any additional feature will do, but a great suggestion would be to separate out the datetime into hour, day, or month parts.

In [None]:
# Create a histogram of all features to show the distribution of each one relative to the data. This is part of the exploritory data analysis
train.hist()

In [None]:
train.columns

In [None]:
train.describe().T

##  Distributions of registered vs casual rides

In [None]:
# plot distributions of registered vs casual rides
sns.histplot(train['registered'], label='registered')
sns.histplot(train['casual'], label='casual')
plt.legend()
plt.xlabel('rides')
plt.title("Rides distributions")
plt.savefig('rides_distributions.png', format='png')

The registered users perform way more rides than casual ones. Furthermore, we can see that the two distributions are skewed to the right, meaning that, for most of the entries in the data, zero or a small number of rides. Finally, every entry in the data has quite a large number of rides (that is, higher than 800).

## Make category types for these so models know they are not just numbers
* AutoGluon originally sees these as ints, but in reality they are int representations of a category.
* Setting the dtype to category will classify these as categories in AutoGluon.

In [None]:
train["season"] = train["season"].astype("category")
train["weather"] = train["weather"].astype("category")
test["season"] = test["season"].astype("category")
test["weather"] = test["weather"].astype("category")

In [None]:
# create a new feature
train['year'] = train.datetime.dt.year
train['month'] = train.datetime.dt.month
train['day'] = train.datetime.dt.day
train['hour'] = train.datetime.dt.hour
train.drop(["datetime"], axis=1, inplace=True)

test['year'] = test.datetime.dt.year
test['month'] = test.datetime.dt.month
test['day'] = test.datetime.dt.day
test['hour'] = test.datetime.dt.hour
test.drop(["datetime"], axis=1, inplace=True)

In [None]:

figure, axes = plt.subplots(nrows=2, ncols=2) #
plt.tight_layout()
figure.set_size_inches(10, 10)


sns.boxplot(x='season', y='count', data=train, ax=axes[0, 0])
sns.boxplot(x='weather', y='count', data=train, ax=axes[0, 1])
sns.boxplot(x='holiday', y='count', data=train, ax=axes[1, 0])
sns.boxplot(x='workingday', y='count', data=train, ax=axes[1, 1])


axes[0, 0].set(title='Box Plot On Count Across Season')
axes[0, 1].set(title='Box Plot On Count Across Weather')
axes[1, 0].set(title='Box Plot On Count Across Holiday')
axes[1, 1].set(title='Box Plot On Count Across Working Day')


axes[0, 1].tick_params('x', labelrotation=10) 

In [None]:
train['season'] = train['season'].map({1: 'Spring', 
                                       2: 'Summer', 
                                       3: 'Fall', 
                                       4: 'Winter' })
train['weather'] = train['weather'].map({1: 'Clear', 
                                         2: 'Mist, Few clouds', 
                                         3: 'Light Snow, Rain, Thunder', 
                                         4: 'Heavy Snow, Rain, Thunder'})

In [None]:
train.head()

In [None]:
fig, ax = plt.subplots(figsize=(20,5))
sns.pointplot(data=train, x='hour', y='registered', hue='workingday', ax=ax)
ax.set(title='Count of bikes during working days: Registered users')

In [None]:
fig, ax = plt.subplots(figsize=(20,5))
sns.pointplot(data=train, x='hour', y='count',  hue='holiday', ax=ax)
ax.set(title='Count of bikes during holidays days: Registered users')

In [None]:
fig, ax = plt.subplots(figsize=(20,5))
sns.pointplot(data=train, x='hour', y='count',  hue='season', ax=ax)
ax.set(title='Count of bikes during different seasons : Registered users')

In [None]:
fig, ax = plt.subplots(figsize=(20,5))
sns.pointplot(data=train, x='hour', y='count',  hue='weather', ax=ax)
ax.set(title='Count of bikes in different weather: Registered users')

In [None]:
def plot_correlations(data, col):
    # get correlation between col and registered rides
    corr_r = np.corrcoef(data[col], data["registered"])[0,1]
    ax =sns.regplot(x=col, y='registered', data=data,scatter_kws={'alpha':0.05},label=f'Registered rides(correlation:{corr_r:.3f})')
    # get correlation between col and casual rides
    corr_c = np.corrcoef(data[col], data["casual"])[0,1]
    ax = sns.regplot(x=col, y='casual',data=data,scatter_kws={'alpha':0.05},label=f'Casual rides (correlation:{corr_c:.3f})')
    #adjust legend alpha
    legend = ax.legend()
    for lh in legend.legendHandles:
        lh.set_alpha(0.5)
    ax.set_ylabel("rides")
    ax.set_title(f"Correlation between rides and {col}")
    return ax

### Correlation between rides and temperature

In [None]:
plt.figure(figsize=(10,8))
ax = plot_correlations(train, 'temp')
plt.savefig('correlation_temp.png', format='png')


In [None]:
plt.figure(figsize=(10,8))
ax = plot_correlations(train, 'atemp')
plt.savefig('correlation_atemp.png', format='png')

The higher temperatures have a positive impact on the number of rides (the correlation between registered/casual rides and temp is 0.31 and 0.46, respectively, and it's a similar case for atemp). Note that as the values in the registered column are widely spread with respect to the different values in temp, we have a lower correlation compared to the casual column.

### Correlation between rides and humidity

In [None]:
plt.figure(figsize=(10,8))
ax = plot_correlations(train, 'humidity')
plt.savefig('correlation_hum.png', format='png')


 The humidity level has a negative correlation with both types of rides (-0.265 for registered and -0.348 for casual). This means that with a high level of humidity (mist or rain), customers will tend not to use the bike sharing service.

### Correlation between the number of rides and the wind speed

In [None]:
plt.figure(figsize=(10,8))
ax = plot_correlations(train, 'windspeed')
plt.savefig('correlation_windspeed.png', format='png')

There is minimal correlation between the number of rides and the wind speed .

## Correlation Matrix Plot
A comparison between different continuous features is  to use the correlation matrix plot. It allows us to quickly visualize any possible relationships between the different features and identify potential clusters with highly correlated features.

With the training data we will visualize the temperature, atemp, humidity, windspeed, registered and casual columns.

In [None]:
# plot correlation matrix
cols = ["temp", "atemp", "humidity", "windspeed", "registered", "casual"]
plot_data = train[cols]
corr = plot_data.corr()
fig = plt.figure(figsize=(10,8))
plt.matshow(corr, fignum=fig.number)
plt.xticks(range(len(plot_data.columns)), plot_data.columns)
plt.yticks(range(len(plot_data.columns)), plot_data.columns)
plt.colorbar()
plt.ylim([5.5, -0.5])
fig.savefig('correlations.png', format='png')

##### The conclusion of the visualization is that when the temperature is good the hire of the bike is good for both registered users and casual users.

## Step 5: Rerun the model with the same settings as before, just with more features

In [None]:
predictor_new_features = TabularPredictor(label="count", 
                             eval_metric='root_mean_squared_error',
                             problem_type='regression',
                             learner_kwargs={'ignored_columns': ['casual','registered']}
                            ).fit(train_data=train,
                                  time_limit=600,
                                  presets="best_quality"
)

In [None]:
predictor_new_features.fit_summary()

In [None]:
# Remember to set all negative values to zero
predictions_new_features = predictor_new_features.predict(test)
predictions_new_features.iloc[predictions_new_features.lt(0)] = 0

predictions_new_features.head()

In [None]:
# Same submitting predictions
submission_new_features = pd.read_csv('sampleSubmission.csv', parse_dates=["datetime"])
submission_new_features["count"] = predictions_new_features
submission_new_features.to_csv("submission_new_features.csv", index=False)

In [None]:
!kaggle competitions submit -c bike-sharing-demand -f submission_new_features.csv -m "new features"

In [None]:
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 6

#### New Score of 0.47052


## Step 6: Hyper parameter optimization
* There are many options for hyper parameter optimization.
* Options are to change the AutoGluon higher level parameters or the individual model hyperparameters.
* The hyperparameters of the models themselves that are in AutoGluon. Those need the `hyperparameter` and `hyperparameter_tune_kwargs` arguments.

In [None]:
num_trials = 5
search_strategy = 'auto'
hyperparameters = {
  'NN_TORCH': {'num_epochs': 10, 'batch_size': 32}, 
  'GBM': {'num_boost_round': 20}
}
hyperparameter_tune_kwargs = { 
  'num_trials': num_trials,
  'scheduler' : 'local',
  'searcher': search_strategy,
}


predictor_new_hpo = TabularPredictor(label="count", 
                             eval_metric='root_mean_squared_error',
                             problem_type='regression',
                             learner_kwargs={'ignored_columns': ['casual','registered']}
                            ).fit(train_data=train,
                                  num_bag_folds=5,
                                  num_bag_sets=1,
                                  num_stack_levels=1,
                                  time_limit=600,
                                  presets="best_quality",
                                  hyperparameters=hyperparameters,
                                  hyperparameter_tune_kwargs=hyperparameter_tune_kwargs
)

In [None]:
predictor_new_hpo.fit_summary()

In [None]:
# Remember to set all negative values to zero
predictions_new_hpo = predictor_new_hpo.predict(test)
predictions_new_hpo.iloc[predictions_new_hpo.lt(0)] = 0

In [None]:
# Same submitting predictions
submission_new_hpo = pd.read_csv('sampleSubmission.csv', parse_dates=["datetime"])
submission_new_hpo["count"] = predictions_new_hpo
submission_new_hpo.to_csv("submission_new_hpo.csv", index=False)

In [None]:
!kaggle competitions submit -c bike-sharing-demand -f submission_new_hpo.csv -m "new features with hyperparameters"

In [None]:
!kaggle competitions submissions -c bike-sharing-demand | tail -n +1 | head -n 6

#### New Score of 0.53050

### Hyperparameter table

In [None]:
fig = pd.DataFrame(
    {
        "model": ["initial", "add_features", "hpo"],
        "score": [
            predictor.leaderboard(silent=True)['score_val'][0],
            predictor_new_features.leaderboard(silent=True)['score_val'][0],
            predictor_new_hpo.leaderboard(silent=True)['score_val'][0]
        ]
    }
).plot(x="model", y="score", figsize=(8, 6)).get_figure()
fig.savefig('img/model_train_score.png')

In [None]:
df = pd.DataFrame(
    {
        "model": ["initial", "add_features", "hpo"],
        "score": [
            predictor.leaderboard(silent=True)['score_val'][0],
            predictor_new_features.leaderboard(silent=True)['score_val'][0],
            predictor_new_hpo.leaderboard(silent=True)['score_val'][0]
        ]
    }
)
df

In [None]:
# Take the 3 kaggle scores and creating a line plot to show improvement
fig = pd.DataFrame(
    {
        "test_eval": ["initial", "add_features", "hpo"],
        "score": [
            predictor.leaderboard(silent=True)['score_val'][0],
            predictor_new_features.leaderboard(silent=True)['score_val'][0],
            predictor_new_hpo.leaderboard(silent=True)['score_val'][0]
        ]
    }
).plot(x="test_eval", y="score", figsize=(8, 6)).get_figure()
fig.savefig('img/model_test_score.png')

In [None]:
# The 3 hyperparameters we tuned with the kaggle score as the result
pd.DataFrame({
    "model": ["initial", "add_features", "hpo"],
    "score": [
            predictor.leaderboard(silent=True)['score_val'][0],
            predictor_new_features.leaderboard(silent=True)['score_val'][0],
            predictor_new_hpo.leaderboard(silent=True)['score_val'][0]
        ]
    
})