In [1]:
import pandas as pd
import numpy as np
# import the regressor 
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import mean_absolute_error

Reading and exploring the dataset

In [2]:
# Location of the hourly bike-rental CSV that the rest of the notebook works on.
bike_rental_csv_url = "https://introtomlsampledata.blob.core.windows.net/data/bike-rental/bike-rental-hour.csv"
dataframe = pd.read_csv(bike_rental_csv_url)

In [3]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
dataframe.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
# List the available column names.
dataframe.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'],
      dtype='object')

In [5]:
# Inspect the dtype pandas inferred for each column.
dataframe.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

Editing metadata and selecting columns

In [6]:
# Columns that hold category codes rather than quantities.
categoryVariableList = ["season","weathersit"]
# Convert all of them to the pandas "category" dtype in a single astype call.
dataframe = dataframe.astype({column: "category" for column in categoryVariableList})


In [7]:
# Confirm that season and weathersit are now category-typed.
dataframe.dtypes

instant          int64
dteday          object
season        category
yr               int64
mnth             int64
hr               int64
holiday          int64
weekday          int64
workingday       int64
weathersit    category
temp           float64
atemp          float64
hum            float64
windspeed      float64
casual           int64
registered       int64
cnt              int64
dtype: object

In [8]:
# Remove the columns that are not used as model inputs.
# NOTE(review): in the preview rows casual + registered equals cnt, so keeping
# those two would presumably leak the target — confirm that is the reason.
dataframe = dataframe.drop(columns=["instant", "dteday", "casual", "registered"])

In [9]:
# Check the frame after removing instant, dteday, casual and registered.
dataframe.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


Creating dataset for set A (the selected original columns only, without lag features)

In [10]:
# Take an independent (deep) copy so set A is unaffected by any later changes
# made to `dataframe`.
dataframesetA = dataframe.copy(deep=True)
dataframesetA.columns

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt'],
      dtype='object')

Creating dataset for set A+B (set A plus the rental counts of the previous 12 hours as extra columns)

In [11]:
def azureml_main(dataframe1 = None, dataframe2 = None):
    """Add lagged rental-count features for the previous 12 rows.

    For every lag i in 1..12 a column named 'Rentals in hour -i' is added that
    holds the value of 'cnt' from i rows earlier. Rows that have no such
    earlier row get 0.

    The input frame is NOT modified; the lag columns are added to a new
    DataFrame so the caller's data stays intact and the call is safe to repeat.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Input data; must contain a 'cnt' column. Rows are assumed to already
        be in chronological order, because lags are purely positional.
    dataframe2 : pandas.DataFrame, optional
        Unused. Kept so the signature still accepts a second input frame.

    Returns
    -------
    tuple
        A one-element tuple ``(result,)``; the return value must be a sequence
        of pandas.DataFrame, hence the trailing comma.

    Raises
    ------
    KeyError
        If ``dataframe1`` has no 'cnt' column.
    TypeError
        If ``dataframe1`` is None.
    """
    n_lags = 12

    # Shift 'cnt' directly by `lag` rows instead of chaining shifts off the
    # previous lag column: same values, but each column is independent.
    # shift() introduces NaN at the top, which fillna(0) replaces (the columns
    # therefore end up as floats).
    lag_columns = {
        'Rentals in hour -{}'.format(lag): dataframe1['cnt'].shift(lag).fillna(0)
        for lag in range(1, n_lags + 1)
    }

    # assign() returns a new DataFrame, leaving `dataframe1` untouched.
    result = dataframe1.assign(**lag_columns)

    # Return value must be a sequence of pandas.DataFrame.
    return result,

In [12]:
# azureml_main returns a one-element tuple; unpack the single frame it holds.
(dataframesetAB,) = azureml_main(dataframe)

In [13]:
# Verify that the twelve 'Rentals in hour -N' columns were added.
dataframesetAB.columns

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt',
       'Rentals in hour -1', 'Rentals in hour -2', 'Rentals in hour -3',
       'Rentals in hour -4', 'Rentals in hour -5', 'Rentals in hour -6',
       'Rentals in hour -7', 'Rentals in hour -8', 'Rentals in hour -9',
       'Rentals in hour -10', 'Rentals in hour -11', 'Rentals in hour -12'],
      dtype='object')

In [14]:
# Preview set A+B; lag values are 0 wherever no earlier row exists.
dataframesetAB.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,...,Rentals in hour -3,Rentals in hour -4,Rentals in hour -5,Rentals in hour -6,Rentals in hour -7,Rentals in hour -8,Rentals in hour -9,Rentals in hour -10,Rentals in hour -11,Rentals in hour -12
0,1,0,1,0,0,6,0,1,0.24,0.2879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1,1,0,6,0,1,0.22,0.2727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,1,2,0,6,0,1,0.22,0.2727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,1,3,0,6,0,1,0.24,0.2879,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,1,4,0,6,0,1,0.24,0.2879,...,40.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Splitting data into test and train for both set A and set A+B (rows with yr == 0 become the test set; all other rows become the training set)

In [15]:
# Partition set A on the year flag: rows with yr == 0 form the test set and
# all remaining rows form the training set.
# NOTE(review): if yr == 0 is the earlier year, the model is trained on later
# data and evaluated on earlier data — confirm this direction is intended.
is_year0_setA = dataframesetA["yr"] == 0
dataframesetAtest = dataframesetA[is_year0_setA]
dataframesetAtrain = dataframesetA[~is_year0_setA]
(dataframesetAtest.shape, dataframesetAtrain.shape)

((8645, 13), (8734, 13))

In [16]:
# Partition set A+B on the year flag: rows with yr == 0 form the test set and
# all remaining rows form the training set.
# NOTE(review): if yr == 0 is the earlier year, the model is trained on later
# data and evaluated on earlier data — confirm this direction is intended.
is_year0_setAB = dataframesetAB["yr"] == 0
dataframesetABtest = dataframesetAB[is_year0_setAB]
dataframesetABtrain = dataframesetAB[~is_year0_setAB]
(dataframesetABtest.shape, dataframesetABtrain.shape)

((8645, 25), (8734, 25))

Dropping "yr" column from all test and train datasets

In [17]:
# "yr" was used to define the split; remove it from all four frames so it is
# not passed to the models as a feature.
dataframesetAtest = dataframesetAtest.drop(columns=["yr"])
dataframesetAtrain = dataframesetAtrain.drop(columns=["yr"])
dataframesetABtest = dataframesetABtest.drop(columns=["yr"])
dataframesetABtrain = dataframesetABtrain.drop(columns=["yr"])

In [18]:
# Set A shapes after dropping "yr": one column fewer than before.
dataframesetAtest.shape,dataframesetAtrain.shape

((8645, 12), (8734, 12))

In [19]:
# Set A+B shapes after dropping "yr": one column fewer than before.
dataframesetABtest.shape,dataframesetABtrain.shape

((8645, 24), (8734, 24))


Creating X and y for test and train set A 


In [20]:
# Set A test features: every column except the target "cnt".
dataframesetAtestX = dataframesetAtest.drop(columns="cnt")
# Set A test target: the "cnt" column on its own.
dataframesetAtesty = dataframesetAtest.loc[:, "cnt"]
(dataframesetAtestX.shape, dataframesetAtesty.shape)

((8645, 11), (8645,))

In [21]:
# Set A training features: every column except the target "cnt".
dataframesetAtrainX = dataframesetAtrain.drop(columns="cnt")
# Set A training target: the "cnt" column on its own.
dataframesetAtrainy = dataframesetAtrain.loc[:, "cnt"]
(dataframesetAtrainX.shape, dataframesetAtrainy.shape)

((8734, 11), (8734,))

Creating X and y for test and train set A + B

In [22]:
# Set A+B test features: every column except the target "cnt".
dataframesetABtestX = dataframesetABtest.drop(columns="cnt")
# Set A+B test target: the "cnt" column on its own.
dataframesetABtesty = dataframesetABtest.loc[:, "cnt"]
(dataframesetABtestX.shape, dataframesetABtesty.shape)

((8645, 23), (8645,))

In [23]:
# Set A+B training features: every column except the target "cnt".
dataframesetABtrainX = dataframesetABtrain.drop(columns="cnt")
# Set A+B training target: the "cnt" column on its own.
dataframesetABtrainy = dataframesetABtrain.loc[:, "cnt"]
(dataframesetABtrainX.shape, dataframesetABtrainy.shape)

((8734, 23), (8734,))

In [24]:
# Decision tree for set A; random_state is fixed so reruns build the same tree.
regressorA = DecisionTreeRegressor(random_state=0)

# Train on the set A training split (fit returns the estimator, which the cell displays).
regressorA.fit(X=dataframesetAtrainX, y=dataframesetAtrainy)

DecisionTreeRegressor(random_state=0)

In [25]:
# Decision tree for set A + B; random_state is fixed so reruns build the same tree.
regressorAB = DecisionTreeRegressor(random_state=0)

# Train on the set A + B training split (fit returns the estimator, which the cell displays).
regressorAB.fit(X=dataframesetABtrainX, y=dataframesetABtrainy)

DecisionTreeRegressor(random_state=0)

In [26]:
# Predict on the set A test features, then score against the true counts.
predictionssetA = regressorA.predict(dataframesetAtestX)
maesetA = mean_absolute_error(y_true=dataframesetAtesty, y_pred=predictionssetA)

In [27]:
# Predict on the set A + B test features, then score against the true counts.
predictionssetAB = regressorAB.predict(dataframesetABtestX)
maesetAB = mean_absolute_error(y_true=dataframesetABtesty, y_pred=predictionssetAB)

In [28]:
# Report both errors side by side so the effect of the lag features is visible.
mae_report = (
    ("The Mean Absolute Error for set A: ", maesetA),
    ("The Mean Absolute Error for set A + B: ", maesetAB),
)
for label, mae_value in mae_report:
    print(label + str(mae_value))

The Mean Absolute Error for set A: 92.086003470214
The Mean Absolute Error for set A + B: 37.039329091960674
