In [None]:
%matplotlib inline
import sagemaker
from sagemaker import get_execution_role
import pandas as pd 
import numpy as np
import calendar
import boto3
import matplotlib
import matplotlib.pyplot as plt

# SageMaker session object and the role returned by get_execution_role();
# both are passed to the Estimator constructors further down.
sess = sagemaker.Session()
role = get_execution_role()
# boto3 S3 resource handle, used later for put_object and download_file.
s3 = boto3.resource('s3')
# None removes pandas' limit on displayed columns so wide frames are shown in full.
pd.options.display.max_columns = None

### Download, uncompress and load datasets

In [None]:
# Fetch the dataset archive (-O sets the local filename) and extract it;
# unzip -o overwrites existing files without prompting, so the cell can be re-run.
!wget http://edzon.io/datasets/bike_share_dataset.zip -O bike_share_dataset.zip
!unzip -o bike_share_dataset.zip

In [None]:
# Main rentals table.
bike_share_df = pd.read_csv('bike_share_data.csv')

# Category lookup tables, each indexed by its 'id' column.
weathers_df, seasons_df, weekdays_df = (
    pd.read_csv(lookup_file).set_index('id')
    for lookup_file in ('weathers.csv', 'seasons.csv', 'weekdays.csv')
)

### Dataset columns
* **season:** Spring, Summer, Fall or Winter
* **holiday:** 1 - Yes, 0 - No
* **weekday:** Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday
* **workingday:** 1 - Yes, 0 - No
* **weather:** clear - Clear, Few clouds, Partly cloudy<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;misty_cloudy - Misty and Cloudy, Misty with Broken clouds, Misty with Few clouds, Misty<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;ligth_rain_snow - Light Snow, Light Rain and Thunderstorm, Light Rain and Scattered clouds<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;heavy_rain_snow - Heavy Rain and Ice Pellets, Thunderstorm, Snow and Fog
* **temp:** Normalized temperature in Celsius
* **atemp:** Normalized feeling temperature in Celsius
* **humidity:** Normalized humidity
* **count:** Count of total rental bikes aggregated in one hour
* **datetime:** The hour and date

In [None]:
# Preview the first rows of the rentals table as loaded from CSV.
bike_share_df.head()

## Prepare dataset

Since the season, weekday and weather fields are categorical, we have to convert them to one-hot encoding.

#### One hot encoding

In [None]:
def one_hot_encoding(df, categories_df, column_name):
    """Replace a categorical column with one indicator column per known category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. It is not modified; a new frame is returned.
    categories_df : pandas.DataFrame
        Lookup table whose values (flattened) are the full list of categories.
        Every listed category gets a column, even if it never occurs in ``df``.
    column_name : str
        Column to encode. Also used as the prefix of the indicator columns
        (``<column_name>_<category>``).

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column_name``, followed by the indicator columns.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    category_dtype = pd.api.types.CategoricalDtype(categories=categories_df.values.flatten())
    # Cast a derived Series rather than assigning back into df, so the caller's
    # frame keeps its original column and dtype.
    as_category = df[column_name].astype(category_dtype)
    # Pin the dtype: pandas >= 2.0 defaults to bool, which to_csv would write as
    # True/False instead of the numeric 0/1 the training CSV needs.
    indicators = pd.get_dummies(as_category, prefix=column_name, dtype='uint8')
    return pd.concat([df.drop(columns=[column_name]), indicators], axis=1)

In [None]:
# Each call replaces the named column with its indicator columns, so this cell
# is not re-runnable on the already-encoded frame (the source columns are gone).
bike_share_df = one_hot_encoding(bike_share_df,weathers_df,'weather')
bike_share_df = one_hot_encoding(bike_share_df,seasons_df,'season')
bike_share_df = one_hot_encoding(bike_share_df,weekdays_df,'weekday')
bike_share_df.head()

#### Split date

In [None]:
def split_date(df, column_name):
    """Replace a datetime column with ``month`` and ``time_of_day`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. It is not modified; a copy is returned.
    column_name : str
        Column holding values parseable by ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``column_name`` and with two new trailing columns:
        ``month`` (``strftime('%b')`` abbreviation) and ``time_of_day``
        (``'Morning'`` for hour < 12, ``'Afternoon'`` for 12 <= hour < 17,
        ``'Evening'`` for hour >= 17; left missing when the timestamp is missing).
    """
    # Work on a copy: the original aliased the input and mutated it in place.
    result = df.copy()
    # infer_datetime_format is deprecated since pandas 2.0 (format inference is
    # now the default), so it is no longer passed.
    timestamps = pd.to_datetime(result[column_name])
    result['month'] = timestamps.dt.strftime('%b')
    hour = timestamps.dt.hour

    # Assign on the local frame; the original wrote to the global bike_share_df,
    # which only worked when that global happened to be the argument.
    result.loc[hour < 12, 'time_of_day'] = 'Morning'
    result.loc[(hour >= 12) & (hour < 17), 'time_of_day'] = 'Afternoon'
    result.loc[hour >= 17, 'time_of_day'] = 'Evening'

    return result.drop(columns=[column_name])

In [None]:
# Swap the 'datetime' column for the derived 'month' and 'time_of_day' columns.
bike_share_df = split_date(bike_share_df,'datetime')
bike_share_df.head()

In [None]:
def encode_month(df, column_name):
    """One-hot encode a month-abbreviation column against all twelve months.

    Builds a lookup table of the first three letters of each
    ``calendar.month_name`` entry, indexed by month number 1-12, and passes it
    to ``one_hot_encoding`` so every month gets a column.
    """
    month_numbers = range(1, 13)
    month_lookup = pd.DataFrame({
        'id': month_numbers,
        'month': [calendar.month_name[number][:3] for number in month_numbers],
    }).set_index('id')
    return one_hot_encoding(df, month_lookup, column_name)

In [None]:
def encode_time(df, column_name):
    """One-hot encode a time-of-day column over Morning, Afternoon and Evening."""
    periods = pd.DataFrame(['Morning', 'Afternoon', 'Evening'])
    return one_hot_encoding(df, periods, column_name)

In [None]:
# One-hot encode the two columns produced by split_date.
bike_share_df = encode_month(bike_share_df, 'month')
bike_share_df = encode_time(bike_share_df, 'time_of_day')
bike_share_df.head()

#### Move target to first column

In [None]:
def move_target(df,target):
    """Move column ``target`` to position 0 of ``df``, in place.

    The frame passed in is modified and the same object is returned.
    """
    # pop() removes the column from df and hands back its values in one step.
    df.insert(0, target, df.pop(target))
    return df

In [None]:
# Put the 'count' column first; the CSV files written below keep this column order.
bike_share_df = move_target(bike_share_df,'count')
bike_share_df.head()

#### Split into training and validation

In [None]:
def train_val_split(df, train_size, random_state=None):
    """Shuffle ``df`` and split it into training and validation frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    train_size : float
        Fraction of rows for the training part. The training row count is
        ``int(train_size * len(df))``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        ``[train, validation]``.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    cut = int(train_size * len(df))
    # Positional slicing replaces np.split, whose use on DataFrames is deprecated.
    return [shuffled.iloc[:cut], shuffled.iloc[cut:]]

In [None]:
# 70% of the shuffled rows go to training, the remainder to validation.
train_df, val_df = train_val_split(bike_share_df, .7)

train_file='train_data.csv'
val_file='val_data.csv'

# Written without a header row or index column; 'count' is the first column
# because move_target was applied above.
train_df.to_csv(train_file, index=False, header=False)
val_df.to_csv(val_file, index=False, header=False)

### Upload to S3

In [None]:
def upload_to_s3(bucket, prefix, file):
    """Upload a local file to S3 as ``text/csv`` and return its ``s3://`` URI.

    Parameters
    ----------
    bucket : str
        Destination bucket name.
    prefix : str
        Key prefix; the object key is ``<prefix>/<file>``.
    file : str
        Local path to upload. It is used verbatim as the last part of the key.

    Returns
    -------
    str
        ``s3://<bucket>/<prefix>/<file>``. The URI is also printed.

    Uses the module-level ``s3`` resource.
    """
    key = '{}/{}'.format(prefix, file)
    # The context manager closes the handle even if put_object raises;
    # the original opened the file and never closed it.
    with open(file, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data, ContentType='text/csv')
    path = 's3://{}/{}'.format(bucket, key)
    print(path)
    return path

def create_data_channels(train_file, val_file):
    """Upload both CSV files and describe them as SageMaker input channels.

    Reads the module-level ``bucket`` and ``prefix`` at call time. The training
    file is uploaded under ``<prefix>/train`` and the validation file under
    ``<prefix>/val``.

    Returns a dict with keys ``'train'`` and ``'validation'`` mapping to
    ``s3_input`` objects for the uploaded files.
    """
    uploads = (
        ('train', '{}/train'.format(prefix), train_file),
        ('validation', '{}/val'.format(prefix), val_file),
    )
    # Upload everything first (train, then validation), then wrap each URI.
    s3_uris = {
        channel: upload_to_s3(bucket, key_prefix, local_file)
        for channel, key_prefix, local_file in uploads
    }
    return {
        channel: sagemaker.session.s3_input(uri, distribution='FullyReplicated',
                                            content_type='text/csv', s3_data_type='S3Prefix')
        for channel, uri in s3_uris.items()
    }

In [None]:
# Bucket and key prefix for the uploaded data and the model output;
# create_data_channels reads bucket and prefix as globals.
# NOTE(review): the bucket name is hard-coded — replace it with a bucket your role can write to.
bucket = 'edzon-test'
prefix = 'sagemaker/Lab-linear-learner'
output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [None]:
# Upload both CSV files; the resulting channel dict is passed to fit() for both models below.
data_channels = create_data_channels(train_file, val_file)

### Train the model

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the 'linear-learner' image URI for the region of the default boto3 session.
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [None]:
# Estimator for the Linear Learner image: one ml.c5.xlarge training instance,
# with model output written under output_location.
linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c5.xlarge', 
                                       output_path=output_location,
                                       sagemaker_session=sess)

# feature_dim is the column count minus one because the first column is the target.
linear.set_hyperparameters(feature_dim=len(train_df.columns)-1,
                           predictor_type='regressor',
                           early_stopping_patience=50,
                           early_stopping_tolerance=0.001,
                           epochs=120,                           
                           learning_rate='auto',
                           loss='squared_loss',
                           optimizer='sgd',
                           mini_batch_size=100)

# Train on the 'train' and 'validation' channels built above.
linear.fit(data_channels, logs=True)

### Deploy the model

In [None]:
# Deploy the trained model on one ml.m5.xlarge instance and keep the returned predictor.
linear_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

### Realtime inference

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are sent as CSV; responses are decoded with the JSON deserializer.
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [None]:
# Score the training row at position 20 with its 'count' value removed; the
# predicted value is read from the 'score' field of the first 'predictions' entry.
result = linear_predictor.predict(train_df.iloc[20].drop(['count']))
print("prediction: {}".format(result['predictions'][0]['score']))

In [None]:
def plot_actual_vs_prediction(df, predictor):
    """Plot actual targets against per-row predictions from the Linear Learner endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. The first column is taken as the target; the remaining
        columns of each row are sent to the predictor.
    predictor
        Object whose ``predict(features)`` returns a mapping shaped like
        ``{'predictions': [{'score': value}, ...]}``.

    One ``predict`` call is made per row, so keep ``df`` small. As a side effect
    the global matplotlib figure DPI is set to 100.

    Relies on the notebook-level ``matplotlib`` / ``plt`` imports and the
    ``%matplotlib inline`` setting from the first cell; the line magic and the
    duplicate imports that used to sit inside this function were removed
    because a magic is not valid Python inside a function body outside IPython.
    """
    target_column = df.columns[0]
    labels = df[target_column].to_numpy()
    predictions = [
        predictor.predict(row.drop(target_column))['predictions'][0]['score']
        for _, row in df.iterrows()
    ]

    matplotlib.rcParams['figure.dpi'] = 100

    plt.plot(labels, label='actual')
    plt.plot(np.array(predictions), label='prediction')
    # Label the axes so the figure can be read on its own.
    plt.xlabel('sample')
    plt.ylabel(target_column)
    plt.legend()
    plt.show()

In [None]:
# Plot a random sample of 200 validation rows (unseeded, so it differs between runs).
plot_actual_vs_prediction(val_df.sample(n=200), linear_predictor)

### Getting model parameters
#### Download the model

In [None]:
# S3 key of the model archive: <prefix>/output/<training job name>/output/model.tar.gz
model_file = '{}/output/{}/output/model.tar.gz'.format(prefix, linear.latest_training_job.job_name)

In [None]:
import os
import mxnet as mx

# Download the archive into the working directory under its base name (model.tar.gz).
s3.Bucket(bucket).download_file(model_file, os.path.basename(model_file))

#### Extract model

In [None]:
# Unpack the downloaded gzip-compressed archive, listing files as they are extracted.
!tar -zxvf model.tar.gz

#### The Linear Learner model artifact is itself a zip file containing an MXNet model and other metadata

In [None]:
# presumably model_algo-1 is the file extracted from model.tar.gz above — confirm.
# NOTE(review): unlike the dataset cell, unzip runs without -o here, so a re-run may prompt before overwriting.
!unzip model_algo-1

#### Load the mxnet module

In [None]:
# Load the saved module with prefix "mx-mod" at epoch 0
# (presumably these files come from the unzip step above — confirm).
mod = mx.module.Module.load("mx-mod", 0)

#### Model weights

In [None]:
# 'fc0_weight' parameter as a flat NumPy array (read from the module's private _arg_params).
mod._arg_params['fc0_weight'].asnumpy().flatten()

#### Model bias

In [None]:
# 'fc0_bias' parameter as a flat NumPy array (read from the module's private _arg_params).
mod._arg_params['fc0_bias'].asnumpy().flatten()

## Now with XGBoost

In [None]:
# Image URI for 'xgboost' at version '0.90-1'; this rebinds the container
# variable that held the Linear Learner image.
container = get_image_uri(boto3.Session().region_name, 'xgboost', '0.90-1')

In [None]:
# Same instance type, role, session and output location as the Linear Learner
# estimator, but using the XGBoost image.
xgboost = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c5.xlarge', 
                                       output_path=output_location,
                                       sagemaker_session=sess)

# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror'
# and deprecate 'reg:linear' and 'silent' — confirm against the 0.90-1 container docs.
xgboost.set_hyperparameters(max_depth=6,
                            eta=0.3,
                            gamma=0,
                            min_child_weight=1,
                            subsample=1,
                            silent=0,
                            objective="reg:linear",
                            num_round=120)

# Train on the same 'train' and 'validation' channels used for Linear Learner.
xgboost.fit(inputs=data_channels, logs=True)

### Deploy the model

In [None]:
# Deploy the XGBoost model on its own ml.m5.xlarge instance, separate from linear_predictor.
xgboost_predictor = xgboost.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

### Realtime inference

In [None]:
# Same request/response setup as linear_predictor: CSV in, JSON-decoded out.
xgboost_predictor.content_type = 'text/csv'
xgboost_predictor.serializer = csv_serializer
xgboost_predictor.deserializer = json_deserializer

In [None]:
# Score the same training row (position 20). The decoded response is printed
# as-is, without indexing into 'predictions' as the Linear Learner cell does.
result = xgboost_predictor.predict(train_df.iloc[20].drop(['count']))
print("prediction: {}".format(result))

In [None]:
def xgboost_plot_actual_vs_prediction(df, predictor):
    """Plot actual targets against per-row predictions from the XGBoost endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. The first column is taken as the target; the remaining
        columns of each row are sent to the predictor.
    predictor
        Object whose ``predict(features)`` return value is plotted directly
        as the prediction for that row.

    One ``predict`` call is made per row, so keep ``df`` small. As a side effect
    the global matplotlib figure DPI is set to 100.

    Relies on the notebook-level ``matplotlib`` / ``plt`` imports and the
    ``%matplotlib inline`` setting from the first cell; the line magic and the
    duplicate imports that used to sit inside this function were removed
    because a magic is not valid Python inside a function body outside IPython.
    """
    target_column = df.columns[0]
    labels = df[target_column].to_numpy()
    predictions = [
        predictor.predict(row.drop(target_column))
        for _, row in df.iterrows()
    ]

    matplotlib.rcParams['figure.dpi'] = 100

    plt.plot(labels, label='actual')
    plt.plot(np.array(predictions), label='prediction')
    # Label the axes so the figure can be read on its own.
    plt.xlabel('sample')
    plt.ylabel(target_column)
    plt.legend()
    plt.show()

In [None]:
# Keep the 200-row sample in a variable so the same rows can be plotted for both models.
sample = val_df.sample(n=200)
xgboost_plot_actual_vs_prediction(sample, xgboost_predictor)

### Compared with linear regression

In [None]:
# Same 200 rows as the XGBoost plot, scored by the Linear Learner endpoint.
plot_actual_vs_prediction(sample, linear_predictor)