In [1]:
# Import the AWS and SageMaker SDKs and set up access to the project's S3 files
import boto3
import io
from sagemaker import get_execution_role

# Execution role reported by the SageMaker SDK (not referenced again in the cells shown here).
role = get_execution_role()

# S3 bucket that the dataframes are read from and the predictions are written to.
bucket = 'sagemaker-data-energy-demand'

In [4]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/0b/9d/ddcb2f43aca194987f1a99e27edf41cf9bc39ea750c3371c2a62698c509a/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 21.6MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [10]:
# Standard library
import sys
import warnings
# Silence every warning for the rest of the session (no category filter is given,
# so this also hides deprecation notices raised by the libraries imported below).
warnings.filterwarnings('ignore')

# Numerical / tabular stack
import numpy as np 
import pandas as pd 

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Apply matplotlib's 'fivethirtyeight' style sheet to all subsequent figures.
plt.style.use('fivethirtyeight')

# scikit-learn helpers: feature scaling and regression metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error


# LightGBM (installed by the pip cell above)
import lightgbm as lgb

In [7]:
# Load the training and validation dataframes for one city from S3.

CITY = 'LosAngeles'

# Training set: object key -> S3 URI -> dataframe indexed by its 'datetime' column.
train_key = 'dataframes/%s/train/data_train.csv' % CITY
train_location = 's3://{}/{}'.format(bucket, train_key)
df_train = pd.read_csv(filepath_or_buffer=train_location, index_col='datetime')

# Validation set, loaded the same way.
# NOTE(review): this key sits under the 'test/' prefix -- confirm that is intended.
validation_key = 'dataframes/%s/test/data_validation.csv' % CITY
validation_location = 's3://{}/{}'.format(bucket, validation_key)
df_validation = pd.read_csv(filepath_or_buffer=validation_location, index_col='datetime')

In [8]:
def data_sppliter(df, label):
    """Separate a dataframe into its feature columns and one target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target.
    label : column label
        Name of the target column in ``df``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` restricted to every column except
        ``label`` (original column order kept) and ``y`` is ``df[label]``.

    Raises
    ------
    ValueError
        If ``label`` is not one of ``df``'s columns.
    """
    feature_names = df.columns.tolist()
    # list.remove raises ValueError when the label is absent from the columns.
    feature_names.remove(label)
    return df[feature_names], df[label]

# Split each dataset into its feature frame and the 'demand(t)' target column.
X_train, y_train = data_sppliter(df=df_train, label='demand(t)')
X_validation, y_validation = data_sppliter(df=df_validation, label='demand(t)')

In [9]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to both sets so no validation statistics enter the fit.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)

# The scaler output is wrapped back into DataFrames that reuse the original
# column labels and index; X_train / X_validation are rebound to the scaled frames.
X_train = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_validation = pd.DataFrame(
    data=X_validation_scaled,
    index=X_validation.index,
    columns=X_validation.columns,
)

## Create LightGBM Model

In [15]:
# Build the (still unfitted) LightGBM regressor: at most 100 boosting rounds,
# learning rate 0.05, up to 31 leaves per tree.
gbm = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.05,
    num_leaves=31,
)

In [20]:
# Train on the training split while monitoring the validation split; training
# stops once the validation scores have not improved for 10 consecutive rounds.
# The recorded log below reports both l1 and l2 on the validation set.
gbm.fit(X_train, y_train,
        eval_set=[(X_validation, y_validation)],
        eval_metric='l1',
        early_stopping_rounds=10)

print('Starting predicting...')

# Predict once with the best iteration found during training and keep the result
# in `y_pred`: the original cell scored a `y_pred` that no cell defines, which
# raises NameError on a fresh kernel (or silently scores a stale array).
y_pred = gbm.predict(X_validation, num_iteration=gbm.best_iteration_)

# Store the predictions next to the validation data; a later cell copies this column.
df_validation['LightGBM_demand_prediction'] = y_pred

# eval -- RMSE on the validation split.
# NOTE(review): this is the same split used for early stopping, so the score is
# likely somewhat optimistic.
print('The rmse of prediction is:', mean_squared_error(y_validation, y_pred) ** 0.5)

[1]	valid_0's l1: 433.691	valid_0's l2: 262657
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 413.587	valid_0's l2: 238460
[3]	valid_0's l1: 395.062	valid_0's l2: 216847
[4]	valid_0's l1: 377.029	valid_0's l2: 197080
[5]	valid_0's l1: 360.295	valid_0's l2: 179379
[6]	valid_0's l1: 343.91	valid_0's l2: 163142
[7]	valid_0's l1: 328.595	valid_0's l2: 148504
[8]	valid_0's l1: 313.939	valid_0's l2: 135330
[9]	valid_0's l1: 300.218	valid_0's l2: 123398
[10]	valid_0's l1: 286.897	valid_0's l2: 112423
[11]	valid_0's l1: 273.59	valid_0's l2: 102340
[12]	valid_0's l1: 261.536	valid_0's l2: 93373.9
[13]	valid_0's l1: 250.462	valid_0's l2: 85383.3
[14]	valid_0's l1: 238.896	valid_0's l2: 77766.6
[15]	valid_0's l1: 228.934	valid_0's l2: 71206.1
[16]	valid_0's l1: 218.542	valid_0's l2: 64995.8
[17]	valid_0's l1: 209.305	valid_0's l2: 59527.4
[18]	valid_0's l1: 200.822	valid_0's l2: 54702
[19]	valid_0's l1: 192.436	valid_0's l2: 50127.9
[20]	valid_0's l1: 184.74	valid_

In [24]:
# Independent single-column frame (validation index kept) holding only the LightGBM predictions.
df_lgb = df_validation.loc[:, ['LightGBM_demand_prediction']].copy()

In [26]:
# Persist the predictions as a CSV object in S3 so another notebook can pick them up.
key = 'predict/%s/pred_lightgbm.csv' % CITY

# Serialise the frame into an in-memory text buffer instead of a local file.
lgb_buffer = io.StringIO()
df_lgb.to_csv(lgb_buffer, compression=None)

# Upload the buffer contents; the put() response is the cell's displayed output.
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=lgb_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '0CE7F6D6AAFA8D3D',
  'HostId': '0V+vGWZNzCCasQdW5tK3MhEs30LlYSs88XWPPpIsRrOwDwJSPblDq8ALvx+u5+E3bfOfVv3mF0M=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '0V+vGWZNzCCasQdW5tK3MhEs30LlYSs88XWPPpIsRrOwDwJSPblDq8ALvx+u5+E3bfOfVv3mF0M=',
   'x-amz-request-id': '0CE7F6D6AAFA8D3D',
   'date': 'Thu, 16 Jan 2020 14:27:41 GMT',
   'etag': '"465ee4ec617d5dca73359c0c01102235"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"465ee4ec617d5dca73359c0c01102235"'}