* Importing the libararies

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler(feature_range=(-1,1))

In [8]:
data=pd.read_csv('Data/Google_Stock_Price_Train.csv')

In [9]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2012,325.25,332.83,324.97,663.59,7380500
1,1/4/2012,331.27,333.87,329.08,666.45,5749400
2,1/5/2012,329.83,330.75,326.89,657.21,6590300
3,1/6/2012,328.34,328.77,323.68,648.24,5405900
4,1/9/2012,322.04,322.29,309.46,620.76,11688800


In [10]:
train_data=data[['Open','High','Low']]
#train_data=[x.replace(',','') for x in train_data.values]
train_data=np.array(train_data)

In [11]:
train_data=scaler.fit_transform(train_data)

In [12]:
print(train_data)

[[-0.82837265 -0.80719742 -0.81910138]
 [-0.80597515 -0.80331298 -0.80353084]
 [-0.81133269 -0.81496629 -0.81182755]
 ...
 [ 0.91450257  0.91614843  0.91688135]
 [ 0.87592083  0.88514763  0.90066677]
 [ 0.87376293  0.87338226  0.86842703]]


In [47]:
y_train=data['Close']
y_train=[x.replace(',','') for x in y_train.values]
y_train=np.array(y_train)
y_train=scaler.fit_transform(y_train.reshape(-1,1))


* Now as we have to upload data to s3, so we have to use Sagemaker Resources

In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role

In [4]:
# Sagemaker session and role
sagemaker_session=sagemaker.Session()
role=sagemaker.get_execution_role()

# Default s3 bucket
bucket=sagemaker_session.default_bucket()

* Function to create csv files

In [45]:
import os

def make_csv(x, y, filename, data_dir):
    '''Merges features and labels and converts them into one csv file with labels in the first column.
       :param x: Data features
       :param y: Data labels
       :param file_name: Name of csv file, ex. 'train.csv'
       :param data_dir: The directory where files will be saved
       '''
    # make data dir, if it does not exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # first column is the labels and rest is features 
    pd.concat([pd.DataFrame(y), pd.DataFrame(x)], axis=1)\
             .to_csv(os.path.join(data_dir, filename), header=False, index=False)
    
    # nothing is returned, but a print statement indicates that the function has run
    print('Path created: '+str(data_dir)+'/'+str(filename))

In [48]:
data_dir='data_dir'
filename='train.csv'

make_csv(train_data,y_train,filename,data_dir)

Path created: data_dir/train.csv


* Upload data to s3

In [49]:
prefix='Temp-Capstone-Project'

# upload to s3
input_data=sagemaker_session.upload_data(path=data_dir,bucket=bucket,key_prefix=prefix)

##### Creating PyTorch Estimator

In [50]:
from sagemaker.pytorch import PyTorch

output_path='s3://{}/{}/'.format(bucket,prefix)




In [51]:
estimator_with_L1loss=PyTorch(entry_point='train.py',
                 source_dir='source',
                 role=role,
                 sagemaker_session=sagemaker_session,
                 train_instance_count=1,
                 output_path=output_path,
                 train_instance_type='ml.c4.xlarge',
                 hyperparameters={
                     'input_dim':3,
                     'hidden_dim':30,
                     'output_dim':1,
                     'epochs':40
                 },
                 framework_version='1.0')

In [52]:
estimator_with_L1loss.fit({'train':input_data})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-09-03 16:09:51 Starting - Starting the training job...
2020-09-03 16:09:53 Starting - Launching requested ML instances......
2020-09-03 16:11:19 Starting - Preparing the instances for training.........
2020-09-03 16:12:41 Downloading - Downloading input data
2020-09-03 16:12:41 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-09-03 16:13:02,949 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-09-03 16:13:02,952 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-03 16:13:02,964 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-09-03 16:13:02,965 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-09-03 16:13:03,213 sagemaker-containers INFO     Module train does not pr

In [48]:
estimator.fit({'train':input_data})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-09-03 08:57:33 Starting - Starting the training job...
2020-09-03 08:57:35 Starting - Launching requested ML instances......
2020-09-03 08:59:02 Starting - Preparing the instances for training.........
2020-09-03 09:00:22 Downloading - Downloading input data
2020-09-03 09:00:22 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-09-03 09:00:37,074 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-09-03 09:00:37,077 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-09-03 09:00:37,089 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-09-03 09:00:40,125 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2020-09-03 09:00:40,385 sagemaker-containers INFO     Module train does not pr

### We have to create a pytorch model first, before deploying it    

In [57]:
print(estimator_with_L1loss.model_data)

s3://sagemaker-us-west-2-886035371869/Temp-Capstone-Project/sagemaker-pytorch-2020-09-03-16-09-51-504/output/model.tar.gz


In [53]:
# Create a model from trained estimator data
# and point to the prediction script     ****here we import PyTorchModel,,,,not PyTorch***
from sagemaker.pytorch import PyTorchModel
model=PyTorchModel(model_data=estimator_with_L1loss.model_data,
             role=role,
             framework_version='1.0',
             entry_point='predict.py',
             source_dir='source')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


## Now Deploying it

In [54]:
%%time
predictor_from_L1loss=model.deploy(initial_instance_count=1,instance_type='ml.t2.medium')

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


---------------!CPU times: user 380 ms, sys: 48.4 ms, total: 428 ms
Wall time: 7min 33s


In [55]:
predictions_from_L1loss_model=predictor_from_L1loss.predict(transformed_test)

In [56]:
print(predictions_from_L1loss_model)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


### Very important thing, I was saving the model in 'data_dir' instead of 'model_dir' in train.py

In [67]:

# Now making predictor from already deployed endpoint,and see that if it can perform inference like the one,'predictor_from_L1loss'
# **** Here I have to use the endpoint name, from the deployed endpoint if i want to perform real time predictions
from sagemaker.predictor import npy_serializer,numpy_deserializer

predictor_from_L1loss_2nd=sagemaker.predictor.RealTimePredictor(endpoint=predictor_from_L1loss.endpoint,
                                                                serializer=npy_serializer,deserializer=numpy_deserializer)

# **** So very important specify  the serializer, and deserializer ,as above **********

## Now we will perform some inference

In [60]:
# TO get the name of endpoint

# **** So look both are sames *******
endpoint_name=predictor_from_L1loss.endpoint
print(endpoint_name)
print(predictor_from_L1loss_2nd.endpoint)

sagemaker-pytorch-2020-09-03-16-32-29-482
sagemaker-pytorch-2020-09-03-16-32-29-482


In [68]:
# Checking if 'predictor_from_L1loss_2nd' can perform predictions in the same way as 'predictor_from_L1loss'
predictions_from_L1loss_2nd=predictor_from_L1loss_2nd.predict(transformed_test)

* Very important as ,I included deserializer above( in RealTimePredictor), I got predictions

In [70]:
print(predictions_from_L1loss_2nd)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


## So, this is most important ....I cannot have predictions from RealTimePredictor, in the same 
##  way as I had from 'predictor_from_L1Loss' error 'type: <class 'numpy.ndarray'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object'

### So,very important custom serializer and deserializer

In [77]:
# Custom Serializer
def _npy_dumps(data):
    """
    Serializes a numpy array into a stream of npy-formatted bytes.
    """
    from six import BytesIO
    import numpy as np
    buffer = BytesIO()
    np.save(buffer, data)
    return buffer.getvalue()

In [81]:
x=np.array([565.5, 675.0, 666.0])
x_in=_npy_dumps(x)
print(x_in)

b"\x93NUMPY\x01\x00v\x00{'descr': '<f8', 'fortran_order': False, 'shape': (3,), }                                                            \n\x00\x00\x00\x00\x00\xac\x81@\x00\x00\x00\x00\x00\x18\x85@\x00\x00\x00\x00\x00\xd0\x84@"


In [82]:
# Custom Desializer
def _npy_loads(data):
    """
    Deserializes npy-formatted bytes into a numpy array
    """
    from six import BytesIO
    import numpy as np
    stream = BytesIO(data)
    return np.load(stream)

In [83]:
x_dese=_npy_loads(x_in)
print(x_dese)

[565.5 675.  666. ]


In [87]:
runtime=boto3.Session().client('sagemaker-runtime')
endpoint_name='sagemaker-pytorch-2020-09-03-16-32-29-482'
x=np.array([565.5, 675.0, 666.0])
#input_d=io.TextIOWrapper(x,encoding="utf-8")
response=runtime.invoke_endpoint(EndpointName=endpoint_name,
                                 
                                ContentType = 'application/x-npy',
                                Accept='application/x-npy',# Very important, today I came to know the importance of logs, from there I 
                                Body=x_in)                  #catch, the error, I was not giving Accept parameter

In [88]:
# Now let's see what we have in response
print(response)

{'ResponseMetadata': {'RequestId': '95847fc4-2f9a-4881-b246-368a53c46032', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '95847fc4-2f9a-4881-b246-368a53c46032', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Thu, 3 Sep 2020 17:46:21 GMT', 'content-type': 'application/x-npy', 'content-length': '132'}, 'RetryAttempts': 0}, 'ContentType': 'application/x-npy', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f59fcdd9748>}


* But our inference is stored in response['Body']

In [92]:
#print(response['Body'])# But we have to decode it
print(response['Body'].read().decode('utf-8'))




### Lets try to predict collectively on test features

In [93]:
test_df=pd.read_csv('Data/Google_Stock_Price_Test.csv')

In [94]:
test_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1/3/2017,778.81,789.63,775.8,786.14,1657300
1,1/4/2017,788.36,791.34,783.16,786.9,1073000
2,1/5/2017,786.08,794.48,785.02,794.02,1335200
3,1/6/2017,795.26,807.9,792.2,806.15,1640200
4,1/9/2017,806.4,809.97,802.83,806.65,1272400


In [95]:
required_test_df=test_df[['Open','High','Low']]
required_test_df.head()

Unnamed: 0,Open,High,Low
0,778.81,789.63,775.8
1,788.36,791.34,783.16
2,786.08,794.48,785.02
3,795.26,807.9,792.2
4,806.4,809.97,802.83


In [96]:
transformed_test=scaler.fit_transform(required_test_df.values)

In [99]:
xx_in=_npy_dumps(transformed_test)# here I have serialized the test data before taking inference from endpoint

# Most Important results

In [2]:
response_2=runtime.invoke_endpoint(EndpointName=endpoint_name,
                                 
                                ContentType = 'application/x-npy',
                                Accept='application/x-npy',# Very important, today I came to know the importance of logs, from there I 
                                Body=xx_in) # So here I have to provide the data in bytes , whereas in predictor, where it access the predict.py
                                            #file, and serializes the input data itself

NameError: name 'runtime' is not defined

In [1]:
print(response_2['Body'].read())

NameError: name 'response_2' is not defined

In [24]:
transformed_test_df=pd.DataFrame(transformed_test)

In [27]:
transformed_test_df.head()

Unnamed: 0,0,1,2
0,-1.0,-1.0,-1.0
1,-0.676271,-0.934633,-0.712556
2,-0.753559,-0.814602,-0.639914
3,-0.442373,-0.301606,-0.3595
4,-0.064746,-0.222477,0.055653


In [28]:
transformed_test_df.to_csv('Data/test.csv',header=None,index=None)

In [38]:

from sagemaker.predictor import csv_serializer,json_deserializer,json_serializer
predictor_2.content_type='text/csv'
#predictor.serializer=json_serializer
#predictor.deserializer=json_deserializer
path='Data/test.csv'
inputs=pd.read_csv(path,header=None)
import io
with open(path,'rb') as f:
    bytes_input=io.TextIOWrapper(f, encoding="utf-8")
    predictions=predictor_2.predict(bytes_input)

    



TypeError: Unicode-objects must be encoded before hashing

In [71]:
real_close_values=test_df['Close'].values
scaled_real_close_df=scaler.fit_transform(real_close_values.reshape(-1,1))

In [72]:
retransformed_predictions=scaler.inverse_transform(predictions)

In [73]:
print(predictions)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
