In [115]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import hvplot.pandas
import yfinance as yf
import pandas_datareader as pdr
import datetime as dt

In [116]:
# Import stock data to dataframe
tickers_index = 'AGG'

start = dt.datetime(2017, 1, 1)
end = dt.datetime(2021, 12, 31)
 
df_1 = pdr.get_data_yahoo(tickers_index, start, end)
df_1.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,108.220001,107.730003,107.730003,108.169998,5079200.0,94.6717
2017-01-04,108.209999,108.029999,108.160004,108.199997,3297100.0,94.697968
2017-01-05,108.68,108.220001,108.370003,108.650002,5165900.0,95.091789
2017-01-06,108.5,108.260002,108.43,108.290001,2910100.0,94.776733
2017-01-09,108.540001,108.43,108.540001,108.470001,3030500.0,94.934265


In [117]:
# Function to clean df (drop columns, delete null values)

def clean_df(df):
    if 'Open' in df.columns:
        df = df.drop(['Open','High','Low','Volume','Adj Close'],axis=1) #drop unwanted columns
        df = df.dropna().copy() #drop null values
        print(f"Number of Null Values: {df.isnull().sum().sum()}") #check for null values and print
    return df #return df to variable

In [118]:
# Create a new dataframe for AGG

df_1 = clean_df(df_1)
df_1.columns = ['AGG']

Number of Null Values: 0


In [119]:
df_1.head()

Unnamed: 0_level_0,AGG
Date,Unnamed: 1_level_1
2017-01-03,108.169998
2017-01-04,108.199997
2017-01-05,108.650002
2017-01-06,108.290001
2017-01-09,108.470001


In [120]:
# Calculate the daily returns using the closing prices and the pct_change function
df_1["actual_returns"] = df_1["AGG"].pct_change()

# Drop all NaN values from the DataFrame
df_1 = df_1.dropna()

# Review the DataFrame
display(df_1.head())


Unnamed: 0_level_0,AGG,actual_returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-04,108.199997,0.000277
2017-01-05,108.650002,0.004159
2017-01-06,108.290001,-0.003313
2017-01-09,108.470001,0.001662
2017-01-10,108.440002,-0.000277


In [121]:
# Create a simple moving average (SMA) using the short_window and assign this to a new columns called sma_fast
# Define a window size of 50
short_window = 50
df_1["sma_fast"] = df_1["AGG"].rolling(window=short_window).mean()
# Define a window size of 100
long_window = 100

# Create a simple moving average (SMA) using the long_window and assign this to a new columns called sma_slow
df_1["sma_slow"] = df_1["AGG"].rolling(window=long_window).mean()

In [122]:
# Drop the NaNs using dropna()
df_1 = df_1.dropna()


In [123]:
# Create a new column in the trading_df called signal setting its value to zero.
df_1["signal"] = 0.0
df_1

Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-26,109.440002,-0.000365,108.867999,108.5122,0.0
2017-05-30,109.650002,0.001919,108.903199,108.5267,0.0
2017-05-31,109.760002,0.001003,108.937399,108.5378,0.0
2017-06-01,109.480003,-0.002551,108.961600,108.5497,0.0
2017-06-02,109.839996,0.003288,108.990000,108.5634,0.0
...,...,...,...,...,...
2021-12-27,114.260002,0.000525,114.398800,114.9852,0.0
2021-12-28,114.209999,-0.000438,114.391600,114.9642,0.0
2021-12-29,113.870003,-0.002977,114.379200,114.9450,0.0
2021-12-30,114.120003,0.002195,114.378400,114.9304,0.0


In [124]:
# Generate the trading signal 0 or 1,
# where 1 is the short-window (SMA50) greater than the long-window (SMA100)
# and 0 is when the condition is not met
df_1["signal"][short_window:] = np.where(
    df_1["sma_slow"][short_window:] > df_1["sma_fast"][short_window:], 1.0, 0.0
)

# Review the DataFrame
df_1.tail(10)

Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-17,114.5,0.001224,114.41,115.096,1.0
2021-12-20,114.330002,-0.001485,114.414,115.0763,1.0
2021-12-21,114.239998,-0.000787,114.4198,115.0541,1.0
2021-12-22,114.339996,0.000875,114.4192,115.0321,1.0
2021-12-23,114.199997,-0.001224,114.4106,115.0083,1.0
2021-12-27,114.260002,0.000525,114.3988,114.9852,1.0
2021-12-28,114.209999,-0.000438,114.3916,114.9642,1.0
2021-12-29,113.870003,-0.002977,114.3792,114.945,1.0
2021-12-30,114.120003,0.002195,114.3784,114.9304,1.0
2021-12-31,114.080002,-0.000351,114.3784,114.9169,1.0


In [125]:
 # Slice the DataFrame to confirm the Signal
df_1.loc["2020-04-09":"2020-04-20"]

Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-09,117.25,0.012434,114.461,113.6676,0.0
2020-04-13,116.699997,-0.004691,114.5088,113.7105,0.0
2020-04-14,116.760002,0.000514,114.551,113.7519,0.0
2020-04-15,117.239998,0.004111,114.6084,113.7973,0.0
2020-04-16,117.389999,0.001279,114.6764,113.8415,0.0
2020-04-17,117.25,-0.001193,114.745,113.8859,0.0
2020-04-20,117.080002,-0.00145,114.8088,113.928,0.0


In [126]:
# Calculate the points in time when the Signal value changes
# Identify trade entry (1) and exit (-1) points
df_1["Entry/Exit"] = df_1["signal"].diff()

# Review the DataFrame
df_1.loc["2020-03-14":"2020-03-25"]

Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal,Entry/Exit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-16,113.730003,0.016717,114.286201,113.426,0.0,0.0
2020-03-17,110.790001,-0.025851,114.241801,113.4073,0.0,0.0
2020-03-18,106.57,-0.03809,114.114801,113.3455,0.0,0.0
2020-03-19,108.949997,0.022333,114.0378,113.3072,0.0,0.0
2020-03-20,110.029999,0.009913,113.985,113.2813,0.0,0.0
2020-03-23,112.639999,0.023721,113.982,113.2839,0.0,0.0
2020-03-24,113.57,0.008256,113.9938,113.2957,0.0,0.0
2020-03-25,114.730003,0.010214,114.0302,113.3159,0.0,0.0


In [127]:
# Visualize exit position relative to close price
exit = df_1[df_1["Entry/Exit"] == -1.0]["AGG"].hvplot.scatter(
    color="yellow",
    marker="v",
    size=200,
    legend=False,
    ylabel="Price in $",
    width=1000,
    height=400)

# Show the plot
exit

In [128]:
 # Calculate the daily returns using the closing prices and the pct_change function
df_1["actual_returns"] = df_1["AGG"].pct_change()

# Drop all NaN values from the DataFrame
df_1 = df_1.dropna()

# Review the DataFrame
display(df_1.head())
display(df_1.tail())

Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal,Entry/Exit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-05-30,109.650002,0.001919,108.903199,108.5267,0.0,0.0
2017-05-31,109.760002,0.001003,108.937399,108.5378,0.0,0.0
2017-06-01,109.480003,-0.002551,108.9616,108.5497,0.0,0.0
2017-06-02,109.839996,0.003288,108.99,108.5634,0.0,0.0
2017-06-05,109.669998,-0.001548,109.0144,108.5757,0.0,0.0


Unnamed: 0_level_0,AGG,actual_returns,sma_fast,sma_slow,signal,Entry/Exit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-27,114.260002,0.000525,114.3988,114.9852,1.0,0.0
2021-12-28,114.209999,-0.000438,114.3916,114.9642,1.0,0.0
2021-12-29,113.870003,-0.002977,114.3792,114.945,1.0,0.0
2021-12-30,114.120003,0.002195,114.3784,114.9304,1.0,0.0
2021-12-31,114.080002,-0.000351,114.3784,114.9169,1.0,0.0


In [129]:
# Imports 
from pandas.tseries.offsets import DateOffset

In [130]:
 # Assign a copy of the sma_fast and sma_slow columns to a new DataFrame called X
X = df_1[["sma_fast", "sma_slow"]].copy()

# Display sample data
display(X.head())
display(X.tail())

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-30,108.903199,108.5267
2017-05-31,108.937399,108.5378
2017-06-01,108.9616,108.5497
2017-06-02,108.99,108.5634
2017-06-05,109.0144,108.5757


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-27,114.3988,114.9852
2021-12-28,114.3916,114.9642
2021-12-29,114.3792,114.945
2021-12-30,114.3784,114.9304
2021-12-31,114.3784,114.9169


In [131]:
# Copy the new signal column to a new Series called y.
# Creating the target set y
y = df_1["signal"]

# Display sample data
y.head()

Date
2017-05-30    0.0
2017-05-31    0.0
2017-06-01    0.0
2017-06-02    0.0
2017-06-05    0.0
Name: signal, dtype: float64

In [132]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2017-05-30 00:00:00


In [133]:
# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)

# Display the training end date
print(training_end)

2017-08-30 00:00:00


In [134]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Display sample data
X_train.head()

Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-30,108.903199,108.5267
2017-05-31,108.937399,108.5378
2017-06-01,108.9616,108.5497
2017-06-02,108.99,108.5634
2017-06-05,109.0144,108.5757


In [135]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end:]
y_test = y.loc[training_end:]

# Display sample data
X_test.head()


Unnamed: 0_level_0,sma_fast,sma_slow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-30,109.703,109.4947
2017-08-31,109.7124,109.5134
2017-09-01,109.7112,109.5242
2017-09-05,109.7186,109.5371
2017-09-06,109.7216,109.5474


In [136]:
# Imports
from sklearn.preprocessing import StandardScaler

In [137]:
# Split the preprocessed data into training and testing datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [138]:
# Create a StandardScaler instance
scaler = StandardScaler()
 
# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)
 
# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train


Date
2020-10-09    0.0
2020-11-19    1.0
2021-05-26    1.0
2020-11-12    1.0
2021-10-08    0.0
             ... 
2020-04-01    0.0
2020-12-31    1.0
2021-10-05    0.0
2018-05-04    1.0
2021-08-16    0.0
Name: signal, Length: 868, dtype: float64

In [139]:
# Import Amazon SageMaker libraries and modules
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer

# Import AWS Python SDK
import boto3

# Import support libraries
import io
import os
import json
import numpy as np

In [81]:
# Set the S3 bucket name
bucket = "fintechbootcamp-pankaj-sep-08-1"

In [140]:
# Set a prefix for the data files
prefix = "Machine_Learning_2"

In [141]:
# Set the IAM execution role
role = get_execution_role()

In [143]:
# Encode the training data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_train_scaled).astype("float32")
labels = np.array(y_train).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded training data to Amazon S3
key = 'linear_train.data'
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = "s3://{}/{}/train/{}".format(bucket, prefix, key)
print("Training data uploaded to: {}".format(s3_train_data))

Training data uploaded to: s3://fintechbootcamp-pankaj-sep-08-1/Machine_Learning_2/train/linear_train.data


In [144]:
# Encode the testing data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_test_scaled).astype("float32")
labels = np.array(y_test).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded testing data to Amazon S3
key = "linear_test.data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = "s3://{}/{}/test/{}".format(bucket, prefix, key)
print("Testing data uploaded to: {}".format(s3_test_data))

Testing data uploaded to: s3://fintechbootcamp-pankaj-sep-08-1/Machine_Learning_2/test/linear_test.data


In [145]:
# Save the current session in a variable
sess = sagemaker.Session()

In [146]:
# Import the get_image_uri module from the sagemaker library
from sagemaker.amazon.amazon_estimator import get_image_uri

In [147]:
# Import the container image
container = get_image_uri(boto3.Session().region_name, "linear-learner")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [148]:
# Create an instance of the machine learning model
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [149]:
# Get the dimension of the feature-input vector
feature_dim = X.shape[1]

In [150]:
# Define linear learner hyperparameters
# Note how in this case we use: predictor_type='binary_classifier' # (credit risk: good or bad)
linear.set_hyperparameters(
    feature_dim=feature_dim,
    mini_batch_size=200,
    predictor_type="binary_classifier"
)

In [151]:
# Fitting the linear learner model
linear.fit({"train": s3_train_data, "test": s3_test_data})

2022-09-20 03:39:52 Starting - Starting the training job...
2022-09-20 03:40:19 Starting - Preparing the instances for trainingProfilerReport-1663645192: InProgress
.........
2022-09-20 03:41:49 Downloading - Downloading input data...
2022-09-20 03:42:19 Training - Downloading the training image...........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/20/2022 03:44:07 INFO 139884342290240] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.

In [153]:
 # Deploy an instance of the linear-learner model to create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

------------------!

In [154]:
print(linear_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


linear-learner-2022-09-20-04-02-20-418


In [155]:
 # Linear predictor configurations
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [156]:
# Making some predictions using the test data
model_predictions = linear_predictor.predict(X_test_scaled)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [157]:
# Display sample predictions
model_predictions["predictions"][:5]

[{'score': 0.275529146194458, 'predicted_label': 0},
 {'score': 0.7046324610710144, 'predicted_label': 1},
 {'score': 0.7296398282051086, 'predicted_label': 1},
 {'score': 0.18422618508338928, 'predicted_label': 0},
 {'score': 0.2923662066459656, 'predicted_label': 0}]

In [158]:
# Create a list with the predicted values
y_predictions = [np.uint8(value["predicted_label"]) for value in model_predictions["predictions"]]

# Transforming the list into an array
y_predictions = np.array(y_predictions)

# Display sample data
y_predictions[:10]

array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0], dtype=uint8)

In [163]:
# Import the classification report from Scikit-learn
from sklearn.metrics import classification_report

In [164]:
 # Display classification report
print("Classification report")
print(classification_report(y_test, y_predictions))

Classification report
              precision    recall  f1-score   support

         0.0       0.99      0.87      0.93       156
         1.0       0.87      0.99      0.92       134

    accuracy                           0.92       290
   macro avg       0.93      0.93      0.92       290
weighted avg       0.93      0.92      0.92       290



In [169]:
# Imports
from sklearn import svm
from sklearn.metrics import classification_report

In [170]:
# Create the classifier model
svm_model = svm.SVC()
 
# Fit the model to the data using X_train_scaled and y_train
svm_model = svm_model.fit(X_train_scaled, y_train)

# Use the trained model to predict the trading signals for the training data
training_signal_predictions = svm_model.predict(X_train_scaled)

# Display the sample predictions
training_signal_predictions[:10]

array([1., 1., 1., 1., 0., 0., 0., 0., 0., 1.])

In [171]:
# Evaluate the model using a classification report
training_report = classification_report(y_train, training_signal_predictions)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92       475
         1.0       0.91      0.90      0.91       393

    accuracy                           0.91       868
   macro avg       0.91      0.91      0.91       868
weighted avg       0.91      0.91      0.91       868

