In [334]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import yfinance as yf
import hvplot.pandas
import pandas_datareader as pdr
import datetime as dt

In [351]:
# Import stock data to dataframe
tickers_index = 'SPY'

start = dt.datetime(2017, 1, 1)
end = dt.datetime(2022, 8, 31)
 
trading_df = pdr.get_data_yahoo(tickers_index, start, end)
print(trading_df.head())
print(trading_df.tail())

                  High         Low        Open       Close      Volume  \
Date                                                                     
2017-01-03  225.830002  223.880005  225.039993  225.240005  91366500.0   
2017-01-04  226.750000  225.610001  225.619995  226.580002  78744400.0   
2017-01-05  226.580002  225.479996  226.270004  226.399994  78379000.0   
2017-01-06  227.750000  225.899994  226.529999  227.210007  71559900.0   
2017-01-09  227.070007  226.419998  226.910004  226.460007  46939700.0   

             Adj Close  
Date                    
2017-01-03  203.788071  
2017-01-04  205.000412  
2017-01-05  204.837540  
2017-01-06  205.570404  
2017-01-09  204.891815  
                  High         Low        Open       Close       Volume  \
Date                                                                      
2022-08-25  419.559998  414.089996  415.239990  419.510010   50942300.0   
2022-08-26  419.959991  405.250000  419.390015  405.309998  103087000.0   
2022-0

In [283]:
# Function to clean df (drop columns, delete null values)
def clean_df(trading_df):
    if 'Open' in trading_df.columns:
        trading_df = trading_df.drop(['Open','High','Low','Volume','Adj Close'],axis=1) #drop unwanted columns
        trading_df = trading_df.dropna().copy() #drop null values
        print(f"Number of Null Values: {trading_df.isnull().sum().sum()}") #check for null values and print
    return trading_df #return df to variable

In [284]:
# Create a new dataframe for SPY
trading_df = clean_df(trading_df)
trading_df.columns = ['SPY']

Number of Null Values: 0


In [285]:
trading_df

Unnamed: 0_level_0,SPY
Date,Unnamed: 1_level_1
2017-01-03,225.240005
2017-01-04,226.580002
2017-01-05,226.399994
2017-01-06,227.210007
2017-01-09,226.460007
...,...
2022-08-25,419.510010
2022-08-26,405.309998
2022-08-29,402.630005
2022-08-30,398.209991


In [286]:
# Use the pct_change function to generate returns from close prices
trading_df["Actual Returns"] = trading_df["SPY"].pct_change()

In [287]:
# Drop all NaN values from the DataFrame
trading_df = trading_df.dropna()
trading_df.head()

Unnamed: 0_level_0,SPY,Actual Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-04,226.580002,0.005949
2017-01-05,226.399994,-0.000794
2017-01-06,227.210007,0.003578
2017-01-09,226.460007,-0.003301
2017-01-10,226.460007,0.0


In [288]:
# Define a window size of 50
short_window = 50

# Create a simple moving average (SMA) using the short_window and assign this to a new columns called sma_fast
trading_df["sma_fast"] = trading_df["SPY"].rolling(window=short_window).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [289]:
# Define a window size of 100
long_window = 100

# Create a simple moving average (SMA) using the long_window and assign this to a new columns called sma_slow
trading_df["sma_slow"] = trading_df["SPY"].rolling(window=long_window).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [290]:
# Drop the NaNs using dropna()
trading_df = trading_df.dropna()

In [291]:
# Create a new column in the trading_df called signal setting its value to zero.
trading_df["signal"] = 0
trading_df

Unnamed: 0_level_0,SPY,Actual Returns,sma_fast,sma_slow,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-26,241.710007,-0.000207,236.972800,234.438900,0
2017-05-30,241.500000,-0.000869,237.062200,234.588100,0
2017-05-31,241.440002,-0.000248,237.155600,234.738500,0
2017-06-01,243.360001,0.007952,237.348200,234.900000,0
2017-06-02,244.169998,0.003328,237.546000,235.077100,0
...,...,...,...,...,...
2022-08-25,419.510010,0.014118,398.041401,406.921401,0
2022-08-26,405.309998,-0.033849,398.563600,406.406501,0
2022-08-29,402.630005,-0.006612,399.283201,405.922501,0
2022-08-30,398.209991,-0.010978,399.930201,405.439401,0


In [292]:
# Generate the trading signal 0 or 1,
# where 1 is the short-window (SMA50) greater than the long-window (SMA100)
# and 0 is when the condition is not met
trading_df["signal"][short_window:] = np.where(
    trading_df["sma_slow"][short_window:] > trading_df["sma_fast"][short_window:], 1.0, 0.0
)

# Review the DataFrame
trading_df.tail(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,SPY,Actual Returns,sma_fast,sma_slow,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-08-18,427.890015,0.002906,395.447599,408.918401,1.0
2022-08-19,422.140015,-0.013438,395.666,408.580701,1.0
2022-08-22,413.350006,-0.020822,395.9042,408.098701,1.0
2022-08-23,412.350006,-0.002419,396.3552,407.635201,1.0
2022-08-24,413.670013,0.003201,397.1286,407.255501,1.0
2022-08-25,419.51001,0.014118,398.041401,406.921401,1.0
2022-08-26,405.309998,-0.033849,398.5636,406.406501,1.0
2022-08-29,402.630005,-0.006612,399.283201,405.922501,1.0
2022-08-30,398.209991,-0.010978,399.930201,405.439401,1.0
2022-08-31,395.179993,-0.007609,400.332401,404.903501,1.0


In [293]:
trading_df['signal'].value_counts()

0.0    1005
1.0     321
Name: signal, dtype: int64

In [294]:
# Calculate the strategy returns and add them to the signals_df DataFrame
trading_df['Strategy Returns'] = trading_df['Actual Returns'] * trading_df['signal'].shift()

# Review the DataFrame
display(trading_df.head())
display(trading_df.tail())

Unnamed: 0_level_0,SPY,Actual Returns,sma_fast,sma_slow,signal,Strategy Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-05-26,241.710007,-0.000207,236.9728,234.4389,0.0,
2017-05-30,241.5,-0.000869,237.0622,234.5881,0.0,-0.0
2017-05-31,241.440002,-0.000248,237.1556,234.7385,0.0,-0.0
2017-06-01,243.360001,0.007952,237.3482,234.9,0.0,0.0
2017-06-02,244.169998,0.003328,237.546,235.0771,0.0,0.0


Unnamed: 0_level_0,SPY,Actual Returns,sma_fast,sma_slow,signal,Strategy Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-08-25,419.51001,0.014118,398.041401,406.921401,1.0,0.014118
2022-08-26,405.309998,-0.033849,398.5636,406.406501,1.0,-0.033849
2022-08-29,402.630005,-0.006612,399.283201,405.922501,1.0,-0.006612
2022-08-30,398.209991,-0.010978,399.930201,405.439401,1.0,-0.010978
2022-08-31,395.179993,-0.007609,400.332401,404.903501,1.0,-0.007609


In [295]:
# Plot Strategy Returns to examine performance
(1 + trading_df['Strategy Returns']).cumprod().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7fd0921cfcd0>

In [296]:
# Imports 
from pandas.tseries.offsets import DateOffset

In [297]:
 # Assign a copy of the sma_fast and sma_slow columns to a new DataFrame called X
X = trading_df[["sma_fast", "sma_slow","Actual Returns"]].copy()

# Display sample data
display(X.head())
display(X.tail())

Unnamed: 0_level_0,sma_fast,sma_slow,Actual Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-05-26,236.9728,234.4389,-0.000207
2017-05-30,237.0622,234.5881,-0.000869
2017-05-31,237.1556,234.7385,-0.000248
2017-06-01,237.3482,234.9,0.007952
2017-06-02,237.546,235.0771,0.003328


Unnamed: 0_level_0,sma_fast,sma_slow,Actual Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-25,398.041401,406.921401,0.014118
2022-08-26,398.5636,406.406501,-0.033849
2022-08-29,399.283201,405.922501,-0.006612
2022-08-30,399.930201,405.439401,-0.010978
2022-08-31,400.332401,404.903501,-0.007609


In [298]:
# Copy the new signal column to a new Series called y.
# Creating the target set y
y = trading_df["signal"]

# Display sample data
y.head()

Date
2017-05-26    0.0
2017-05-30    0.0
2017-05-31    0.0
2017-06-01    0.0
2017-06-02    0.0
Name: signal, dtype: float64

In [299]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2017-05-26 00:00:00


In [300]:
 # Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)

# Display the training end date
print(training_end)

2017-08-26 00:00:00


In [301]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Display sample data
X_train.head()

Unnamed: 0_level_0,sma_fast,sma_slow,Actual Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-05-26,236.9728,234.4389,-0.000207
2017-05-30,237.0622,234.5881,-0.000869
2017-05-31,237.1556,234.7385,-0.000248
2017-06-01,237.3482,234.9,0.007952
2017-06-02,237.546,235.0771,0.003328


In [302]:
y_train

Date
2017-05-26    0.0
2017-05-30    0.0
2017-05-31    0.0
2017-06-01    0.0
2017-06-02    0.0
             ... 
2017-08-21    0.0
2017-08-22    0.0
2017-08-23    0.0
2017-08-24    0.0
2017-08-25    0.0
Name: signal, Length: 64, dtype: float64

In [303]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end:]
y_test = y.loc[training_end:]

# Display sample data
X_test.head()

Unnamed: 0_level_0,sma_fast,sma_slow,Actual Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-08-28,244.799001,242.038701,4.1e-05
2017-08-29,244.802801,242.132801,0.001145
2017-08-30,244.862801,242.240901,0.004738
2017-08-31,244.953601,242.362401,0.006016
2017-09-01,245.053601,242.490201,0.001414


In [304]:
 # Imports
from sklearn.preprocessing import StandardScaler

In [305]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [306]:
 # Create a StandardScaler instance
scaler = StandardScaler()
 
# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)
 
# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [307]:
## Create a Machine Learning Model in SageMaker Studio

In [308]:
# Import Amazon SageMaker libraries and modules
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer

# Import AWS Python SDK
import boto3

# Import support libraries
import io
import os
import json
import numpy as np

In [309]:
# Set the S3 bucket name
bucket = "fintechbootcamp-pankaj-sep-08-1"

In [310]:
# Set a prefix for the data files
prefix = "Machine_Learning"

In [311]:
# Set the IAM execution role
role = get_execution_role()

In [312]:
# Encode the training data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_train_scaled).astype("float32")
labels = np.array(y_train).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded training data to Amazon S3
key = 'linear_train.data'
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = "s3://{}/{}/train/{}".format(bucket, prefix, key)
print("Training data uploaded to: {}".format(s3_train_data))

Training data uploaded to: s3://fintechbootcamp-pankaj-sep-08-1/Machine_Learning/train/linear_train.data


In [313]:
# Encode the testing data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_test_scaled).astype("float32")
labels = np.array(y_test).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded testing data to Amazon S3
key = "linear_test.data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = "s3://{}/{}/test/{}".format(bucket, prefix, key)
print("Testing data uploaded to: {}".format(s3_test_data))

Testing data uploaded to: s3://fintechbootcamp-pankaj-sep-08-1/Machine_Learning/test/linear_test.data


In [314]:
# Save the current session in a variable
sess = sagemaker.Session()

In [315]:
# Import the get_image_uri module from the sagamaker library
from sagemaker.amazon.amazon_estimator import get_image_uri

In [316]:
# Import the container image
container = get_image_uri(boto3.Session().region_name, "linear-learner")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [317]:
# Create an instance of the machine learning model
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [318]:
# Get the dimension of the feature-input vector
feature_dim = X.shape[1]

In [319]:
# Define linear learner hyperparameters
linear.set_hyperparameters(
    feature_dim=feature_dim,
    mini_batch_size=200,
    predictor_type="binary_classifier"
)

In [320]:
# Fitting the linear learner model
linear.fit({"train": s3_train_data, "test": s3_test_data})

2022-09-18 18:59:21 Starting - Starting the training job...
2022-09-18 18:59:45 Starting - Preparing the instances for trainingProfilerReport-1663527561: InProgress
..................
2022-09-18 19:02:45 Downloading - Downloading input data....................................
2022-09-18 19:08:47 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/18/2022 19:09:32 INFO 139731472357184] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': '

In [322]:
 # Deploy an instance of the linear-learner model to create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

------------------!

In [323]:
print(linear_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


linear-learner-2022-09-19-03-12-24-252


In [324]:
 # Linear predictor configurations
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [325]:
# Making some predictions using the test data
model_predictions = linear_predictor.predict(X_test_scaled)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [326]:
# Display sample predictions
model_predictions["predictions"][:5]

[{'score': 0.15866805613040924, 'predicted_label': 0},
 {'score': 0.20984874665737152, 'predicted_label': 0},
 {'score': 0.3110906481742859, 'predicted_label': 0},
 {'score': 0.17826364934444427, 'predicted_label': 0},
 {'score': 0.40644583106040955, 'predicted_label': 1}]

In [329]:
# Create a new empty predictions DataFrame:

# Create a predictions DataFrame
predictions_df = pd.DataFrame(index=X_test.index)

# Add the SVM model predictions to the DataFrame
predictions_df['Predicted'] = y_predictions

# Add the actual returns to the DataFrame
predictions_df['Actual Returns'] = trading_df['Actual Returns']

# Add the strategy returns to the DataFrame
predictions_df['Strategy Returns'] = predictions_df['Actual Returns'] * predictions_df['Predicted']

# Review the DataFrame
display(predictions_df.head())
display(predictions_df.tail())

Unnamed: 0_level_0,Predicted,Actual Returns,Strategy Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-25,0,-0.000616,-0.0
2021-03-10,0,0.006225,0.0
2021-12-21,0,0.017759,0.0
2017-11-08,0,0.001701,0.0
2022-06-09,1,-0.023783,-0.023783


Unnamed: 0_level_0,Predicted,Actual Returns,Strategy Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-15,0,-0.000655,-0.0
2021-05-05,0,0.000313,0.0
2020-03-10,0,0.051745,0.0
2019-01-03,0,-0.023863,-0.0
2022-05-03,1,0.004584,0.004584


In [338]:
predictions_df['Strategy Returns'] = predictions_df['Strategy Returns'].cumsum()

In [343]:
predicted_returns = sum(predictions_df['Strategy Returns'])
predicted_returns

3.854193817832046

In [344]:
predicted_returns_1 = sum(predictions_df['Actual Returns'])
predicted_returns_1

0.15669734125030887

In [328]:
# Create a list with the predicted values
y_predictions = [np.uint8(value["predicted_label"]) for value in model_predictions["predictions"]]

# Transforming the list into an array
y_predictions = np.array(y_predictions)

# Display sample data
y_predictions[:10]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype=uint8)

In [330]:
# Import the classification report from Scikit-learn
from sklearn.metrics import classification_report

In [331]:
 # Display classification report
print("Classification report")
print(classification_report(y_test, y_predictions))

Classification report
              precision    recall  f1-score   support

         0.0       0.86      0.99      0.92       256
         1.0       0.94      0.43      0.59        76

    accuracy                           0.86       332
   macro avg       0.90      0.71      0.76       332
weighted avg       0.88      0.86      0.84       332



In [361]:
# Imports
from sklearn import svm
from sklearn.metrics import classification_report

In [362]:
# Create the classifier model
svm_model = svm.SVC()
 
# Fit the model to the data using X_train_scaled and y_train
svm_model = svm_model.fit(X_train_scaled, y_train)

# Use the trained model to predict the trading signals for the training data
training_signal_predictions = svm_model.predict(X_train_scaled)

# Display the sample predictions
training_signal_predictions[:10]

array([0., 0., 1., 0., 1., 0., 0., 1., 0., 0.])

In [363]:
# Evaluate the model using a classification report
training_report = classification_report(y_train, training_signal_predictions)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94       749
         1.0       0.98      0.64      0.77       245

    accuracy                           0.91       994
   macro avg       0.94      0.82      0.86       994
weighted avg       0.92      0.91      0.90       994



In [364]:
# Use the trained model to predict the trading signals for the testing data.
testing_signal_predictions = svm_model.predict(X_test_scaled)

In [365]:
# Evaluate the model's ability to predict the trading signal for the testing data
svm_testing_report = classification_report(y_test, testing_signal_predictions)
print(svm_testing_report)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92       256
         1.0       0.91      0.51      0.66        76

    accuracy                           0.88       332
   macro avg       0.89      0.75      0.79       332
weighted avg       0.88      0.88      0.86       332

