# Machine Learning Trading Bot



In [7]:
# Imports

from pathlib import Path

import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Create X and y data sets
Extra feature columns for X are derived from the existing dataset as daily percent changes.
The new columns are:
    frontFuture_pctchange,
    aggregateOpenInterest_pctchange,
    aggregateCallOpenInterest_pctchange,
    aggregatePutOpenInterest_pctchange,
    aggregateVolume_pctchange

In [8]:
# Read the file in dataframe
file_path = './Resources/Crude.csv'
fundamental_df = pd.read_csv(Path(file_path),index_col='date',parse_dates=True,infer_datetime_format=True)

# Separate out y, which is daily pct change of spotPrice and then drop the spotPrice from the original dataframe
# fundamental_df[
#     ['spotPrice_pctchange',
#      'frontFuture_pctchange'
#     ]] = fundamental_df[
#     ['spotPrice',
#      'frontMonthPrice'
#     ]].pct_change()

fundamental_df[
    ['spotPrice_pctchange',
     'frontFuture_pctchange',
     'aggregateOpenInterest_pctchange',
     'aggregateCallOpenInterest_pctchange',
     'aggregatePutOpenInterest_pctchange',
     'aggregateVolume_pctchange'
    ]] = fundamental_df[
    ['spotPrice',
     'frontMonthPrice',
     'aggregateOpenInterest',
     'aggregateCallOpenInterest',
     'aggregatePutOpenInterest',
     'aggregateVolume'
    ]].pct_change()

fundamental_df = fundamental_df.dropna()
fundamental_df = fundamental_df.drop(columns=['spotPrice','frontMonthPrice'])

fundamental_df['signal'] = np.where(fundamental_df['spotPrice_pctchange'] >=0,1,-1)
# y = fundamental_df['spotPrice_pctchange']
y = fundamental_df['signal']
fundamental_df.drop(columns=['signal'],inplace=True)
X = fundamental_df.shift().dropna()

fundamental_df.tail()

# Create X_train, y_train, X_test and y_test
The length of the training window is set by the `offset_years` parameter.

In [9]:
offset_years = 7
training_begin = X.index.min()
training_end = training_begin + DateOffset(years=offset_years)

X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

test_begin = X.loc[training_end : ].index.min()
X_test = X.loc[test_begin : ]
y_test = y.loc[test_begin : ]


# Standardize the data set

In [10]:
# Scale the features DataFrames

# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Create a model
# Fit the model with X_train_scaled, y_train data
# Predict y using the X_test data
# Create comparison report

In [11]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Create models

# Logistic Regression model
LR_model = LogisticRegression(random_state=1)
LR_model.fit(X_train_scaled,y_train)
y_predict_test_LR = LR_model.predict(X_test_scaled)
print('Logistic Regression model')
print(classification_report(y_test,y_predict_test_LR))


# Decision Tree classifier model
DTC_model = DecisionTreeClassifier(random_state=1)
DTC_model.fit(X_train_scaled,y_train)
y_predict_test_DTC = DTC_model.predict(X_test_scaled)
print('Decision Tree Classifier')
print(classification_report(y_test,y_predict_test_DTC))


# SVM model
SVM_model = SVC(random_state=1)
SVM_model.fit(X_train_scaled,y_train)
y_predict_test_SVM = SVM_model.predict(X_test_scaled)
print('SVM Classifier')
print(classification_report(y_test,y_predict_test_SVM))


# GaussianNB model
GaussianNB_model = GaussianNB()
GaussianNB_model.fit(X_train_scaled,y_train)
y_predict_test_GaussianNB = GaussianNB_model.predict(X_test_scaled)
print('GaussianNB Classifier')
print(classification_report(y_test,y_predict_test_GaussianNB))


# RandomForestClassifier model
RandomForestClassifier_model = RandomForestClassifier()
RandomForestClassifier_model.fit(X_train_scaled,y_train)
y_predict_test_RandomForestClassifier = RandomForestClassifier_model.predict(X_test_scaled)
print('RandomForestClassifier Classifier')
print(classification_report(y_test,y_predict_test_RandomForestClassifier))

Logistic Regression model
              precision    recall  f1-score   support

          -1       0.43      0.20      0.27       361
           1       0.55      0.79      0.65       451

    accuracy                           0.53       812
   macro avg       0.49      0.49      0.46       812
weighted avg       0.50      0.53      0.48       812

Decision Tree Classifier
              precision    recall  f1-score   support

          -1       0.48      0.43      0.45       361
           1       0.58      0.63      0.60       451

    accuracy                           0.54       812
   macro avg       0.53      0.53      0.53       812
weighted avg       0.53      0.54      0.54       812

SVM Classifier
              precision    recall  f1-score   support

          -1       0.46      0.32      0.38       361
           1       0.56      0.70      0.62       451

    accuracy                           0.53       812
   macro avg       0.51      0.51      0.50       812
weighted

# Create output file for Aggregator

In [12]:
output_df = pd.DataFrame(index=X_test.index)
output_df.index.names = ['Date']
output_df['y'] = y_predict_test_LR
output_df.to_csv(Path('./Resources/fundamental_output.csv'))
display(output_df)

Unnamed: 0_level_0,y
Date,Unnamed: 1_level_1
2019-01-07,1
2019-01-08,1
2019-01-09,1
2019-01-10,-1
2019-01-11,1
...,...
2022-03-25,1
2022-03-28,-1
2022-03-29,1
2022-03-30,1
