In [94]:
# Initial imports.
import pandas as pd
import numpy as np
import datetime as dt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [95]:
# Loading data after database join of seasons/quarters file and Apple's stock file
# https://towardsdatascience.com/4-tricks-you-should-know-to-parse-date-columns-with-pandas-read-csv-27355bb2ad0e#:~:text=By%20default%2C%20date%20columns%20are%20parsed%20using%20the,a%20different%20date%20format%2C%20for%20example%2C%20YYYY-DD-MM%20HH%3AMM%3ASS%3A
file_path =Path("./AAPL_Mock_ML_Volume.csv")
apple_df = pd.read_csv(file_path, parse_dates=['Date'])
apple_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume
0,1981-02-12,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,Gain
1,1981-02-13,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,Loss
2,1981-02-17,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,Gain
3,1981-02-18,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,Gain
4,1981-02-19,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,Gain


In [96]:
apple_df['Date'] = pd.to_datetime(apple_df['Date'])
apple_df['Date'] = apple_df['Date'].map(dt.datetime.toordinal)
apple_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,Gain
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,Loss
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,Gain
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,Gain
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,Gain


In [97]:
# Must remain in order to activate label_binarize
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
apple_gain_loss_df = apple_df.copy()
apple_gain_loss_df['Gain_Loss_Volume'] = le.fit_transform(apple_gain_loss_df['Gain_Loss_Volume'])
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,0
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,1
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,0
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,0
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,0


In [98]:
# If in doubt look up scikit learn label_binarize
from sklearn.preprocessing import label_binarize

binarized_gain_loss = label_binarize(y=list(apple_gain_loss_df['Gain_Loss_Volume']), classes=[1,0])
apple_gain_loss_df = apple_df.copy()
apple_gain_loss_df['Gain_Loss_Volume'] = binarized_gain_loss
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,1
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,0
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,1
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,1
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,1


In [99]:
# Perform binary encoding of Season and Quarter columns
apple_binary_encoded = pd.get_dummies(apple_gain_loss_df, columns=["Season", "Quarter"])
apple_binary_encoded.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume,Season_Spring,Season_Winter,Quarter_Q1,Quarter_Q2
0,723223,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,1,0,1,1,0
1,723224,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,0,0,1,1,0
2,723228,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,1,0,1,1,0
3,723229,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,1,0,1,1,0
4,723230,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,1,0,1,1,0


In [100]:
# If in doubt look up scikit learn label_binarize
from sklearn.preprocessing import label_binarize

binarized_gain_loss = label_binarize(y=list(apple_gain_loss_df['Gain_Loss_Volume']), classes=[1,0])
apple_gain_loss_df = apple_df.copy()
apple_gain_loss_df['Gain_Loss_Volume'] = binarized_gain_loss
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Gain_Loss_Volume
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,0
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,1
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,0
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,0
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,0


In [101]:
# Define features set
X = apple_binary_encoded.copy()

In [102]:
y = apple_gain_loss_df["Gain_Loss_Volume"]
X = X.drop(["Gain_Loss_Volume","Volume_24hr_Change"], axis=1)
X.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Season_Spring,Season_Winter,Quarter_Q1,Quarter_Q2
0,723223,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0,1,1,0
1,723224,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,0,1,1,0
2,723228,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0,1,1,0
3,723229,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0,1,1,0
4,723230,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,0,1,1,0


In [103]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(30, 11)

In [104]:
#Create a Logistic Regression Model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [105]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [106]:
# Make ppredictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,1,1
4,0,0
5,1,0
6,0,1
7,1,1
8,0,0
9,0,1


In [107]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7
