In [1]:
# Initial imports.
import pandas as pd
import numpy as np
import datetime as dt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data after database join of seasons/quarters file and Apple's stock file
# https://towardsdatascience.com/4-tricks-you-should-know-to-parse-date-columns-with-pandas-read-csv-27355bb2ad0e#:~:text=By%20default%2C%20date%20columns%20are%20parsed%20using%20the,a%20different%20date%20format%2C%20for%20example%2C%20YYYY-DD-MM%20HH%3AMM%3ASS%3A
file_path =Path("../AAPL_Pre_Official_DB_7_30.csv")
apple_df = pd.read_csv(file_path, parse_dates=['Date'])
apple_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,...,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,Gain_Loss_Open,Gain_Loss_High,Gain_Loss_Low,Gain_Loss_Close,Gain_Loss_Volume
0,1980-12-12,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,...,0.128348,0.128906,0.128348,0.128348,0,Gain,Gain,Gain,Gain,Gain
1,1980-12-15,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,...,-0.006138,-0.006696,-0.006696,-0.006696,-293148800,Loss,Loss,Loss,Loss,Loss
2,1980-12-16,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,...,-0.008929,-0.008929,-0.008929,-0.008929,-70156800,Loss,Loss,Loss,Loss,Loss
3,1980-12-17,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,...,0.002232,0.00279,0.00279,0.00279,-19286400,Gain,Gain,Gain,Gain,Loss
4,1980-12-18,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,...,0.003349,0.003349,0.003349,0.003349,-12992000,Gain,Gain,Gain,Gain,Loss


In [None]:
apple_adj_df = apple_df.drop(['Open_Change','High_Change','Low_Change','Gain_Loss_High','Gain_Loss_Open','Gain_Loss_Low','Gain_Loss_Volume'], axis=1)
apple_adj_df.head()

In [5]:
apple_adj_df['Date'] = pd.to_datetime(apple_adj_df['Date'])
apple_adj_df['Date'] = apple_adj_df['Date'].map(dt.datetime.toordinal)
apple_adj_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Close_24hr_Change,Volume_24hr_Change,Gain_Loss_Close
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0.116629,716800,Gain
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-0.00279,-3404800,Loss
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0.00279,1120000,Gain
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0.005023,6966400,Gain
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,-0.007255,3068800,Loss


In [6]:
# Must remain in order to activate label_binarize
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
apple_gain_loss_df = apple_adj_df.copy()
apple_gain_loss_df['Gain_Loss_Close'] = le.fit_transform(apple_gain_loss_df['Gain_Loss_Close'])
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Close_24hr_Change,Volume_24hr_Change,Gain_Loss_Close
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0.116629,716800,0
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-0.00279,-3404800,1
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0.00279,1120000,0
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0.005023,6966400,0
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,-0.007255,3068800,1


In [7]:
# If in doubt look up scikit learn label_binarize
from sklearn.preprocessing import label_binarize

binarized_gain_loss = label_binarize(y=list(apple_gain_loss_df['Gain_Loss_Close']), classes=[1,0])
apple_gain_loss_df = apple_df.copy()
apple_gain_loss_df['Gain_Loss_Close'] = binarized_gain_loss
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Close_24hr_Change,Volume_24hr_Change,Gain_Loss_Close
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0.116629,716800,1
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-0.00279,-3404800,0
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0.00279,1120000,1
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0.005023,6966400,1
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,-0.007255,3068800,0


In [8]:
# Perform binary encoding of Season and Quarter columns
apple_binary_encoded = pd.get_dummies(apple_gain_loss_df, columns=["Season", "Quarter"])
apple_binary_encoded.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Close_24hr_Change,Volume_24hr_Change,Gain_Loss_Close,Season_Spring,Season_Winter,Quarter_Q1,Quarter_Q2
0,723223,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0.116629,716800,1,0,1,1,0
1,723224,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-0.00279,-3404800,0,0,1,1,0
2,723228,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0.00279,1120000,1,0,1,1,0
3,723229,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0.005023,6966400,1,0,1,1,0
4,723230,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,-0.007255,3068800,0,0,1,1,0


In [9]:
# If in doubt look up scikit learn label_binarize
from sklearn.preprocessing import label_binarize

binarized_gain_loss = label_binarize(y=list(apple_gain_loss_df['Gain_Loss_Close']), classes=[1,0])
apple_gain_loss_df = apple_df.copy()
apple_gain_loss_df['Gain_Loss_Close'] = binarized_gain_loss
apple_gain_loss_df.head()

Unnamed: 0,Date,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Close_24hr_Change,Volume_24hr_Change,Gain_Loss_Close
0,723223,Winter,Q1,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,0.116629,716800,0
1,723224,Winter,Q1,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-0.00279,-3404800,1
2,723228,Winter,Q1,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,0.00279,1120000,0
3,723229,Winter,Q1,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,0.005023,6966400,0
4,723230,Winter,Q1,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,-0.007255,3068800,1


In [10]:
# Define features set
X = apple_binary_encoded.copy()

In [11]:
y = apple_gain_loss_df["Gain_Loss_Close"]
#X = X.drop(["Gain_Loss_Volume"], axis=1)
X = X.drop(["Gain_Loss_Close","Close_24hr_Change"], axis=1)
X.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Volume_24hr_Change,Season_Spring,Season_Winter,Quarter_Q1,Quarter_Q2
0,723223,0.117188,0.117188,0.116629,0.116629,0.091552,14560000,716800,0,1,1,0
1,723224,0.114955,0.114955,0.113839,0.113839,0.089362,11155200,-3404800,0,1,1,0
2,723228,0.116629,0.117188,0.116629,0.116629,0.091552,12275200,1120000,0,1,1,0
3,723229,0.121652,0.122768,0.121652,0.121652,0.095495,19241600,6966400,0,1,1,0
4,723230,0.114955,0.114955,0.114397,0.114397,0.0898,22310400,3068800,0,1,1,0


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(30, 12)

In [13]:
#Create a Logistic Regression Model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [14]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [15]:
# Make ppredictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,0
3,1,1
4,0,0
5,1,1
6,0,1
7,0,1
8,0,0
9,1,1


In [16]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7
