In [1]:
# Initial imports.
import pandas as pd
import numpy as np
import datetime as dt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data after database join of seasons/quarters file and Apple's stock file
# https://towardsdatascience.com/4-tricks-you-should-know-to-parse-date-columns-with-pandas-read-csv-27355bb2ad0e#:~:text=By%20default%2C%20date%20columns%20are%20parsed%20using%20the,a%20different%20date%20format%2C%20for%20example%2C%20YYYY-DD-MM%20HH%3AMM%3ASS%3A
file_path =Path("../AAPL_Pre_Official_DB_7_31.csv")
apple_df = pd.read_csv(file_path, parse_dates=['Date'])
apple_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,...,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,Gain_Loss_Open,Gain_Loss_High,Gain_Loss_Low,Gain_Loss_Close,Gain_Loss_Volume
0,1980-12-12,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,...,0.128348,0.128906,0.128348,0.128348,0,Gain,Gain,Gain,Gain,Gain
1,1980-12-15,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,...,-0.006138,-0.006696,-0.006696,-0.006696,-293148800,Loss,Loss,Loss,Loss,Loss
2,1980-12-16,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,...,-0.008929,-0.008929,-0.008929,-0.008929,-70156800,Loss,Loss,Loss,Loss,Loss
3,1980-12-17,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,...,0.002232,0.00279,0.00279,0.00279,-19286400,Gain,Gain,Gain,Gain,Loss
4,1980-12-18,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,...,0.003349,0.003349,0.003349,0.003349,-12992000,Gain,Gain,Gain,Gain,Loss


In [3]:
apple_adj_df = apple_df.drop(['Open_Change','High_Change','Close_Change','Gain_Loss_High','Gain_Loss_Open','Gain_Loss_Close','Gain_Loss_Volume'], axis=1)
apple_adj_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Low_Change,Volume_Change,Gain_Loss_Low
0,1980-12-12,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0.128348,0,Gain
1,1980-12-15,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,-0.006696,-293148800,Loss
2,1980-12-16,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,-0.008929,-70156800,Loss
3,1980-12-17,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0.00279,-19286400,Gain
4,1980-12-18,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0.003349,-12992000,Gain


In [4]:
apple_adj_df['Date'] = pd.to_datetime(apple_adj_df['Date'])
apple_adj_df['Date'] = apple_adj_df['Date'].map(dt.datetime.toordinal)
apple_adj_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Low_Change,Volume_Change,Gain_Loss_Low
0,723161,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0.128348,0,Gain
1,723164,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,-0.006696,-293148800,Loss
2,723165,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,-0.008929,-70156800,Loss
3,723166,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0.00279,-19286400,Gain
4,723167,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0.003349,-12992000,Gain


In [5]:
# Must remain in order to activate label_binarize
#If in doubt refer to sklearn
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
apple_gain_loss_df = apple_adj_df.copy()
apple_gain_loss_df['Gain_Loss_Low'] = le.fit_transform(apple_gain_loss_df['Gain_Loss_Low'])
apple_gain_loss_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Low_Change,Volume_Change,Gain_Loss_Low
0,723161,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0.128348,0,0
1,723164,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,-0.006696,-293148800,1
2,723165,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,-0.008929,-70156800,1
3,723166,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0.00279,-19286400,0
4,723167,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0.003349,-12992000,0


In [6]:
# If in doubt look up scikit learn label_binarize
from sklearn.preprocessing import label_binarize

binarized_gain_loss = label_binarize(y=list(apple_gain_loss_df['Gain_Loss_Low']), classes=[1,0])
apple_gain_loss_df = apple_adj_df.copy()
apple_gain_loss_df['Gain_Loss_Low'] = binarized_gain_loss
apple_gain_loss_df.head()

Unnamed: 0,Date,Year,Month,Season,Quarter,Open,High,Low,Close,Adj Close,Volume,Low_Change,Volume_Change,Gain_Loss_Low
0,723161,1980,12,Winter,Q4,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0.128348,0,1
1,723164,1980,12,Winter,Q4,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,-0.006696,-293148800,0
2,723165,1980,12,Winter,Q4,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,-0.008929,-70156800,0
3,723166,1980,12,Winter,Q4,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0.00279,-19286400,1
4,723167,1980,12,Winter,Q4,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0.003349,-12992000,1


In [7]:
# Perform binary encoding of Season and Quarter columns
apple_binary_encoded = pd.get_dummies(apple_gain_loss_df, columns=["Season", "Quarter"])
apple_binary_encoded.head()

Unnamed: 0,Date,Year,Month,Open,High,Low,Close,Adj Close,Volume,Low_Change,Volume_Change,Gain_Loss_Low,Season_Fall,Season_Spring,Season_Summer,Season_Winter,Quarter_Q1,Quarter_Q2,Quarter_Q3,Quarter_Q4
0,723161,1980,12,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0.128348,0,1,0,0,0,1,0,0,0,1
1,723164,1980,12,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,-0.006696,-293148800,0,0,0,0,1,0,0,0,1
2,723165,1980,12,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,-0.008929,-70156800,0,0,0,0,1,0,0,0,1
3,723166,1980,12,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0.00279,-19286400,1,0,0,0,1,0,0,0,1
4,723167,1980,12,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0.003349,-12992000,1,0,0,0,1,0,0,0,1


In [8]:
# Define features set
X = apple_binary_encoded.copy()
X = X.drop(["Gain_Loss_Low","Low_Change","Volume_Change"], axis=1)
X.head()

Unnamed: 0,Date,Year,Month,Open,High,Low,Close,Adj Close,Volume,Season_Fall,Season_Spring,Season_Summer,Season_Winter,Quarter_Q1,Quarter_Q2,Quarter_Q3,Quarter_Q4
0,723161,1980,12,0.128348,0.128906,0.128348,0.128348,0.100751,469033600,0,0,0,1,0,0,0,1
1,723164,1980,12,0.12221,0.12221,0.121652,0.121652,0.095495,175884800,0,0,0,1,0,0,0,1
2,723165,1980,12,0.113281,0.113281,0.112723,0.112723,0.088485,105728000,0,0,0,1,0,0,0,1
3,723166,1980,12,0.115513,0.116071,0.115513,0.115513,0.090676,86441600,0,0,0,1,0,0,0,1
4,723167,1980,12,0.118862,0.11942,0.118862,0.118862,0.093304,73449600,0,0,0,1,0,0,0,1


In [9]:
y = apple_gain_loss_df["Gain_Loss_Low"]
#X = X.drop(["Gain_Loss_Volume"], axis=1)


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(7669, 17)

In [11]:
#Create a Logistic Regression Model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [12]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [13]:
# Make ppredictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,0
2,1,1
3,1,1
4,1,1
5,1,0
6,1,1
7,1,0
8,1,1
9,1,1


In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.5557293703558858
