In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

import pickle

In [2]:
stock_data = Path('StockAIClean2.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Sector,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,...,Sales growth past 5 years (%) Cat,Sales Cat,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Volume Cat,Var % Cat,Var%
0,1,AAPL,Apple Inc.,Technology,Mg Cap,Med,Yes,Yes,Yes,Yes,...,Yes,Hi,Lo,Pos,Up,Hi,Buy,Hi,Neg,-1.2
1,2,ACN,Accenture plc,Technology,Lg Cap,Med,Yes,Yes,Yes,Yes,...,Yes,Hi,Lo,Pos,Up,Hi,Buy,Med,Pos,5.3
2,3,ADBE,Adobe Inc.,Technology,Lg Cap,Hi,No,Yes,Yes,Yes,...,Yes,Med,Lo,Pos,Up,Med,Buy,Med,Neg,-0.3
3,4,ADSK,"Autodesk, Inc.",Technology,Lg Cap,Med,No,Yes,Yes,Yes,...,Yes,Med,Lo,Pos,Down,Med,Buy,Lo,Pos,9.1
4,5,AI,"C3.ai, Inc.",Technology,Mid Cap,Hi,No,No,Yes,No,...,Yes,Lo,Hi,Neg,Up,Sm,Hold,Hi,Neg,-36.0


In [3]:
len(stock_df)

72

In [4]:
stock_df1 = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'Sector'])
stock_df1.head()

Unnamed: 0,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,EPS growth next 5 years (%) Cat,Sales growth past 5 years (%) Cat,Sales Cat,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Volume Cat,Var % Cat,Var%
0,Mg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Hi,Neg,-1.2
1,Lg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Med,Pos,5.3
2,Lg Cap,Hi,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Up,Med,Buy,Med,Neg,-0.3
3,Lg Cap,Med,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Down,Med,Buy,Lo,Pos,9.1
4,Mid Cap,Hi,No,No,Yes,No,Yes,Yes,Lo,Hi,Neg,Up,Sm,Hold,Hi,Neg,-36.0


In [5]:
stock_df2 = pd.get_dummies(stock_df1, columns=['Analyst Rec Cat', "Mkt Cap Cat", "For P/E Cat", "Dividend", "EPS growth this year (%) Cat", "EPS growth next year (%) Cat", "EPS growth past 5 years (%) Cat", "EPS growth next 5 years (%) Cat", "Sales growth past 5 years (%) Cat", "Sales Cat", "Float Short (%) Cat", "Profit Margin (%) Cat", "Performance (Year) (%) Cat", "Employees Cat", "Volume Cat", "Var % Cat"])
stock_df2

Unnamed: 0,Var%,Analyst Rec Cat_Buy,Analyst Rec Cat_Hold,Mkt Cap Cat_Lg Cap,Mkt Cap Cat_Mg Cap,Mkt Cap Cat_Mic Cap,Mkt Cap Cat_Mid Cap,Mkt Cap Cat_Sm Cap,For P/E Cat_Hi,For P/E Cat_Low,...,Performance (Year) (%) Cat_Down,Performance (Year) (%) Cat_Up,Employees Cat_Hi,Employees Cat_Med,Employees Cat_Sm,Volume Cat_Hi,Volume Cat_Lo,Volume Cat_Med,Var % Cat_Neg,Var % Cat_Pos
0,-1.2,1,0,0,1,0,0,0,0,0,...,0,1,1,0,0,1,0,0,1,0
1,5.3,1,0,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,1
2,-0.3,1,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
3,9.1,1,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
4,-36.0,0,1,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,12.7,1,0,1,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,0,1
68,-68.2,0,1,0,0,0,1,0,1,0,...,0,1,0,0,1,1,0,0,1,0
69,5.8,0,1,0,0,1,0,0,0,1,...,1,0,0,0,1,0,1,0,0,1
70,1.6,1,0,1,0,0,0,0,1,0,...,0,1,0,1,0,0,0,1,0,1


In [6]:
X = stock_df2.drop(columns=['Analyst Rec Cat_Buy'])
y = stock_df2['Analyst Rec Cat_Buy']

In [7]:
y

0     1
1     1
2     1
3     1
4     0
     ..
67    1
68    0
69    0
70    1
71    1
Name: Analyst Rec Cat_Buy, Length: 72, dtype: uint8

In [8]:
X.columns

Index(['Var%', 'Analyst Rec Cat_Hold', 'Mkt Cap Cat_Lg Cap',
       'Mkt Cap Cat_Mg Cap', 'Mkt Cap Cat_Mic Cap', 'Mkt Cap Cat_Mid Cap',
       'Mkt Cap Cat_Sm Cap', 'For P/E Cat_Hi', 'For P/E Cat_Low',
       'For P/E Cat_Med', 'Dividend_No', 'Dividend_Yes',
       'EPS growth this year (%) Cat_No', 'EPS growth this year (%) Cat_Yes',
       'EPS growth next year (%) Cat_No', 'EPS growth next year (%) Cat_Yes',
       'EPS growth past 5 years (%) Cat_No',
       'EPS growth past 5 years (%) Cat_Yes',
       'EPS growth next 5 years (%) Cat_No',
       'EPS growth next 5 years (%) Cat_Yes',
       'Sales growth past 5 years (%) Cat_No',
       'Sales growth past 5 years (%) Cat_Yes', 'Sales Cat_Hi', 'Sales Cat_Lo',
       'Sales Cat_Med', 'Float Short (%) Cat_Hi', 'Float Short (%) Cat_Lo',
       'Float Short (%) Cat_Med', 'Profit Margin (%) Cat_Neg',
       'Profit Margin (%) Cat_Pos', 'Performance (Year) (%) Cat_Down',
       'Performance (Year) (%) Cat_Up', 'Employees Cat_Hi',
      

In [9]:
stock_df2.isnull()

Unnamed: 0,Var%,Analyst Rec Cat_Buy,Analyst Rec Cat_Hold,Mkt Cap Cat_Lg Cap,Mkt Cap Cat_Mg Cap,Mkt Cap Cat_Mic Cap,Mkt Cap Cat_Mid Cap,Mkt Cap Cat_Sm Cap,For P/E Cat_Hi,For P/E Cat_Low,...,Performance (Year) (%) Cat_Down,Performance (Year) (%) Cat_Up,Employees Cat_Hi,Employees Cat_Med,Employees Cat_Sm,Volume Cat_Hi,Volume Cat_Lo,Volume Cat_Med,Var % Cat_Neg,Var % Cat_Pos
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
68,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
69,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
70,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [11]:
# create a scaler instance
X_scaler = skl.preprocessing.StandardScaler()

In [12]:
# fit/train the scaler
X_scaler.fit(X_train)

In [13]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
classifier = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier

In [15]:
classifier.fit(X_train, y_train)

In [16]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8888888888888888
Testing Data Score: 0.9444444444444444


In [17]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,0,0


In [18]:
# Display the accuracy score for the test dataset.
score = balanced_accuracy_score(y_test, predictions)
print('Accuracy score: ', score)

Accuracy score:  0.75


In [19]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,1
Actual 1,0,16


In [20]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.94      1.00      0.97        16

    accuracy                           0.94        18
   macro avg       0.97      0.75      0.82        18
weighted avg       0.95      0.94      0.94        18



In [21]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

Counter(y_resample)
y_resample.value_counts()


0    46
1    46
Name: Analyst Rec Cat_Buy, dtype: int64

In [22]:
classifier1 = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier1

In [23]:
# Fit the model using the resampled training data
classifier1.fit(X_resample, y_resample)

In [24]:
# Make a prediction using the testing data
predictions1= classifier1.predict(X_test)
pd.DataFrame({"Prediction": predictions1, "Actual": y_test})

Unnamed: 0,Prediction,Actual
66,1,1
5,1,1
15,1,1
42,1,1
27,1,1
57,1,1
55,1,1
47,1,1
22,1,1
68,0,0


In [25]:
# Print the balanced_accuracy score of the model 
score1 = balanced_accuracy_score(y_test, predictions1)
print('New accuracy score: ', score1)


New accuracy score:  1.0


In [26]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions1)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2,0
Actual 1,0,16


In [27]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00        16

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [28]:
import pickle

with open("classifier.pkl", "wb") as f:
    pickle.dump(classifier,f)

with open("classifier1.pkl", "wb") as f:
    pickle.dump(classifier1,f)

In [29]:
# # save the model to curent directory
# filename = 'classifier1.pkl'
# pickle.dump(classifier1, open(filename, 'wb'))

In [30]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# loaded_model