In [None]:
# importing necessary modules.
import pandas as pd
import config7 as cfg
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.impute import SimpleImputer

In [None]:
# creating a connection to an aws rds postgres cloud database.
rds_connection_string = f'postgres:{cfg.password}@test-db.cy2enoewwvsi.us-east-2.rds.amazonaws.com:5432/stocks_db'
engine = create_engine(f'postgres://{rds_connection_string}')

In [None]:
# checking out the table names in the database.
engine.table_names()

In [None]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

In [None]:
# converting industry columns into 1's and 0's.
industry = pd.get_dummies(sandp['industry'])

In [None]:
# converting sector columns into 1's and 0's.
sector = pd.get_dummies(sandp['sector'])

In [None]:
# bringing everything together.
sandp = pd.concat([sandp, industry, sector], axis=1)

In [None]:
# dropping industry and sector columns.
sandp.drop(['industry', 'sector'], axis=1, inplace=True)

In [None]:
sandp = sandp.drop(columns=['symbol', 'name'])

In [None]:
# for x in sandp.columns:
#     imr = SimpleImputer(missing_values=-99, strategy='mean')
#     imr = imr.fit(sandp[[x]])
#     sandp[x] = imr.transform(sandp[[x]]).ravel()
# sandp

In [None]:
# dropping unnecessary columns.
sandp.drop(['percent_return_on_investment'], axis=1, inplace=True)

In [None]:
# sandp[['price_to_bookvalue', 'price_to_book', 'price_to_sales', 'price_to_earnings', 'price_to_free_cash_flow', 'price_to_operating_cash_flow', 'operating_income_growth', 'operating_cash_flow_growth', 'free_cash_flow_growth', 'return_on_equity', 'asset_growth', 'Advertising & Marketing Services', 'Aerospace & Defense',
#  'Agriculture','Airlines', 'Application Software','Asset Management','Autos','Banks','Beverages - Alcoholic','Beverages - Non-Alcoholic',
#  'Biotechnology','Brokers & Exchanges','Building Materials','Business Services','Chemicals','Communication Equipment','Communication Services',
#  'Computer Hardware','Conglomerates','Consulting & Outsourcing','Consumer Packaged Goods','Credit Services','Drug Manufacturers',
#  'Employment Services','Engineering & Construction','Entertainment','Farm & Construction Machinery','Forest Products',
#  'Health Care Plans','Health Care Providers','Homebuilding & Construction','Industrial Distribution','Industrial Products',
#  'Insurance','Insurance - Life','Insurance - Property & Casualty','Insurance - Specialty','Insurance—Life','Manufacturing - Apparel & Furniture',
#  'Medical Devices','Medical Diagnostics & Research','Medical Distribution','Medical Instruments & Equipment','Metals & Mining',
#  'Oil & Gas - Drilling','Oil & Gas - E&P','Oil & Gas - Integrated','Oil & Gas - Midstream','Oil & Gas - Refining & Marketing',
#  'Oil & Gas - Services','Online Media','Packaging & Containers','Personal Services','REITs','Real Estate Services','Restaurants',
#  'Retail - Apparel & Specialty','Retail - Defensive','Semiconductors','Steel','Tobacco Products','Transportation & Logistics',
#  'Travel & Leisure','Truck Manufacturing','Utilities - Independent Power Producers','Utilities - Regulated','Waste Management',
#  'Basic Materials','Communication Services','Consumer Cyclical','Consumer Defensive','Energy','Financial Services','Healthcare',
#  'Industrials','Real Estate','Technology','Utilities']]

In [None]:
X = sandp[['price_to_bookvalue', 'price_to_book', 'price_to_sales', 'price_to_earnings', 'price_to_free_cash_flow', 'price_to_operating_cash_flow', 'operating_income_growth', 'operating_cash_flow_growth', 'free_cash_flow_growth', 'return_on_equity', 'asset_growth', 'Medical Diagnostics & Research', 'Airlines',
'Retail - Apparel & Specialty', 'Computer Hardware','Drug Manufacturers', 'Medical Distribution', 'Medical Devices','Application Software', 'Semiconductors',
'Consumer Packaged Goods', 'Business Services', 'Credit Services','Utilities - Regulated', 'Insurance - Life', 'Insurance', 'REITs',
'Brokers & Exchanges', 'Chemicals','Insurance - Property & Casualty', 'Biotechnology','Asset Management', 'Communication Services', 'Online Media',
'Health Care Plans', 'Industrial Products', 'Oil & Gas - E&P','Autos', 'Conglomerates', 'Banks','Medical Instruments & Equipment', 'Travel & Leisure',
'Packaging & Containers', 'Farm & Construction Machinery','Real Estate Services', 'Entertainment', 'Agriculture','Transportation & Logistics', 'Restaurants', 'Retail - Defensive',
'Communication Equipment', 'Oil & Gas - Integrated','Homebuilding & Construction', 'Health Care Providers','Industrial Distribution', 'Manufacturing - Apparel & Furniture',
'Metals & Mining', 'Aerospace & Defense', 'Insurance—Life','Oil & Gas - Services', 'Oil & Gas - Refining & Marketing',
'Oil & Gas - Drilling', 'Personal Services','Advertising & Marketing Services', 'Building Materials','Engineering & Construction', 'Oil & Gas - Midstream',
'Beverages - Non-Alcoholic', 'Tobacco Products','Utilities - Independent Power Producers', 'Steel','Truck Manufacturing', 'Insurance - Specialty',
'Employment Services', 'Waste Management', 'Beverages - Alcoholic','Consulting & Outsourcing', 'Forest Products']]
target = sandp['above_below_sandp_return']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, target_train, target_test = train_test_split(X, target, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train, target_train)

In [None]:
print(f'Training Data Score: {classifier.score(X_train, target_train)}')
print(f'Testing Data Score: {classifier.score(X_test, target_test)}')

In [None]:
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': target_test}).reset_index(drop=True).head()

In [None]:
target_test.value_counts()

In [None]:
target_test.mean()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(target_test, predictions))

In [23]:
import pickle
# Save the trained model as a pickle string.
saved_model = pickle.dumps(classifier)

In [24]:
saved_model

b'\x80\x03csklearn.linear_model.logistic\nLogisticRegression\nq\x00)\x81q\x01}q\x02(X\x07\x00\x00\x00penaltyq\x03X\x02\x00\x00\x00l2q\x04X\x04\x00\x00\x00dualq\x05\x89X\x03\x00\x00\x00tolq\x06G?\x1a6\xe2\xeb\x1cC-X\x01\x00\x00\x00Cq\x07G?\xf0\x00\x00\x00\x00\x00\x00X\r\x00\x00\x00fit_interceptq\x08\x88X\x11\x00\x00\x00intercept_scalingq\tK\x01X\x0c\x00\x00\x00class_weightq\nNX\x0c\x00\x00\x00random_stateq\x0bNX\x06\x00\x00\x00solverq\x0cX\x04\x00\x00\x00warnq\rX\x08\x00\x00\x00max_iterq\x0eKdX\x0b\x00\x00\x00multi_classq\x0fh\rX\x07\x00\x00\x00verboseq\x10K\x00X\n\x00\x00\x00warm_startq\x11\x89X\x06\x00\x00\x00n_jobsq\x12NX\x08\x00\x00\x00l1_ratioq\x13NX\x08\x00\x00\x00classes_q\x14cnumpy.core.multiarray\n_reconstruct\nq\x15cnumpy\nndarray\nq\x16K\x00\x85q\x17C\x01bq\x18\x87q\x19Rq\x1a(K\x01K\x02\x85q\x1bcnumpy\ndtype\nq\x1cX\x02\x00\x00\x00i8q\x1dK\x00K\x01\x87q\x1eRq\x1f(K\x03X\x01\x00\x00\x00<q NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00tq!b\x89C\x10\x00\x00\x00\x00\x00\x00\x00\x00\x

In [30]:
# Load the pickled model
clf_from_pickle = pickle.loads(saved_model)

# Use the loaded pickled model to make predictions
predictions = clf_from_pickle.predict(X)

pd.DataFrame({'Prediction': predictions.ravel(), 'Actual': X}).head()

ValueError: Shape of passed values is (79, 2), indices imply (468, 2)