In [33]:
# importing necessary modules.
import pandas as pd
import config as cfg
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt

In [34]:
# creating a connection to an aws rds postgres cloud database.
rds_connection_string = f'postgres:{cfg.password}@test-db.cy2enoewwvsi.us-east-2.rds.amazonaws.com:5432/stocks_db'
engine = create_engine(f'postgres://{rds_connection_string}')

In [35]:
# checking out the table names in the database.
engine.table_names()

['sandp', 'russell_2000', 'sandp_russell', 'new_route', 'sandp2']

In [36]:
# checking out the SandP_500 table.
sandp_russell = pd.read_sql_query('SELECT * FROM sandp_russell', con=engine)
sandp_russell.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,above_below_sandp_return,market_cap,industry,sector
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,1,7462700000.0,Medical Diagnostics & Research,Healthcare
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,1,613380900.0,Airlines,Industrials
2,AAN,"Aaron's, Inc.",1.5315,1.9561,0.78855,12.3786,27.7757,59.12785,19.9267,0.126805,...,0.1041,0.1287,-0.3807,-99.0,0.08135,153.807704,1,559709500.0,Consulting & Outsourcing,Industrials
3,AAON,"AAON, Inc.",3.4176,3.4176,1.6373,16.86505,6.7351,20.69945,10.9623,0.287753,...,0.06795,0.13045,-0.9878,-99.0,-0.02605,454.948301,1,384204200.0,Building Materials,Basic Materials
4,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,1,1912092000.0,Retail - Apparel & Specialty,Consumer Cyclical


In [37]:
# converting industry columns into 1's and 0's.
industry = pd.get_dummies(sandp_russell['industry'])

In [38]:
# converting sector columns into 1's and 0's.
sector = pd.get_dummies(sandp_russell['sector'])

In [39]:
# bringing everything together.
sandp_russell = pd.concat([sandp_russell, industry, sector], axis=1)

In [40]:
# dropping industry and sector columns.
sandp_russell.drop(['industry', 'sector'], axis=1, inplace=True)

In [41]:
# dropping unnecessary columns.
sandp_russell.drop(['symbol', 'name', 'percent_return_on_investment'], axis=1, inplace=True)

In [42]:
# viewing dataframe.
sandp_russell.head()

Unnamed: 0,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,gross_profit_margin,pretax_profit_margin,...,Communication Services,Consumer Cyclical,Consumer Defensive,Energy,Financial Services,Healthcare,Industrials,Real Estate,Technology,Utilities
0,2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,0.52485,0.057228,...,0,0,0,0,0,1,0,0,0,0
1,-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,0.495744,-0.018258,...,0,0,0,0,0,0,1,0,0,0
2,1.5315,1.9561,0.78855,12.3786,27.7757,59.12785,19.9267,0.126805,0.812846,0.103087,...,0,0,0,0,0,0,1,0,0,0
3,3.4176,3.4176,1.6373,16.86505,6.7351,20.69945,10.9623,0.287753,0.250523,0.156079,...,0,0,0,0,0,0,0,0,0,0
4,2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,0.488529,0.083949,...,0,1,0,0,0,0,0,0,0,0


In [43]:
X = sandp_russell[['price_to_bookvalue', 'price_to_book', 'price_to_sales', 'price_to_earnings', 'price_to_free_cash_flow', 'price_to_operating_cash_flow', 'operating_income_growth', 'free_cash_flow_growth', 'return_on_equity', 'asset_growth', 'Medical Diagnostics & Research', 'Airlines',
'Retail - Apparel & Specialty', 'Computer Hardware','Drug Manufacturers', 'Medical Distribution', 'Medical Devices','Application Software', 'Semiconductors',
'Consumer Packaged Goods', 'Business Services', 'Credit Services','Utilities - Regulated', 'Insurance - Life', 'Insurance', 'REITs',
'Brokers & Exchanges', 'Chemicals','Insurance - Property & Casualty', 'Biotechnology','Asset Management', 'Online Media',
'Health Care Plans', 'Industrial Products', 'Oil & Gas - E&P','Autos', 'Conglomerates', 'Banks','Medical Instruments & Equipment', 'Travel & Leisure',
'Packaging & Containers', 'Farm & Construction Machinery','Real Estate Services', 'Entertainment', 'Agriculture','Transportation & Logistics', 'Restaurants', 'Retail - Defensive',
'Communication Equipment', 'Oil & Gas - Integrated','Homebuilding & Construction', 'Health Care Providers','Industrial Distribution', 'Manufacturing - Apparel & Furniture',
'Metals & Mining', 'Aerospace & Defense', 'Insurance—Life','Oil & Gas - Services', 'Oil & Gas - Refining & Marketing',
'Oil & Gas - Drilling', 'Personal Services','Advertising & Marketing Services', 'Building Materials','Engineering & Construction', 'Oil & Gas - Midstream',
'Beverages - Non-Alcoholic', 'Tobacco Products','Utilities - Independent Power Producers', 'Steel','Truck Manufacturing', 'Insurance - Specialty',
'Employment Services', 'Waste Management', 'Beverages - Alcoholic','Consulting & Outsourcing', 'Forest Products']]
target = sandp_russell['above_below_sandp_return']

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, target_train, target_test = train_test_split(X, target, random_state=1, stratify=target)

In [45]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
classifier.fit(X_train, target_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
print(f'Training Data Score: {classifier.score(X_train, target_train)}')
print(f'Testing Data Score: {classifier.score(X_test, target_test)}')

Training Data Score: 0.641860465116279
Testing Data Score: 0.5916473317865429


In [48]:
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': target_test}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,0,0
1,1,0
2,0,1
3,0,0
4,0,0
