In [112]:
# importing necessary modules.
import pandas as pd
import config7 as cfg
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt

In [113]:
# creating a connection to an aws rds postgres cloud database.
rds_connection_string = f'postgres:{cfg.password}@test-db.cy2enoewwvsi.us-east-2.rds.amazonaws.com:5432/stocks_db'
engine = create_engine(f'postgres://{rds_connection_string}')

In [114]:
# checking out the table names in the database.
engine.table_names()

['sandp', 'avg_sandp', 'russell_2000', 'sandp_russell', 'new_route', 'sandp2']

In [115]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,above_below_sandp_return,market_cap,industry,sector
0,A,"Agilent Technologies, Inc.",2.55170,5.17165,1.48130,6.30885,7.00140,18.18070,13.57995,16.746223,...,0.18005,0.15185,0.31885,-0.06740,0.01880,215.407785,1,7.462700e+09,Medical Diagnostics & Research,Healthcare
1,AAL,"American Airlines Group, Inc.",-99.00000,-99.00000,0.11565,-99.00000,27.33480,-99.00000,2.20925,61.812855,...,-0.00170,-0.06235,0.00975,-99.00000,0.01435,492.976589,1,6.133809e+08,Airlines,Industrials
2,AAP,"Advance Auto Parts, Inc.",2.95660,3.10380,0.70600,14.16840,57.04610,7.41850,5.46130,0.499984,...,0.03670,0.19520,-0.57150,-99.00000,0.10360,158.290852,1,1.912092e+09,Retail - Apparel & Specialty,Consumer Cyclical
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.09060,17.32790,15.35960,0.493341,...,0.44800,0.44245,-99.00000,0.26940,0.21645,641.441749,1,9.151793e+10,Computer Hardware,Technology
4,ABBV,"AbbVie, Inc.",16.46870,-99.00000,3.01900,10.48360,-99.00000,9.21230,8.74540,0.001609,...,-99.00000,-99.00000,-99.00000,0.46130,0.14080,127.493713,0,6.619065e+10,Drug Manufacturers,Healthcare
5,ABC,AmerisourceBergen Corp.,2.65245,39.54195,0.10130,13.48380,19.74495,9.91215,8.09975,2.439843,...,0.08720,0.11490,0.06560,-99.00000,0.01405,258.118955,1,7.017275e+09,Medical Distribution,Healthcare
6,ABMD,"ABIOMED, Inc.",3.53820,5.63330,4.51390,-99.00000,5.86270,-99.00000,-52.55390,-99.000000,...,-0.04700,-0.11940,-99.00000,0.02470,0.09900,2583.734150,1,4.310088e+08,Medical Devices,Healthcare
7,ABT,Abbott Laboratories,1.64610,5.42750,1.15640,7.34805,5.12415,5.54100,4.78745,2.092158,...,0.19580,0.15215,0.29370,0.18890,0.11545,167.453799,1,3.765926e+10,Drug Manufacturers,Healthcare
8,ACN,Accenture Plc,7.75270,6.74855,1.01325,13.02975,6.11045,7.59970,7.49890,1.573638,...,0.01790,0.03930,-0.11090,-99.00000,0.02345,363.726903,1,2.405644e+10,Application Software,Technology
9,ADBE,"Adobe, Inc.",3.28150,17.69555,5.09575,33.68390,7.29175,17.04700,14.94845,1.887970,...,0.18440,0.10660,1.18980,0.02870,0.08435,750.511115,1,8.597522e+09,Application Software,Technology


In [116]:
# converting industry columns into 1's and 0's.
industry = pd.get_dummies(sandp['industry'])

In [117]:
# converting sector columns into 1's and 0's.
sector = pd.get_dummies(sandp['sector'])

In [118]:
# bringing everything together.
sandp = pd.concat([sandp, industry, sector], axis=1)

In [119]:
# dropping industry and sector columns.
sandp.drop(['industry', 'sector'], axis=1, inplace=True)

In [120]:
sandp = sandp.drop(columns=['symbol', 'name'])

In [121]:
# dropping unnecessary columns.
sandp.drop(['percent_return_on_investment'], axis=1, inplace=True)

In [122]:
names = []
for x in sandp.columns:
    names.append(x)

In [123]:
X = sandp[['net_income_growth','short_term_coverage_ratio','eps_diluted_growth', 'gross_profit_margin', 'price_to_book', 'price_to_sales', 'price_to_free_cash_flow', 'asset_growth', 'Airlines',
'Computer Hardware','Drug Manufacturers', 'Medical Devices','Application Software', 'Semiconductors','Consumer Packaged Goods', 
'Business Services', 'Credit Services','Utilities - Regulated', 'Insurance - Life', 'REITs','Brokers & Exchanges','Biotechnology',
'Asset Management', 'Online Media','Oil & Gas - E&P','Autos', 'Banks', 'Travel & Leisure', 'Entertainment','Agriculture',
'Transportation & Logistics','Oil & Gas - Integrated','Industrial Distribution','Metals & Mining','Oil & Gas - Services',
'Personal Services','Engineering & Construction', 'Oil & Gas - Midstream','Beverages - Non-Alcoholic', 'Truck Manufacturing', 
'Employment Services', 'Forest Products']]
target = sandp['above_below_sandp_return']

In [124]:
from sklearn.model_selection import train_test_split
X_train, X_test, target_train, target_test = train_test_split(X, target, random_state=1)

In [125]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [126]:
classifier.fit(X_train, target_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [127]:
print(f'Training Data Score: {classifier.score(X_train, target_train)}')
print(f'Testing Data Score: {classifier.score(X_test, target_test)}')

Training Data Score: 0.7207977207977208
Testing Data Score: 0.7777777777777778


In [128]:
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': target_test}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,1,1
3,1,1
4,1,1


In [129]:
target_test.value_counts()

1    63
0    54
Name: above_below_sandp_return, dtype: int64

In [130]:
target_test.mean()

0.5384615384615384

In [131]:
from sklearn.metrics import classification_report
print(classification_report(target_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.67      0.73        54
           1       0.75      0.87      0.81        63

    accuracy                           0.78       117
   macro avg       0.79      0.77      0.77       117
weighted avg       0.78      0.78      0.77       117



In [132]:
predictions2 = classifier.predict(X)
pd.DataFrame({'Prediction': predictions2, 'Actual': target}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,0,0


In [133]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,above_below_sandp_return,market_cap,industry,sector
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,1,7462700000.0,Medical Diagnostics & Research,Healthcare
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,1,613380900.0,Airlines,Industrials
2,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,1,1912092000.0,Retail - Apparel & Specialty,Consumer Cyclical
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.0906,17.3279,15.3596,0.493341,...,0.448,0.44245,-99.0,0.2694,0.21645,641.441749,1,91517930000.0,Computer Hardware,Technology
4,ABBV,"AbbVie, Inc.",16.4687,-99.0,3.019,10.4836,-99.0,9.2123,8.7454,0.001609,...,-99.0,-99.0,-99.0,0.4613,0.1408,127.493713,0,66190650000.0,Drug Manufacturers,Healthcare


In [134]:
sandp = sandp[['name', 'percent_return_on_investment', 'above_below_sandp_return']]

In [135]:
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return
0,"Agilent Technologies, Inc.",215.407785,1
1,"American Airlines Group, Inc.",492.976589,1
2,"Advance Auto Parts, Inc.",158.290852,1
3,"Apple, Inc.",641.441749,1
4,"AbbVie, Inc.",127.493713,0


In [136]:
sandp['predictions'] = predictions2

In [137]:
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return,predictions
0,"Agilent Technologies, Inc.",215.407785,1,1
1,"American Airlines Group, Inc.",492.976589,1,1
2,"Advance Auto Parts, Inc.",158.290852,1,1
3,"Apple, Inc.",641.441749,1,0
4,"AbbVie, Inc.",127.493713,0,0


In [138]:
predicted_positives = []
for (x,y) in zip(sandp['predictions'],sandp['percent_return_on_investment']):
    if x == 1:
        predicted_positives.append(y)


In [139]:
predicted = pd.DataFrame(predicted_positives)

In [140]:
len(predicted)

308

In [141]:
for x in predicted[0]:
    if x < 0:
        print(x)

-22.4076884167931
-58.564988730278
-20.8840486867393
-6.37676456271296
-12.3555266156601


In [142]:
gain = predicted.sum()

In [143]:
(gain - len(predicted))/len(predicted)

0    289.720024
dtype: float64

In [144]:
sandp_gain = sandp['percent_return_on_investment'].sum()

In [145]:
(sandp_gain - len(sandp['percent_return_on_investment']))/len(sandp['percent_return_on_investment'])

228.6078093352651

In [146]:
# import pickle
# # Save the trained model as a pickle string.
# saved_model = pickle.dumps(classifier)

In [147]:
# saved_model

In [643]:
# # Load the pickled model
# clf_from_pickle = pickle.loads(saved_model)

# # Use the loaded pickled model to make predictions
# predictions = clf_from_pickle.predict(X)

# pd.DataFrame({'Prediction': predictions.ravel(), 'Actual': X}).head()