In [600]:
# importing necessary modules.
import pandas as pd
import config7 as cfg
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt

In [601]:
# creating a connection to an aws rds postgres cloud database.
rds_connection_string = f'postgres:{cfg.password}@test-db.cy2enoewwvsi.us-east-2.rds.amazonaws.com:5432/stocks_db'
engine = create_engine(f'postgres://{rds_connection_string}')

In [602]:
# checking out the table names in the database.
engine.table_names()

['new_route', 'avg_sandp', 'sandp2', 'sandp', 'russell_2000', 'sandp_russell']

In [603]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,above_below_sandp_return,market_cap,industry,sector
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,0,7462700000.0,Medical Diagnostics & Research,Healthcare
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,1,613380900.0,Airlines,Industrials
2,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,0,1912092000.0,Retail - Apparel & Specialty,Consumer Cyclical
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.0906,17.3279,15.3596,0.493341,...,0.448,0.44245,-99.0,0.2694,0.21645,641.441749,1,91517930000.0,Computer Hardware,Technology
4,ABBV,"AbbVie, Inc.",16.4687,-99.0,3.019,10.4836,-99.0,9.2123,8.7454,0.001609,...,-99.0,-99.0,-99.0,0.4613,0.1408,127.493713,0,66190650000.0,Drug Manufacturers,Healthcare


In [604]:
sandp['above_below_sandp_return'].value_counts()

0    299
1    169
Name: above_below_sandp_return, dtype: int64

In [605]:
# converting industry columns into 1's and 0's.
industry = pd.get_dummies(sandp['industry'])

In [606]:
# converting sector columns into 1's and 0's.
sector = pd.get_dummies(sandp['sector'])

In [607]:
# bringing everything together.
sandp = pd.concat([sandp, industry, sector], axis=1)

In [608]:
# dropping industry and sector columns.
sandp.drop(['industry', 'sector'], axis=1, inplace=True)

In [609]:
sandp = sandp.drop(columns=['symbol', 'name'])

In [610]:
# dropping unnecessary columns.
sandp.drop(['percent_return_on_investment'], axis=1, inplace=True)

In [611]:
names = []
for x in sandp.columns:
    names.append(x)

In [612]:
X = sandp[['net_income_growth','short_term_coverage_ratio','eps_diluted_growth', 'gross_profit_margin', 'price_to_book', 'price_to_sales', 'price_to_free_cash_flow', 'asset_growth', 'Airlines',
'Computer Hardware','Drug Manufacturers', 'Medical Devices','Application Software', 'Semiconductors','Consumer Packaged Goods', 
'Business Services', 'Credit Services','Utilities - Regulated', 'Insurance - Life', 'REITs','Brokers & Exchanges','Biotechnology',
'Asset Management', 'Online Media','Oil & Gas - E&P','Autos', 'Banks', 'Travel & Leisure', 'Entertainment','Agriculture',
'Transportation & Logistics','Oil & Gas - Integrated','Industrial Distribution','Metals & Mining','Oil & Gas - Services',
'Personal Services','Engineering & Construction', 'Oil & Gas - Midstream','Beverages - Non-Alcoholic', 'Truck Manufacturing', 
'Employment Services', 'Forest Products']]
target = sandp['above_below_sandp_return']

In [613]:
from sklearn.model_selection import train_test_split
X_train, X_test, target_train, target_test = train_test_split(X, target, random_state=76)

In [614]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [615]:
classifier.fit(X_train, target_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [616]:
print(f'Training Data Score: {classifier.score(X_train, target_train)}')
print(f'Testing Data Score: {classifier.score(X_test, target_test)}')

Training Data Score: 0.717948717948718
Testing Data Score: 0.7521367521367521


In [617]:
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': target_test}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,0,0


In [618]:
target_test.value_counts()

0    80
1    37
Name: above_below_sandp_return, dtype: int64

In [619]:
target_test.mean()

0.3162393162393162

In [620]:
from sklearn.metrics import classification_report
print(classification_report(target_test, predictions))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83        80
           1       0.65      0.46      0.54        37

    accuracy                           0.75       117
   macro avg       0.72      0.67      0.69       117
weighted avg       0.74      0.75      0.74       117



In [621]:
predictions2 = classifier.predict(X)
pd.DataFrame({'Prediction': predictions2, 'Actual': target}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,0,0
3,0,1
4,0,0


In [622]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,above_below_sandp_return,market_cap,industry,sector
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,0,7462700000.0,Medical Diagnostics & Research,Healthcare
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,1,613380900.0,Airlines,Industrials
2,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,0,1912092000.0,Retail - Apparel & Specialty,Consumer Cyclical
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.0906,17.3279,15.3596,0.493341,...,0.448,0.44245,-99.0,0.2694,0.21645,641.441749,1,91517930000.0,Computer Hardware,Technology
4,ABBV,"AbbVie, Inc.",16.4687,-99.0,3.019,10.4836,-99.0,9.2123,8.7454,0.001609,...,-99.0,-99.0,-99.0,0.4613,0.1408,127.493713,0,66190650000.0,Drug Manufacturers,Healthcare


In [623]:
sandp = sandp[['name', 'percent_return_on_investment', 'above_below_sandp_return']]

In [624]:
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return
0,"Agilent Technologies, Inc.",215.407785,0
1,"American Airlines Group, Inc.",492.976589,1
2,"Advance Auto Parts, Inc.",158.290852,0
3,"Apple, Inc.",641.441749,1
4,"AbbVie, Inc.",127.493713,0


In [625]:
sandp['predictions'] = predictions2

In [626]:
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return,predictions
0,"Agilent Technologies, Inc.",215.407785,0,0
1,"American Airlines Group, Inc.",492.976589,1,1
2,"Advance Auto Parts, Inc.",158.290852,0,0
3,"Apple, Inc.",641.441749,1,0
4,"AbbVie, Inc.",127.493713,0,0


In [627]:
sandp['predictions'].value_counts()

0    361
1    107
Name: predictions, dtype: int64

In [628]:
r = 0
predicted_positives = []
for (x,y) in zip(sandp['predictions'],sandp['percent_return_on_investment']):
    if x == 1:
        predicted_positives.append(y)
        if y > 228:
            r += 1
print(r)

74


In [629]:
predicted = pd.DataFrame(predicted_positives)

In [630]:
len(predicted)

107

In [631]:
r/len(predicted)

0.6915887850467289

In [632]:
for x in predicted[0]:
    if x < 0:
        print(x)

In [633]:
gain = predicted.sum()

In [634]:
gain

0    39688.326776
dtype: float64

In [635]:
(gain - len(predicted))/len(predicted)

0    369.918942
dtype: float64

In [636]:
sandp_gain = sandp['percent_return_on_investment'].sum()

In [637]:
(sandp_gain - len(sandp['percent_return_on_investment']))/len(sandp['percent_return_on_investment'])

228.6078093352651

In [560]:
# import pickle
# # Save the trained model as a pickle string.
# saved_model = pickle.dumps(classifier)

In [429]:
# saved_model

In [170]:
# # Load the pickled model
# clf_from_pickle = pickle.loads(saved_model)

# # Use the loaded pickled model to make predictions
# predictions = clf_from_pickle.predict(X)

# pd.DataFrame({'Prediction': predictions.ravel(), 'Actual': X}).head()