In [3]:
# importing necessary modules.
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
import config9 as cfg

In [4]:
# saving a postgress password and database name to variables to access postgres sql optimal portfolio database.
postgres_password = cfg.password
database_name = 'optimal_portfolio'

In [5]:
# creating a connection to an aws rds postgres cloud database.
rds_connection_string = f'postgres:{postgres_password}@localhost:5432/{database_name}'
engine = create_engine(f'postgres://{rds_connection_string}')

In [6]:
# checking out the table names in the database.
engine.table_names()

['sandp']

In [7]:
# checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,inventory_growth,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,industry,sector,above_below_sandp_return
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.0758,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,Medical Diagnostics & Research,Healthcare,0
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,0.0637,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,Airlines,Industrials,1
2,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0054,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,Retail - Apparel & Specialty,Consumer Cyclical,0
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.0906,17.3279,15.3596,0.493341,...,0.6019,0.448,0.44245,-99.0,0.2694,0.21645,641.441749,Computer Hardware,Technology,1
4,ABBV,"AbbVie, Inc.",16.4687,-99.0,3.019,10.4836,-99.0,9.2123,8.7454,0.001609,...,-99.0,-99.0,-99.0,-99.0,0.4613,0.1408,127.493713,Drug Manufacturers,Healthcare,0


In [8]:
# converting industry columns into 1's and 0's.
industry = pd.get_dummies(sandp['industry'])

In [9]:
# converting sector columns into 1's and 0's.
sector = pd.get_dummies(sandp['sector'])

In [10]:
# bringing everything together.
sandp = pd.concat([sandp, industry, sector], axis=1)

In [11]:
# dropping industry and sector columns.
sandp.drop(['industry', 'sector'], axis=1, inplace=True)

In [12]:
sandp = sandp.drop(columns=['symbol', 'name'])

In [13]:
# dropping unnecessary columns.
sandp.drop(['percent_return_on_investment'], axis=1, inplace=True)

In [14]:
# looping through the columns to find to be able to view all the column names.
names = []
for x in sandp.columns:
    names.append(x)

In [15]:
# setting an sandp dataframe with just the predictor feature columns to a variable.
X = sandp[['net_income_growth','short_term_coverage_ratio','eps_diluted_growth', 'gross_profit_margin', 'price_to_book', 'price_to_sales', 'price_to_free_cash_flow', 'asset_growth', 'Airlines',
'Computer Hardware','Drug Manufacturers', 'Medical Devices','Application Software', 'Semiconductors','Consumer Packaged Goods', 
'Business Services', 'Credit Services','Utilities - Regulated', 'Insurance - Life', 'REITs','Brokers & Exchanges','Biotechnology',
'Asset Management', 'Online Media','Oil & Gas - E&P','Autos', 'Banks', 'Travel & Leisure', 'Entertainment','Agriculture',
'Transportation & Logistics','Oil & Gas - Integrated','Industrial Distribution','Metals & Mining','Oil & Gas - Services',
'Personal Services','Engineering & Construction', 'Oil & Gas - Midstream','Beverages - Non-Alcoholic', 'Truck Manufacturing', 
'Employment Services', 'Forest Products']]

# setting the above_below_sandp_return column of 0's and 1's that represent if a stock beat the sandp return as a variable, which is the variable we want to predict in our upcoming machine learning module.
target = sandp['above_below_sandp_return']

In [16]:
# splitting the 468 stocks/rows in our sandp dataframe into a training set and a testing set. 
from sklearn.model_selection import train_test_split
X_train, X_test, target_train, target_test = train_test_split(X, target, random_state=76)

In [17]:
# creating a logistic regression model and saving it as a variable called classifier.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
# fitting/training the logsitic regression model on the training data.
classifier.fit(X_train, target_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# viewing the accuracy of the logistic regression model on the training and testing data.
print(f'Training Data Score: {classifier.score(X_train, target_train)}')
print(f'Testing Data Score: {classifier.score(X_test, target_test)}')

Training Data Score: 0.7150997150997151
Testing Data Score: 0.7521367521367521


In [28]:
# appplying the model to the testing data and viewing the results.
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': target_test}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,1,1
4,0,0


In [29]:
# viwewing the distribution of the testing data's 0's and 1's.
target_test.value_counts()

0    80
1    37
Name: above_below_sandp_return, dtype: int64

In [30]:
# calculating the mean of the testing data, so we can compare it to the machine learning models accurac. If the accuracy is above the mean, it means we are doing better than just randomly choosing all 1's.
target_test.mean()

0.3162393162393162

In [31]:
# viewin the overal report card of our logistic regression model.
from sklearn.metrics import classification_report
print(classification_report(target_test, predictions))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83        80
           1       0.65      0.46      0.54        37

    accuracy                           0.75       117
   macro avg       0.72      0.67      0.69       117
weighted avg       0.74      0.75      0.74       117



In [32]:
# applying our machine learning model on the whole sandp/468 stocks to get a predicted list of 0's and 1's.
predictions2 = classifier.predict(X)
# viewing results.
pd.DataFrame({'Prediction': predictions2, 'Actual': target}).reset_index(drop=True).head()

Unnamed: 0,Prediction,Actual
0,0,0
1,1,1
2,1,0
3,0,1
4,0,0


In [33]:
# reading in from the sql database and checking out the SandP_500 table.
sandp = pd.read_sql_query('SELECT * FROM sandp', con=engine)
sandp.head()

Unnamed: 0,symbol,name,price_to_bookvalue,price_to_book,price_to_sales,price_to_earnings,receivables_turnover,price_to_free_cash_flow,price_to_operating_cash_flow,enterprise_value_multiple,...,inventory_growth,asset_growth,book_value_per_share_growth,debt_growth,randd_expense_growth,sganda_expense_growth,percent_return_on_investment,industry,sector,above_below_sandp_return
0,A,"Agilent Technologies, Inc.",2.5517,5.17165,1.4813,6.30885,7.0014,18.1807,13.57995,16.746223,...,0.0758,0.18005,0.15185,0.31885,-0.0674,0.0188,215.407785,Medical Diagnostics & Research,Healthcare,0
1,AAL,"American Airlines Group, Inc.",-99.0,-99.0,0.11565,-99.0,27.3348,-99.0,2.20925,61.812855,...,0.0637,-0.0017,-0.06235,0.00975,-99.0,0.01435,492.976589,Airlines,Industrials,1
2,AAP,"Advance Auto Parts, Inc.",2.9566,3.1038,0.706,14.1684,57.0461,7.4185,5.4613,0.499984,...,0.0054,0.0367,0.1952,-0.5715,-99.0,0.1036,158.290852,Retail - Apparel & Specialty,Consumer Cyclical,0
3,AAPL,"Apple, Inc.",5.42765,4.62715,3.98235,19.54495,10.0906,17.3279,15.3596,0.493341,...,0.6019,0.448,0.44245,-99.0,0.2694,0.21645,641.441749,Computer Hardware,Technology,1
4,ABBV,"AbbVie, Inc.",16.4687,-99.0,3.019,10.4836,-99.0,9.2123,8.7454,0.001609,...,-99.0,-99.0,-99.0,-99.0,0.4613,0.1408,127.493713,Drug Manufacturers,Healthcare,0


In [34]:
# refining the sandp dataframe.
sandp = sandp[['name', 'percent_return_on_investment', 'above_below_sandp_return']]

In [35]:
# viewing the newly refined dataframe.
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return
0,"Agilent Technologies, Inc.",215.407785,0
1,"American Airlines Group, Inc.",492.976589,1
2,"Advance Auto Parts, Inc.",158.290852,0
3,"Apple, Inc.",641.441749,1
4,"AbbVie, Inc.",127.493713,0


In [36]:
# appending the predictions made from applying the logistic regression model to the total 468 sandp stocks to the read in and refined sandp dataframe.
sandp['predictions'] = predictions2

In [37]:
# viewing the results.
sandp.head()

Unnamed: 0,name,percent_return_on_investment,above_below_sandp_return,predictions
0,"Agilent Technologies, Inc.",215.407785,0,0
1,"American Airlines Group, Inc.",492.976589,1,1
2,"Advance Auto Parts, Inc.",158.290852,0,1
3,"Apple, Inc.",641.441749,1,0
4,"AbbVie, Inc.",127.493713,0,0


In [38]:
# getting a sense of the distribution of our predictions.
sandp['predictions'].value_counts()

0    360
1    108
Name: predictions, dtype: int64

In [39]:
# looping through the predictions columns and appending only the the 1's corresponding returns, stocks predicted to beat the sandp, to a list
r = 0
predicted_positives = []
for (x,y) in zip(sandp['predictions'],sandp['percent_return_on_investment']):
    if x == 1:
        predicted_positives.append(y)
        
        # getting a feel for how many stocks of our predicted 1's actually did beat the sandp return.
        if y > 228:
            r += 1

# viewing the number of stocks that did beat the sandp out of our 105 predicted 1's.            
print(r)

74


In [40]:
# making a dataframe out of the predicted 1's list created above and saving to a variable.
predicted = pd.DataFrame(predicted_positives)

In [41]:
# observing the length of the dataframe.
len(predicted)

108

In [42]:
# calculating the accuracy of our results, aka the how many of our predicted 1's were actually 1's.
r/len(predicted)

0.6851851851851852

In [43]:
# viewing how many our our predicted 1's were under 0.
for x in predicted[0]:
    if x < 0:
        print(x)

In [44]:
# summing up the predicted column, which is the percent gain of investment over 10 years, and labeling as gain.
gain = predicted.sum()

In [45]:
# viewing the percent gain.
gain

0    39981.339611
dtype: float64

In [46]:
# calculating the percent return, which is gain/profit on investment divided by the cost of the investment, which is the number of stocks in our portfolio, 107.
((gain/100)/len(predicted))*100

0    370.197589
dtype: float64

In [47]:
# summing up the sandp percent gain over all 468 stocks.
sandp_gain = sandp['percent_return_on_investment'].sum()

In [48]:
# calculating the percent return of the sandp just like above so we can compare to the results above. Want percent return above to be higher than this one.
((sandp_gain/100)/len(sandp['percent_return_on_investment']))*100

229.60780933526505