In [1]:
import pandas as pd
import requests
import yfinance as yf
import time
from tqdm import tqdm

<h3>Downloading Ticker Names

In [2]:
# Import Fortune 500 tickers

url='https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S&P_500_component_stocks'

f500_df=pd.read_html(url, header=0)[0]
f500_df.head()

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [3]:
# Save the first 100 ticker symbols to a list

ticker_list = list(f500_df['Symbol'])[:100]
ticker_param = ' '.join(ticker_list)

<h3>Downloading Ticker Info

In [4]:
# tickers = yf.Tickers(ticker_param)

# ticker_dicts = []
# for t in tqdm(ticker_list):
#     ticker_info = tickers.tickers[t].info
#     ticker_dict = dict(ticker_info)
#     ticker_dicts.append(ticker_dict)

# ticker_info_df = pd.DataFrame(ticker_dicts)
# ticker_info_df.to_csv('data/ticker_info.csv')

In [5]:
ticker_info_df = pd.read_csv('data/ticker_info.csv')

<h3>Downloading Stock Prices

In [6]:
# Import historical stock prices from Yahoo finance and save to a dataframe (last month)

data = yf.download(ticker_param, start="2021-07-01", end="2021-07-31")

[*********************100%***********************]  100 of 100 completed

2 Failed downloads:
- BF.B: No data found for this date range, symbol may be delisted
- BRK.B: No data found, symbol may be delisted


In [7]:
month_open = data.loc['2021-07-01', 'Open']
month_close = data.loc['2021-07-30', 'Close']

month_open = month_open.rename('monthOpenPrice')
month_close = month_close.rename('monthClosePrice')

<h3>Merging and Subsetting Data

In [8]:
# Merge the opening and closing prices to the ticker info dataframe

merged_df = ticker_info_df.merge(month_open, how='left', left_on='symbol', right_on=month_open.index)
merged_df = merged_df.merge(month_close, how='left', left_on='symbol', right_on=month_close.index)

In [9]:
# Subset the data

keep_cols = [
    'symbol',
    'sector',
    'fullTimeEmployees',
    'industry',
    'ebitdaMargins',
    'profitMargins',
    'grossMargins',
    'operatingCashflow',
    'operatingMargins',
    'ebitda',
    'grossProfits',
    'freeCashflow',
    'currentRatio',
    'returnOnAssets',
    'targetMeanPrice',
    'debtToEquity',
    'returnOnEquity',
    'totalCash',
    'totalDebt',
    'totalRevenue',
    'totalCashPerShare',
    'revenuePerShare',
    'quickRatio',
    'heldPercentInstitutions',
    'netIncomeToCommon',
    'priceToBook',
    'heldPercentInsiders',
    'monthOpenPrice',
    'monthClosePrice'
]

subset_df = merged_df[keep_cols].copy()

In [10]:
# Impute null values using the mean

for col in subset_df.columns:
    if subset_df[col].dtype == 'float64':
        subset_df.loc[:, col].fillna(subset_df.loc[:, col].mean(), inplace=True)

In [11]:
subset_df[subset_df['sector'].isna()]['symbol']

64    BRK.B
77     BF.B
Name: symbol, dtype: object

In [12]:
# Drop BRK.B and BF.B (see download error message above)

subset_df.dropna(inplace=True)
subset_df.reset_index(drop=True, inplace=True)

<h3>"Predict" Last Month's Price Increase/Decrease with Current Stock Info

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [14]:
# Get dummies for categorical data

subset_df.index = subset_df['symbol']
subset_df.drop(columns=['symbol'], inplace=True)

industry_dummies = pd.get_dummies(subset_df['industry'], drop_first = True)
sector_dummies = pd.get_dummies(subset_df['sector'], drop_first = True)

dummies_df = pd.concat([subset_df, industry_dummies, sector_dummies], axis=1)

dummies_df.drop(columns=['industry', 'sector'], inplace=True)

dummy_names = list(industry_dummies.columns) + list(sector_dummies.columns)

In [15]:
# Define target variable

dummies_df['priceGain'] = (dummies_df['monthOpenPrice'] < dummies_df['monthClosePrice']).astype(int)
model_df = dummies_df.drop(columns = ['monthOpenPrice', 'monthClosePrice']).copy()

In [16]:
# Train test split

X = model_df.drop(columns=['priceGain']).copy()
y = model_df['priceGain']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=9)

In [17]:
# Fit models

clf = RandomForestClassifier(random_state=9)
logit = LogisticRegression()

clf_fit = clf.fit(X_train, y_train)
logit_fit = logit.fit(X_train, y_train)

clf_preds = clf_fit.predict(X_test)
log_preds = logit_fit.predict(X_test)

In [18]:
# Show the results

print('Logistic Regression Results: ')
print(classification_report(y_test, log_preds))
print('')
print('Random Forest Results: ')
print(classification_report(y_test, clf_preds))

Logistic Regression Results: 
              precision    recall  f1-score   support

           0       0.53      0.50      0.51        20
           1       0.67      0.69      0.68        29

    accuracy                           0.61        49
   macro avg       0.60      0.59      0.60        49
weighted avg       0.61      0.61      0.61        49


Random Forest Results: 
              precision    recall  f1-score   support

           0       0.56      0.45      0.50        20
           1       0.67      0.76      0.71        29

    accuracy                           0.63        49
   macro avg       0.61      0.60      0.60        49
weighted avg       0.62      0.63      0.62        49



In [19]:
# Show the "naive" predictions

y_train.value_counts()/len(y_train)

1    0.591837
0    0.408163
Name: priceGain, dtype: float64

<h3>Conclusion</h3>
Our predictions perform better than a guess. <br>
These preliminary results indicate that it is worth developing these models to see if performance improves. <br>
At the moment, the Random Forest model looks more promising (based on the f1-score for the 1s).