# Daily Model Evaluation

In [1]:
# Basics
from pymongo import MongoClient
import os
import numpy as np
import pandas as pd
import time
import mysql.connector
from sqlalchemy import create_engine

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
%matplotlib inline


In [15]:
# Create Predictions Dataframe

# Create variabels
db = os.environ['DB_NAME']
user = os.environ['DB_USER']
pwd = os.environ['DB_PWD']
IP = os.environ['IP']

connection_string = 'mysql+mysqlconnector://'+user+':'+pwd+'@'+IP+':3306'+'/'+db
query = 'SELECT * from daily_model_predictions'
engine = create_engine(connection_string, echo=False)
cnx = engine.raw_connection()

predictions = pd.read_sql(query, cnx, index_col='run_time')

predictions.index=pd.to_datetime(predictions.index)

In [16]:
predictions.head()

Unnamed: 0_level_0,model_name,model_version_number,Company,Prediction
run_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-21 17:25:27,Daily_Stock_Prediction_latest.,1526790783,AAPL,1
2018-05-21 17:25:27,Daily_Stock_Prediction_latest.,1526790783,AMZN,1
2018-05-21 17:25:27,Daily_Stock_Prediction_latest.,1526790783,BA,1
2018-05-21 17:25:27,Daily_Stock_Prediction_latest.,1526790783,BABA,0
2018-05-21 17:25:27,Daily_Stock_Prediction_latest.,1526790783,BAC,0


In [37]:
predictions_daily = predictions.groupby([pd.Grouper(freq="D"), 'Company'])['Prediction'].count().to_frame()
predictions_daily.columns = ['Number of Predictions']
predictions_daily['Number of Increase'] = predictions.groupby([pd.Grouper(freq="D"), 'Company'])['Prediction'].sum()
predictions_daily['Score'] = predictions_daily['Number of Increase']/predictions_daily['Number of Predictions']
predictions_daily['Day Prediction'] = np.where(predictions_daily['Score']>=0.5, 1, 0)

# Rename the Index
predictions_daily = predictions_daily.reindex(predictions_daily.index.rename(['Time', 'Company']))

In [38]:
predictions_daily.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Predictions,Number of Increase,Score,Day Prediction
Time,Company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-05-21,AAPL,8,8,1.0,1
2018-05-21,AMZN,8,8,1.0,1
2018-05-21,BA,8,2,0.25,0
2018-05-21,BABA,8,2,0.25,0
2018-05-21,BAC,8,5,0.625,1


In [17]:
#Setup Mongo and create the database and collection
User = os.environ['MONGODB_USER']
password = os.environ['MONGODB_PASS']
IP = os.environ['IP']

client = MongoClient(IP, username=User, password=password)
db = client['stock_tweets']

#Grab references
iex_coll_reference = db.iex

In [18]:
stock_data = pd.DataFrame(list(iex_coll_reference.find()))
stock_data.head()

Unnamed: 0,Ticker,_id,companyName,high,latestPrice,latestUpdate,latestVolume,low,marketCap,open,peRatio,previousClose,sector,week52High,week52Low,ytdChange
0,AAPL,5aa6c1ee12035200013785b3,Apple Inc.,182.2,181.73,2018-03-12 18:07:31,22006161.0,180.21,922100382490,180.23,19.75,179.98,Technology,181.73,138.62,0.044816
1,FB,5aa6c1ee12035200013785b4,Facebook Inc.,186.1,185.46,2018-03-12 18:07:22,9955873.0,184.22,538761440579,185.26,30.16,185.23,Technology,195.32,137.6,0.021001
2,GOOG,5aa6c1ee12035200013785b5,Alphabet Inc.,1177.05,1172.005,2018-03-12 18:06:08,1353262.0,1157.42,814892181648,1163.85,47.14,1160.04,Technology,1186.89,803.37,0.089239
3,GOOGL,5aa6c1ee12035200013785b6,Alphabet Inc.,1178.16,1173.88,2018-03-12 18:07:13,1586022.0,1159.2,816195864516,1165.0,36.67,1160.84,Technology,1198.0,824.3,0.081652
4,AMZN,5aa6c1ef12035200013785b7,Amazon.com Inc.,1605.33,1600.745,2018-03-12 18:07:10,3518847.0,1586.7,774932152651,1592.6,351.04,1578.89,Technology,1600.745,833.5,0.327903


In [34]:
# Need to find end of day last price
# Need to convert the created_at to a time stamp
stock_data.index=pd.to_datetime(stock_data['latestUpdate'])
stock_data['latestUpdate'] = pd.to_datetime(stock_data['latestUpdate'])
#Group By hourly and stock price
# Need to get the first stock price in teh hour, and then the last to take the difference to see how much change.
stock_daily = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="D"), 'Ticker']).first()['latestPrice'].to_frame()
stock_daily.columns = ['First_Price']
stock_daily['Last_Price'] = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="D"), 'Ticker']).last()['latestPrice']

# Then need to take the difference and turn into a percentage.
stock_daily['Price_Percent_Change'] = ((stock_daily['Last_Price'] 
                                                   - stock_daily['First_Price'])/stock_daily['First_Price'])*100



# Classification Labels
stock_daily['Price_Change'] = np.where(stock_daily['Price_Percent_Change']>=0, 1, 0)

# Rename the Index
stock_daily = stock_daily.reindex(stock_daily.index.rename(['Time', 'Company']))

In [35]:
stock_daily.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,First_Price,Last_Price,Price_Percent_Change,Price_Change
Time,Company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-02-26,PCLN,1905.64,1905.64,0.0,1
2018-03-12,AAPL,181.73,181.75,0.011005,1
2018-03-12,AMZN,1600.745,1598.39,-0.147119,0
2018-03-12,BA,345.91,344.19,-0.497239,0
2018-03-12,BABA,192.9,192.74,-0.082945,0


In [39]:
# Join Dataframes
daily_df = pd.concat([predictions_daily, stock_daily], axis=1, join='inner')
daily_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Predictions,Number of Increase,Score,Day Prediction,First_Price,Last_Price,Price_Percent_Change,Price_Change
Time,Company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-05-21,AAPL,8,8,1.0,1,188.18,187.51,-0.356042,0
2018-05-21,AMZN,8,8,1.0,1,1585.83,1584.43,-0.088282,0
2018-05-21,BA,8,2,0.25,0,358.7,363.81,1.424589,1
2018-05-21,BABA,8,2,0.25,0,198.04,197.72,-0.161584,0
2018-05-21,BAC,8,5,0.625,1,30.53,30.535,0.016377,1


In [41]:
# To flatten after combined everything. 
daily_df.reset_index(inplace=True)

In [43]:
daily_df_subset = daily_df[['Time','Company','Day Prediction','Price_Change']]
daily_df_subset.head()

Unnamed: 0,Time,Company,Day Prediction,Price_Change
0,2018-05-21,AAPL,1,0
1,2018-05-21,AMZN,1,0
2,2018-05-21,BA,0,1
3,2018-05-21,BABA,0,0
4,2018-05-21,BAC,1,1


In [45]:
from sklearn.metrics import classification_report
print(classification_report(daily_df_subset['Day Prediction'], daily_df_subset['Price_Change']))

             precision    recall  f1-score   support

          0       0.37      0.44      0.40        57
          1       0.53      0.46      0.49        78

avg / total       0.46      0.45      0.46       135

