# Daily Model Deployment

1. Load Model from S3
2. Query the past 24 hours of data from MongoDB
3. Build Features
4. Make Predictions
5. Log predictions to database

In [64]:
############### Initialize ###################

# Basics
from pymongo import MongoClient
import os
import numpy as np
import pandas as pd
import time
import boto3
import io
import warnings
warnings.filterwarnings('ignore')
import time
from datetime import date, datetime, timedelta
import subprocess


# NLP
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

# Model Infrastructure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn import metrics
import dill as pickle

# Models
from sklearn.linear_model import LogisticRegression

# Database Setup
import mysql.connector
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy import Column
from sqlalchemy import Integer, String, DateTime, Float

In [3]:
########## Database Setup #################
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
User = os.environ['DB_USER']
password = os.environ['DB_PWD']
dbname = os.environ['DB_NAME']
IP = os.environ['IP']

# NOTE(review): the values are interpolated into the URL verbatim; a password
# containing URL-reserved characters (e.g. '@' or '/') would need escaping first.
engine = create_engine(
    'mysql+mysqlconnector://{}:{}@{}:3306/{}'.format(User, password, IP, dbname),
    echo=False)
conn = engine.connect()

# Check to see if the prediction table is created and if not, create it.
# The bind is passed explicitly to each call instead of relying on a
# MetaData object that is implicitly bound to the engine.
meta = MetaData()

# Dialect.has_table() is given the open Connection rather than the Engine.
if not engine.dialect.has_table(conn, 'daily_model_predictions'):
    print('Daily_Model_predictions Table does not exist')
    print('Daily_Model_predictions Table being created....')
    # One row per logged prediction: run time, model name, model version,
    # company and predicted value.
    t1 = Table('daily_model_predictions', meta,
               Column('run_time', DateTime, default=datetime.utcnow),
               Column('model_name', String(30)),
               Column('model_version_number', Integer),
               Column('Company', String(30)),
               Column('Prediction', Integer))
    t1.create(bind=engine)
else:
    print('Model_predictions Table Exists')

# Create table object by reflecting the live schema into a fresh MetaData.
# MetaData.reflect() is used in place of the deprecated ``reflect=True``
# constructor flag.
meta = MetaData()
meta.reflect(bind=engine)
daily_model_predictions_table = meta.tables['daily_model_predictions']

# Write Function
def database_log(name, version_number, company, prediction):
    """Insert one prediction row into the ``daily_model_predictions`` table.

    Args:
        name: Model name, written to the ``model_name`` column.
        version_number: Model version, written to ``model_version_number``.
        company: Company identifier, written to the ``Company`` column.
        prediction: Predicted value, written to the ``Prediction`` column.

    The insert is executed on the module-level ``conn`` connection.
    """
    ins = daily_model_predictions_table.insert().values(
        # Stamp the row in UTC so it agrees with the ``datetime.utcnow``
        # default declared for ``run_time`` and with the UTC query window
        # used when the tweets are loaded.
        run_time=datetime.utcnow(),
        model_name=name,
        model_version_number=version_number,
        Company=company,
        Prediction=prediction,
    )
    conn.execute(ins)

Daily_Model_predictions Table does not exist
Daily_Model_predictions Table being created....


In [4]:
# Download the Model
# Make sure the destination exists as a directory first; otherwise the copy
# may end up in a plain file named "Models" and the load step cannot find it.
os.makedirs('./Models', exist_ok=True)

# check=True raises CalledProcessError when the copy fails, so a failed
# download cannot go unnoticed and leave a stale or missing model on disk.
subprocess.run(
    ['aws', 's3', 'cp',
     's3://brandyn-twitter-sentiment-analysis/Models/Daily_Stock_Prediction_latest.pk',
     './Models'],
    check=True)

CompletedProcess(args=['aws', 's3', 'cp', 's3://brandyn-twitter-sentiment-analysis/Models/Daily_Stock_Prediction_latest.pk', './Models'], returncode=0)

In [5]:
# Load Model
### Validate Pickle ###
# NOTE: unpickling can execute arbitrary code contained in the file, so this
# must only ever be pointed at a trusted model artifact.
filename = 'Daily_Stock_Prediction_latest.pk'
model_path = './Models/' + filename

# Deserialize the downloaded model; the file handle is closed on exit.
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [68]:
###################### Bring In Data #######################
# Mongo connection settings come from the environment.
User = os.environ['MONGODB_USER']
password = os.environ['MONGODB_PASS']
IP = os.environ['IP']

# Connect and select the 'stock_tweets' database
client = MongoClient(IP, username=User, password=password)
db = client['stock_tweets']

# Collection handles
twitter_coll_reference = db['twitter']
iex_coll_reference = db['iex']

# Lower time bound of the query window, computed in UTC.
# Despite its name, the bound lies 24 hours back, not ten minutes.
lookback = timedelta(hours=24)
ten_min_bound = pd.to_datetime(datetime.utcnow() - lookback)

In [69]:
###################### Build Twitter Data Frames #####################

# Create Data Frame from Mongo DB
# NOTE(review): this pulls the whole collection into memory before trimming it.
twitter_data = pd.DataFrame(list(twitter_coll_reference.find()))
if twitter_data.empty:
    raise ValueError('The twitter collection returned no documents.')

# Take a subset of the data, dont need all points to convert and this greatly speeds up.
# .copy() makes the subset an independent frame, so the column assignment
# below is not a chained assignment on a slice of twitter_data.
# NOTE(review): only the last 400 documents are considered -- confirm that
# this always covers the full 24 hour window.
twitter_data_subset = twitter_data.tail(400).copy()

# Need to convert the created_at to a time stamp and set to index
twitter_data_subset['created_at'] = pd.to_datetime(twitter_data_subset['created_at'])
twitter_data_subset.index = twitter_data_subset['created_at']

# Create time bounded dataframe
twitter_data = twitter_data_subset[twitter_data_subset['created_at'] >= ten_min_bound]

# Delimit the company list into separate rows: one record per (tweet, company).
# Fields are read by position from itertuples(); position 0 is the index
# (created_at), the remaining positions are columns.
# NOTE(review): positions 1/11/12/13/15 are presumed to be the company list,
# text, follower count, user name and status count -- verify against the
# collection schema, since they depend on the column order.
delimited_twitter_data = [
    {
        'created_at': item[0],
        'company': company,
        'text': item[11],
        'user_followers_count': item[12],
        'user_name': item[13],
        'user_statuses_count': item[15],
    }
    for item in twitter_data.itertuples()
    for company in item[1]
]

# Fail with a clear message instead of a KeyError from set_index below.
if not delimited_twitter_data:
    raise ValueError(
        'No tweets found since {} (UTC); nothing to aggregate.'.format(ten_min_bound))

delimited_twitter_df = pd.DataFrame(delimited_twitter_data).set_index('created_at')

# Create daily data frame: group once per (day, company) and reuse the grouping
daily_groups = delimited_twitter_df.groupby([pd.Grouper(freq="D"), 'company'])
twitter_delimited_daily = daily_groups['text'].count().to_frame('Number_of_Tweets')

# Concatenate the text with a space to not combine words.
twitter_delimited_daily['text'] = daily_groups['text'].apply(' '.join)
# Number of Users
twitter_delimited_daily['Number_of_Users'] = daily_groups['user_name'].nunique()

# Rename Index
twitter_delimited_daily = twitter_delimited_daily.rename_axis(['Time', 'Company'])

In [70]:
# Preview the first two rows of the daily per-company aggregate
twitter_delimited_daily.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Tweets,text,Number_of_Users
Time,Company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-18,AAPL,12,ThinkScript Tutorial: Using Custom Scripts In ...,10
2018-05-18,AMZN,23,@JaviFusco When it hits 2718 they gonna bail l...,22
