# Hourly Model Development

Primary Evaluation Metric: AUROC
Satisficing Metrics: F1 Score

In [45]:
############### Initialize ###################

# Basics
from pymongo import MongoClient
import os
import numpy as np
import pandas as pd
import time
import boto3
import io
import warnings
warnings.filterwarnings('ignore')

# NLP
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

# Model Infrastructure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, roc_curve, auc


# Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor

In [2]:
###################### Bring In Data #######################
#Setup Mongo and create the database and collection
User = os.environ['MONGODB_USER']
password = os.environ['MONGODB_PASS']
IP = os.environ['IP']

client = MongoClient(IP, username=User, password=password)
db = client['stock_tweets']

#Grab references
twitter_coll_reference = db.twitter
iex_coll_reference = db.iex

In [3]:
###################### Build Twitter Data Frames #####################

start_time = time.time()
# Create Data Frame
twitter_data = pd.DataFrame(list(twitter_coll_reference.find()))

# Need to convert the created_at to a time stamp and set to index
twitter_data.index=pd.to_datetime(twitter_data['created_at'])

# Delimited the Company List into separate rows
delimited_twitter_data=[]

for item in twitter_data.itertuples():
    #twitter_dict={}
    for company in item[1]:
        twitter_dict={}
        twitter_dict['created_at']=item[0]
        twitter_dict['company']=company
        twitter_dict['text']=item[11]
        twitter_dict['user_followers_count']=item[12]
        twitter_dict['user_name']=item[13]
        twitter_dict['user_statuses_count']=item[15]
        delimited_twitter_data.append(twitter_dict)

delimited_twitter_df = pd.DataFrame(delimited_twitter_data) 
delimited_twitter_df.set_index('created_at', inplace=True)

# Create hourly data frame
twitter_delimited_hourly = delimited_twitter_df.groupby([pd.Grouper(freq="H"), 'company']).count()['text'].to_frame()
twitter_delimited_hourly.columns = ['Number_of_Tweets']

# Concatenate the text with a space to not combine words.
twitter_delimited_hourly['text']=delimited_twitter_df.groupby([pd.Grouper(freq="H"), 'company'])['text'].apply(lambda x: ' '.join(x))
# Number of Users
twitter_delimited_hourly['Number_of_Users'] = delimited_twitter_df.groupby([pd.Grouper(freq="H"), 'company'])['user_name'].nunique()

# Rename Index
twitter_delimited_hourly = twitter_delimited_hourly.reindex(twitter_delimited_hourly.index.rename(['Time', 'Company']))

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 32.234617710113525 seconds ---


In [4]:
##################### Build Stock Data Frames ###########################
start_time = time.time()

stock_data = pd.DataFrame(list(iex_coll_reference.find()))

# Need to convert the created_at to a time stamp
stock_data.index=pd.to_datetime(stock_data['latestUpdate'])
stock_data['latestUpdate'] = pd.to_datetime(stock_data['latestUpdate'])
#Group By hourly and stock price
# Need to get the first stock price in teh hour, and then the last to take the difference to see how much change.
stock_delimited_hourly = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="H"), 'Ticker']).first()['latestPrice'].to_frame()
stock_delimited_hourly.columns = ['First_Price']
stock_delimited_hourly['Last_Price'] = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="H"), 'Ticker']).last()['latestPrice']

# Then need to take the difference and turn into a percentage.
stock_delimited_hourly['Price_Percent_Change'] = ((stock_delimited_hourly['Last_Price'] 
                                                   - stock_delimited_hourly['First_Price'])/stock_delimited_hourly['First_Price'])*100

# Need to also show Percent from open price
stock_delimited_hourly['Open_Price'] = stock_data.groupby([pd.Grouper(freq="H"), 'Ticker'])['open'].mean()
stock_delimited_hourly['Price_Percent_Open'] = ((stock_delimited_hourly['Last_Price'] 
                                                 - stock_delimited_hourly['Open_Price'])/stock_delimited_hourly['Open_Price'])*100

# Also include mean volume
stock_delimited_hourly['Mean_Volume'] = stock_data.groupby([pd.Grouper(freq="H"), 'Ticker'])['latestVolume'].mean()

# Classification Labels
stock_delimited_hourly['Price_Change'] = np.where(stock_delimited_hourly['Price_Percent_Change']>=0, 1, 0)
stock_delimited_hourly['Open_Price_Change'] = np.where(stock_delimited_hourly['Price_Percent_Open']>=0, 1, 0)

# Rename the Index
stock_delimited_hourly = stock_delimited_hourly.reindex(stock_delimited_hourly.index.rename(['Time', 'Company']))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 14.78155779838562 seconds ---


In [5]:
######################### Combine Data Frames ##############################
hourly_df = pd.concat([twitter_delimited_hourly, stock_delimited_hourly], axis=1, join='inner')

# To flatten after combined everything. 
hourly_df.reset_index(inplace=True)
hourly_df.head()

Unnamed: 0,Time,Company,Number_of_Tweets,text,Number_of_Users,First_Price,Last_Price,Price_Percent_Change,Open_Price,Price_Percent_Open,Mean_Volume,Price_Change,Open_Price_Change
0,2018-03-12 18:00:00,AAPL,94,@JoKiddo But how proprietary is that? Does it ...,83,181.73,181.69,-0.022011,180.23,0.810076,23456930.0,0,1
1,2018-03-12 18:00:00,AMZN,103,Amazon hits $1600 $AMZN Americans reported one...,61,1600.745,1597.725,-0.188662,1592.6,0.321801,3729830.0,0,1
2,2018-03-12 18:00:00,BA,25,"Thus, its cheaper for $AAPL to built than to b...",20,345.91,344.7,-0.349802,355.02,-2.906878,4380395.0,0,0
3,2018-03-12 18:00:00,BABA,10,"Thus, its cheaper for $AAPL to built than to b...",8,192.9,192.93,0.015552,192.0,0.484375,14057520.0,1,1
4,2018-03-12 18:00:00,BAC,5,Open an account with @RobinhoodApp and get a s...,3,32.98,32.87,-0.333535,32.67,0.612182,37167100.0,0,1


In [6]:
# Clean the Tweets
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.HASHTAG)
def preprocess_tweet(tweet):
    return p.clean(tweet)

# Clean the tweets, by removing special characters
start_time = time.time()
hourly_df['Clean_text'] = hourly_df['text'].apply(lambda x: preprocess_tweet(x))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 4.5234694480896 seconds ---


In [7]:
hourly_df.head()

Unnamed: 0,Time,Company,Number_of_Tweets,text,Number_of_Users,First_Price,Last_Price,Price_Percent_Change,Open_Price,Price_Percent_Open,Mean_Volume,Price_Change,Open_Price_Change,Clean_text
0,2018-03-12 18:00:00,AAPL,94,@JoKiddo But how proprietary is that? Does it ...,83,181.73,181.69,-0.022011,180.23,0.810076,23456930.0,0,1,But how proprietary is that? Does it really ma...
1,2018-03-12 18:00:00,AMZN,103,Amazon hits $1600 $AMZN Americans reported one...,61,1600.745,1597.725,-0.188662,1592.6,0.321801,3729830.0,0,1,Amazon hits $1600 $AMZN Americans reported one...
2,2018-03-12 18:00:00,BA,25,"Thus, its cheaper for $AAPL to built than to b...",20,345.91,344.7,-0.349802,355.02,-2.906878,4380395.0,0,0,"Thus, its cheaper for $AAPL to built than to b..."
3,2018-03-12 18:00:00,BABA,10,"Thus, its cheaper for $AAPL to built than to b...",8,192.9,192.93,0.015552,192.0,0.484375,14057520.0,1,1,"Thus, its cheaper for $AAPL to built than to b..."
4,2018-03-12 18:00:00,BAC,5,Open an account with @RobinhoodApp and get a s...,3,32.98,32.87,-0.333535,32.67,0.612182,37167100.0,0,1,Open an account with and get a stock like $HPQ...


In [36]:
# Split Between Outcome and Features
features = hourly_df[['Number_of_Tweets', 'Number_of_Users','Mean_Volume','Clean_text']]
classification_price = hourly_df['Price_Change']
classification_open = hourly_df['Open_Price_Change']
regression_price = hourly_df['Price_Percent_Change']
regression_open = hourly_df['Price_Percent_Open']

### Classification

#### Price Change

In [9]:
# Check is Data is imbalanced
hourly_df['Price_Change'].value_counts()

0    3766
1    3690
Name: Price_Change, dtype: int64

In [15]:
# Split the Data to avoid Leakage
#splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features,classification_price,test_size=0.2)

In [26]:
####### Old Way ########
# Was separating the data transformation from the mode because I used to do it this way when there is a lot of data. Since there is not, can do it
# in one step
start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])

vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=2, max_features=1000,
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))


train_tfidf = vectorizer.fit(X_train['Clean_text'])
tweets_train = train_tfidf.transform(X_train['Clean_text'])
tweets_test = train_tfidf.transform(X_test['Clean_text'])

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 44.77558612823486 seconds ---


In [27]:
################ Logistic Regression ##########################
start_time = time.time()
parameters = {
                'penalty':['l1','l2'],
                'C':[0.1, 0.001, 1, 10, 100],
                'class_weight':['balanced']
               
              }

lr = LogisticRegression()

grid = GridSearchCV(lr, parameters, scoring='accuracy', cv=3, verbose=0)
#Fit the Data
grid.fit(tweets_train, y_train)
y = grid.predict(tweets_test)
print(classification_report(y, y_test))
print(grid.score(tweets_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

             precision    recall  f1-score   support

          0       0.03      0.53      0.07        51
          1       0.97      0.48      0.64      1441

avg / total       0.93      0.48      0.62      1492

0.4839142091152815
-- Execution time: 0.24537062644958496 seconds ---


In [28]:
grid.best_params_

{'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}

In [None]:
###################### XGB #############################
start_time = time.time()
xgb_model = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', scale_pos_weight=1, seed=27) 
 

parameters = {'n_jobs':[-1],
             'max_depth':range(3,10,2),
             'min_child_weight':range(1,6,2)}

clf = GridSearchCV(xgb_model, parameters,cv=3, verbose=0,n_jobs=1)
clf.fit(tweets_train, y_train)
print(clf.score(tweets_test, y_test))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

In [11]:
################# New Way ##################
# Will create pipeline that combines all the steps and then apply the model at the end.
# Since all the features are apart of the features dataframe, for the NLP only need the clean text, but still want to add other things.

# Will create a class to handle this. 
class DataSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, features):
        if self.key=='text':
            return features['Clean_text']
        else:
            return features[['Number_of_Tweets', 'Number_of_Users','Mean_Volume']]
        


In [30]:
####### Logistic Regression ############

start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

# Define Model
lr_model = LogisticRegression()

# Define custom Tokenizer
def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])

# Define Vectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=2, max_features=1000,
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))

# Define Pipeline and Feature Union
pipeline = Pipeline([
    # Use Feature Union to combine features from the Tweet and other features gathered
    ('union', FeatureUnion(
        transformer_list=[
            # Pipeline for text
            ('tweet', Pipeline([
                ('selector', DataSelector(key='text')),
                ('vectidf', vectorizer)
                                
            ])),
            
            # Pipeline for getting other features
            ('other', Pipeline([
                ('seclector', DataSelector(key='other'))
             ])),
        ],
                       
    )),
    # Use Logistic Regression Classifier
    ('lr', lr_model)
])

# Grid Search
parameters = {
                'lr__penalty':['l1'],
                'lr__C':[10],
                'lr__class_weight':['balanced'],
                'union__transformer_weights':[{'tweet':0.5, 'other':0.5},{'tweet':0.2, 'other':0.8},{'tweet':0.8, 'other':0.2}]
               
              }

grid = GridSearchCV(pipeline, parameters, scoring='accuracy', cv=3, verbose=0, n_jobs=5)

# Fit the grid
grid.fit(X_train, y_train)
# Predictions
y = grid.predict(X_test)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))
print(classification_report(y, y_test))

-- Execution time: 95.41935133934021 seconds ---
             precision    recall  f1-score   support

          0       0.32      0.53      0.40       478
          1       0.68      0.49      0.57      1014

avg / total       0.57      0.50      0.51      1492



In [31]:
grid.best_params_

{'lr__C': 10,
 'lr__class_weight': 'balanced',
 'lr__penalty': 'l1',
 'union__transformer_weights': {'other': 0.8, 'tweet': 0.2}}

In [32]:
grid.best_score_

0.505700871898055

### Open Price

In [37]:
# Check is Data is imbalanced
hourly_df['Open_Price_Change'].value_counts()

0    3807
1    3649
Name: Open_Price_Change, dtype: int64

In [47]:
# Split the Data to avoid Leakage
#splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features,classification_open,test_size=0.2)

In [50]:
####### Logistic Regression ############

start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

# Define Model
lr_model = LogisticRegression()


# Define custom Tokenizer
def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])


# Define Vectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=2, max_features=1000,
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))

# Define Pipeline and Feature Union
pipeline = Pipeline([
    # Use Feature Union to combine features from the Tweet and other features gathered
    ('union', FeatureUnion(
        transformer_list=[
            # Pipeline for text
            ('tweet', Pipeline([
                ('selector', DataSelector(key='text')),
                ('vectidf', vectorizer),
                #('svd', TruncatedSVD(500)),
                #('norm',Normalizer(copy=False))
                                
            ])),
            
            # Pipeline for getting other features
            ('other', Pipeline([
                ('seclector', DataSelector(key='other'))
             ])),
            
        ],
                       
    )),
    # Use Logistic Regression Classifier
    ('lr', lr_model)
])


# Grid Search
parameters = {
                'lr__penalty':['l1'],
                'lr__C':[10],
                'lr__class_weight':['balanced'],
                'union__transformer_weights':[{'tweet':0.5, 'other':0.5},{'tweet':0.2, 'other':0.8},{'tweet':0.8, 'other':0.2}]
               
              }


grid = GridSearchCV(pipeline, parameters, scoring='roc_auc', cv=2, verbose=0, n_jobs=5)

# Fit the grid
grid.fit(X_train, y_train)
# Predictions
y = grid.predict(X_test)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))
print(classification_report(y, y_test))

prediction_proba = grid.predict_proba(X_test)
prediction_proba = [p[1] for p in prediction_proba]
print(roc_auc_score(y_test, prediction_proba))

-- Execution time: 86.12768578529358 seconds ---
             precision    recall  f1-score   support

          0       0.36      0.53      0.43       512
          1       0.68      0.52      0.59       980

avg / total       0.57      0.52      0.53      1492

0.5454491547352078


In [51]:
grid.best_score_

0.5661916236066759

In [52]:
####### XGB ############
warnings.filterwarnings('ignore')
start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

# Define Model
xgb_model = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', scale_pos_weight=1, seed=27) 
 


# Define custom Tokenizer
def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])

# Define Vectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=2, max_features=1000,
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))

# Define Pipeline and Feature Union
pipeline = Pipeline([
    # Use Feature Union to combine features from the Tweet and other features gathered
    ('union', FeatureUnion(
        transformer_list=[
            # Pipeline for text
            ('tweet', Pipeline([
                ('selector', DataSelector(key='text')),
                ('vectidf', vectorizer)
                                
            ])),
            
            # Pipeline for getting other features
            ('other', Pipeline([
                ('seclector', DataSelector(key='other'))
             ])),
        ],
                       
    )),
    # Use Logistic Regression Classifier
    ('xgb', xgb_model)
])



# Grid Search
parameters = {
             'xgb__n_jobs':[-1],
             #'xgb__max_depth':range(3,10,2),
             #'xgb__min_child_weight':range(1,6,2),
             'union__transformer_weights':[{'tweet':0.5, 'other':0.5},{'tweet':0.2, 'other':0.8},{'tweet':0.8, 'other':0.2}]
               
              }

grid = GridSearchCV(pipeline, parameters, scoring='accuracy', cv=2, verbose=0, n_jobs=1)

# Fit the grid
grid.fit(X_train, y_train)
# Predictions
y = grid.predict(X_test)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))
print(classification_report(y, y_test))

-- Execution time: 203.15611171722412 seconds ---
             precision    recall  f1-score   support

          0       0.50      0.49      0.49       762
          1       0.48      0.49      0.48       730

avg / total       0.49      0.49      0.49      1492



## Regression