# Top Companies Daily Model

In [1]:
############### Initialize ###################

# Basics
from pymongo import MongoClient
import os
import numpy as np
import pandas as pd
import time
import boto3
import io
import warnings
warnings.filterwarnings('ignore')

# NLP
import nltk
import spacy
spacy.load('en')
from nltk.corpus import stopwords
import preprocessor as p

# Model Infrastructure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn import metrics

# Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.naive_bayes import BernoulliNB

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
###################### Bring In Data #######################
#Setup Mongo and create the database and collection
User = os.environ['MONGODB_USER']
password = os.environ['MONGODB_PASS']
IP = os.environ['IP']

client = MongoClient(IP, username=User, password=password)
db = client['stock_tweets']

#Grab references
twitter_coll_reference = db.twitter
iex_coll_reference = db.iex

In [3]:
###################### Build Twitter Data Frames #####################

start_time = time.time()
# Create Data Frame
twitter_data = pd.DataFrame(list(twitter_coll_reference.find()))

# Need to convert the created_at to a time stamp and set to index
twitter_data.index=pd.to_datetime(twitter_data['created_at'])

# Delimited the Company List into separate rows
delimited_twitter_data=[]

for item in twitter_data.itertuples():
    #twitter_dict={}
    for company in item[1]:
        twitter_dict={}
        twitter_dict['created_at']=item[0]
        twitter_dict['company']=company
        twitter_dict['text']=item[11]
        twitter_dict['user_followers_count']=item[12]
        twitter_dict['user_name']=item[13]
        twitter_dict['user_statuses_count']=item[15]
        delimited_twitter_data.append(twitter_dict)

delimited_twitter_df = pd.DataFrame(delimited_twitter_data) 
delimited_twitter_df.set_index('created_at', inplace=True)

# Create hourly data frame
twitter_delimited_hourly = delimited_twitter_df.groupby([pd.Grouper(freq="D"), 'company']).count()['text'].to_frame()
twitter_delimited_hourly.columns = ['Number_of_Tweets']

# Concatenate the text with a space to not combine words.
twitter_delimited_hourly['text']=delimited_twitter_df.groupby([pd.Grouper(freq="D"), 'company'])['text'].apply(lambda x: ' '.join(x))
# Number of Users
twitter_delimited_hourly['Number_of_Users'] = delimited_twitter_df.groupby([pd.Grouper(freq="D"), 'company'])['user_name'].nunique()

# Rename Index
twitter_delimited_hourly = twitter_delimited_hourly.reindex(twitter_delimited_hourly.index.rename(['Time', 'Company']))

print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 31.369733095169067 seconds ---


In [4]:
##################### Build Stock Data Frames ###########################
start_time = time.time()

stock_data = pd.DataFrame(list(iex_coll_reference.find()))

# Need to convert the created_at to a time stamp
stock_data.index=pd.to_datetime(stock_data['latestUpdate'])
stock_data['latestUpdate'] = pd.to_datetime(stock_data['latestUpdate'])
#Group By hourly and stock price
# Need to get the first stock price in teh hour, and then the last to take the difference to see how much change.
stock_delimited_hourly = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="D"), 'Ticker']).first()['latestPrice'].to_frame()
stock_delimited_hourly.columns = ['First_Price']
stock_delimited_hourly['Last_Price'] = stock_data.sort_values('latestUpdate').groupby([pd.Grouper(freq="D"), 'Ticker']).last()['latestPrice']

# Then need to take the difference and turn into a percentage.
stock_delimited_hourly['Price_Percent_Change'] = ((stock_delimited_hourly['Last_Price'] 
                                                   - stock_delimited_hourly['First_Price'])/stock_delimited_hourly['First_Price'])*100

# Need to also show Percent from open price
stock_delimited_hourly['Open_Price'] = stock_data.groupby([pd.Grouper(freq="D"), 'Ticker'])['open'].mean()
stock_delimited_hourly['Price_Percent_Open'] = ((stock_delimited_hourly['Last_Price'] 
                                                 - stock_delimited_hourly['Open_Price'])/stock_delimited_hourly['Open_Price'])*100

# Also include mean volume
stock_delimited_hourly['Mean_Volume'] = stock_data.groupby([pd.Grouper(freq="D"), 'Ticker'])['latestVolume'].mean()

# Classification Labels
stock_delimited_hourly['Price_Change'] = np.where(stock_delimited_hourly['Price_Percent_Change']>=0, 1, 0)
stock_delimited_hourly['Open_Price_Change'] = np.where(stock_delimited_hourly['Price_Percent_Open']>=0, 1, 0)

# Rename the Index
stock_delimited_hourly = stock_delimited_hourly.reindex(stock_delimited_hourly.index.rename(['Time', 'Company']))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 10.226880311965942 seconds ---


In [20]:
######################### Combine Data Frames ##############################
daily_df = pd.concat([twitter_delimited_hourly, stock_delimited_hourly], axis=1, join='inner')

# To flatten after combined everything. 
daily_df.reset_index(inplace=True)
daily_df = daily_df[((daily_df['Company']=='FB') | 
                     (daily_df['Company']=='TSLA') | 
                     (daily_df['Company']=='AMZN'))]
daily_df.head()

Unnamed: 0,Time,Company,Number_of_Tweets,text,Number_of_Users,First_Price,Last_Price,Price_Percent_Change,Open_Price,Price_Percent_Open,Mean_Volume,Price_Change,Open_Price_Change
1,2018-03-12,AMZN,275,Amazon hits $1600 $AMZN Americans reported one...,162,1600.745,1598.39,-0.147119,1592.6,0.363556,4376277.0,0,1
8,2018-03-12,FB,128,RT @stockstreamtv: https://t.co/wpKWA7TkY5 // ...,74,185.46,184.76,-0.37744,185.26,-0.269891,12538170.0,0,0
20,2018-03-12,TSLA,153,$TSLA so nice so obvious... $TSLA bears just g...,125,344.645,345.51,0.250983,328.9,5.050167,7304020.0,1,1
28,2018-03-13,AMZN,531,Commented on $EXPE $FB $MS $AMZN https://t.co/...,347,1614.1,1588.64,-1.57735,1617.326229,-1.773682,3913528.0,0,0
35,2018-03-13,FB,318,Trade: Short $XLV\n\nhttps://t.co/kQ3uA7vvbT ...,166,185.26,181.96,-1.78128,185.497943,-1.907268,9869588.0,0,0


In [21]:
# Clean the Tweets
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.HASHTAG)
def preprocess_tweet(tweet):
    return p.clean(tweet)

# Clean the tweets, by removing special characters
start_time = time.time()
daily_df['Clean_text'] = daily_df['text'].apply(lambda x: preprocess_tweet(x))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

-- Execution time: 1.8520677089691162 seconds ---


In [23]:
# Split Between Outcome and Features
features = daily_df[['Company','Number_of_Tweets', 'Number_of_Users','Mean_Volume','Clean_text']]
classification_price = daily_df['Price_Change']
classification_open = daily_df['Open_Price_Change']
regression_price = daily_df['Price_Percent_Change']
regression_open = daily_df['Price_Percent_Open']

In [24]:
# Want to leverage the Company name so need to create dummy variables. 
cat_feats = ['Company']
features = pd.get_dummies(features, columns=cat_feats, drop_first=True)
features.head()

Unnamed: 0,Number_of_Tweets,Number_of_Users,Mean_Volume,Clean_text,Company_FB,Company_TSLA
1,275,162,4376277.0,Amazon hits $1600 $AMZN Americans reported one...,0,0
8,128,74,12538170.0,: // $FB $AMZN $AAPL $NFLX $GOOGL $MSFT $TWTR ...,1,0
20,153,125,7304020.0,$TSLA so nice so obvious... $TSLA bears just g...,0,1
28,531,347,3913528.0,Commented on $EXPE $FB $MS $AMZN RT : Discreti...,0,0
35,318,166,9869588.0,Trade: Short $XLV Positions: long $FB $SPY $XL...,1,0


In [8]:
# Will create a class to handle this. 
class DataSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, features):
        if self.key=='text':
            return features['Clean_text']
        else:
            return features.loc[:, features.columns != 'Clean_text']

### Classification

In [25]:
# Check is Data is imbalanced
daily_df['Price_Change'].value_counts()

0    76
1    56
Name: Price_Change, dtype: int64

In [26]:
# Split the Data to avoid Leakage
#splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features,classification_price,test_size=0.2)

In [27]:
# Check to see how much variance is being explained

X = X_train['Clean_text']

start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])

vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=0.5, 
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))

tweets_tfidf = vectorizer.fit_transform(X)
print("Vectorizing Finished. Number of features: %d" % tweets_tfidf.get_shape()[1])
pipe = Pipeline(steps=[
                 ('svd', TruncatedSVD(100)),
                 ('norm',Normalizer(copy=False))
                       ])

pipe.fit_transform(tweets_tfidf)
print("Explained Variance: " + str(pipe.get_params()['svd'].explained_variance_ratio_.sum()))
print("-- Execution time: %s seconds ---" % (time.time() - start_time))

Vectorizing Finished. Number of features: 51475
Explained Variance: 0.9966214902817914
-- Execution time: 8.706959247589111 seconds ---


In [28]:
####### Logistic Regression ############

start_time = time.time()
# Create lemmatizer using spacy
lemmatizer = spacy.lang.en.English()

# Define Model
lr_model = LogisticRegression(n_jobs=5)

# Define custom Tokenizer
def custom_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens if not token.is_punct])

# Define Vectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words='english',
                             lowercase=True, use_idf=True, max_df=2,
                             min_df=2, norm='l2', smooth_idf=True, ngram_range=(1, 2))

# Define Pipeline and Feature Union
pipeline = Pipeline([
    # Use Feature Union to combine features from the Tweet and other features gathered
    ('union', FeatureUnion(
        transformer_list=[
            # Pipeline for text
            ('tweet', Pipeline([
                ('selector', DataSelector(key='text')),
                ('vectidf', vectorizer),
                ('svd', TruncatedSVD(100)),
                ('norm',Normalizer(copy=False))
                                
            ])),
            
            # Pipeline for getting other features
            ('other', Pipeline([
                ('seclector', DataSelector(key='other'))
             ])),
        ],
                       
    )),
    # Use Logistic Regression Classifier
    ('lr', lr_model)
])

# Grid Search
parameters = {
                'lr__penalty':['l1'],
                'lr__C':[10],
                'lr__class_weight':['balanced'],
                #'lr__solver':['newton-cg'],
                'union__transformer_weights':[{'tweet':0.7, 'other':0.3}]
               
              }

grid = GridSearchCV(pipeline, parameters, scoring='f1', cv=2, verbose=0, n_jobs=10)

# Fit the grid
grid.fit(X_train, y_train)
# Predictions
y = grid.predict(X_test)
print("-- Execution time: %s seconds ---" % (time.time() - start_time))
print(classification_report(y, y_test))

-- Execution time: 20.969570875167847 seconds ---
             precision    recall  f1-score   support

          0       0.25      0.57      0.35         7
          1       0.73      0.40      0.52        20

avg / total       0.60      0.44      0.47        27

