## Historical Data Collection

Download these NLTK corpora on your machine once before running this notebook (uncomment the lines below on the first run)

In [112]:
#nltk.download('stopwords')
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/noahz/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
#Load packages from tweepy
import tweepy as tp

#Load package from snscrape to scrape twitters frontend
import snscrape.modules.twitter as sntwitter

#NLP Modules
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Genism module
import gensim

#Load additional packages
import time
import numpy as np
import pandas as pd
import requests
from collections import Counter
import itertools
import re
import sys
from retry import retry
import os
import dotenv

#Show the full contents of wide dataframe cells instead of truncating them
pd.set_option('display.max_colwidth', None)

### Tweepy API Setup

This step must be repeated for each new key:value pair you'd like to store as a local environment variable. The Twitter authorization itself uses four values: your Twitter API key, API secret key, access token and access token secret. The credentials cell further down also reads `twtr_bearer_token` and `alpha_apikey`, so store those as well.

Below is a blank template listing the variable names; copy your API keys into it

In [2]:
#key = 'value'
#twtr_api_key = ''
#twtr_api_secret_key = ''
#twtr_bearer_token = ''
#twtr_access_token = ''
#twtr_access_token_secret = ''
#alpha_apikey = ''

Copy each key:value pair into the code below and run the cell once per pair in order to save them to your local `.env` file as environment variables

In [7]:
#Look for .env file and assign file path
dotenv_file = dotenv.find_dotenv()
#Load the .env file for editing
dotenv.load_dotenv(dotenv_file)
#Overwrite the os.environ dict with new key:value pair
os.environ['key'] = 'value'
#Update the new dict 
dotenv.set_key(dotenv_file,'key',os.environ['key'])

UnboundLocalError: local variable 'dest' referenced before assignment

Set api and secret key variables for use in this script

In [4]:
#Read the Twitter and Alpha Vantage credentials from the environment in one pass.
#Indexing os.environ raises KeyError if any of the variables has not been set.
(api_key,
 api_secret_key,
 bearer_token,
 access_token,
 access_token_secret,
 apikey) = (os.environ[env_name] for env_name in ('twtr_api_key',
                                                  'twtr_api_secret_key',
                                                  'twtr_bearer_token',
                                                  'twtr_access_token',
                                                  'twtr_access_token_secret',
                                                  'alpha_apikey'))

Authorize api keys in order to use api

In [5]:
#Authorization using api keys
#Authorization using api keys
#Build the tweepy auth handler from the api key and api secret key read from the environment
auth = tp.OAuthHandler(api_key, api_secret_key)
#Attach the access token and access token secret to the handler
auth.set_access_token(access_token, access_token_secret)
#Authenticated tweepy client; the functions and cells below call it as `api`
api = tp.API(auth)

### Stock name and symbol data loading

Import historical stock symbol and name data from csv assets in the data folder

In [7]:
#Each source file is reduced to the shared Symbol / Name layout and tagged with a
#`source` label so rows can be traced back to the list they came from.

#Import S&P 500 stocks
s_p = (
    pd.read_csv('data/constituents_csv.csv')
    .drop(columns=['Sector'])
    .assign(source='s_p')
)

#Import nasdaq stocks
nasdaq = (
    pd.read_csv('data/nasdaq-listed_csv.csv')
    .drop(columns=['Security Name','Market Category','Test Issue','Financial Status','Round Lot Size'])
    .rename(columns={'Company Name':'Name'})
    .assign(source='nasdaq')
)

#Import nyse stocks
nyse = (
    pd.read_csv('data/nyse-listed_csv.csv')
    .rename(columns={'ACT Symbol':'Symbol','Company Name':'Name'})
    .assign(source='nyse')
)

#Import tsx stocks
tsx = (
    pd.read_csv('data/TSX.txt',sep='\t')
    .rename(columns={'Description':'Name'})
    .assign(source='tsx')
)

#Import tsxv stocks
tsxv = (
    pd.read_csv('data/TSXV.txt',sep='\t')
    .rename(columns={'Description':'Name'})
    .assign(source='tsxv')
)

#Import nyse list 2 stocks (tagged 'nyse' as well, so symbols already in the first nyse list de-duplicate below)
nyse_2 = (
    pd.read_csv('data/NYSE.txt',sep='\t')
    .rename(columns={'Description':'Name'})
    .assign(source='nyse')
)

#Stack all lists into a single frame and remove any duplicates.
#pd.concat replaces the chained DataFrame.append calls (append was removed in pandas 2.0).
stock_names = pd.concat([s_p, nasdaq, nyse, tsx, tsxv, nyse_2], ignore_index=True)
#A symbol is only a duplicate when it comes from the same source, hence the combined id
stock_names['id'] = stock_names['Symbol'] + stock_names['source']
stock_names = (
    stock_names
    .drop_duplicates(subset='id')
    .dropna()
    .reset_index(drop=True)
)

### Pull historical stock information for GME

Grab GME price data from Jan 1st, 2021 to March 5th, 2021

In [None]:
#Request the full daily price history for GME from Alpha Vantage
base_url = "https://www.alphavantage.co/query"
req_av = requests.get(base_url,
                      params={'function':'TIME_SERIES_DAILY',
                              'symbol':'GME',
                              'apikey':apikey,
                              'outputsize':'full'},
                      timeout=30)
#Stop here on an HTTP error instead of failing later on a missing key
req_av.raise_for_status()
gme = req_av.json()
#Guard against payloads without the price data (e.g. an error or rate-limit message)
if 'Time Series (Daily)' not in gme:
    raise ValueError(f'Alpha Vantage response has no daily time series; keys returned: {list(gme)}')

#The JSON maps date -> {field: value}; transpose so that dates become the rows
df = pd.DataFrame(gme['Time Series (Daily)']).T
df.index = pd.to_datetime(df.index)
df = df.sort_index()
#Keep 2021 only. .loc is required for partial-date row selection (df['2021'] was removed
#in pandas 2.0) and .copy() makes the column assignments below act on an independent frame.
df = df.loc['2021'].copy()

#The API returns every value as a string; convert the price and volume columns to floats
price_columns = ['1. open','2. high','3. low','4. close','5. volume']
for column in price_columns:
    df[column] = pd.to_numeric(df[column], downcast="float")

#Daily closing price on a continuous calendar; days without a quote
#(weekends, holidays) take the value of the nearest trading day
s = df['4. close']
idx = pd.date_range('01-01-2021', '03-05-2021')
s = s.reindex(idx,method='nearest')

### Define functions

Function to loop through a list of twitter account handles and pull their last num_posts using the tweepy API

In [10]:
#Initilization
array = [[]]

#Pass in any series list of twitter account names
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_top_tweets(s,num_posts):
    for screen_name in s:
        try:
            #Only pull the last __ number of tweets from each account 
            for tweet in api.user_timeline(screen_name = screen_name, count = num_posts):
                
                #Initialize temp lists as empty
                data = []
                hashtag_list = []
                
                #Append tweet metadata to temp lists
                data.append(f'{screen_name}')
                data.append(tweet.created_at) 
                data.append(tweet.text)
                data.append(tweet.retweet_count)
                data.append(tweet.favorite_count)
                if len(tweet.entities.get('hashtags'))>0:
                    ht = [tweet.entities.get('hashtags')[x]['text'] for x in range(0,len(tweet.entities.get('hashtags')))]
                    hashtag_list.append(ht)
                data.append(hashtag_list)
                
                #Append lists to array
                array.append(data)
                
                #Reset temp lists to empty
                data = []
                hashtag_list = []
        
        #Pass over an account name if it is no longer active
        except tp.TweepError:
            time.sleep(60*15)
            continue
    return array

Function to convert a list of items into a single string

In [11]:
#Define a function to convert list items to a single string joined by semicolon 
#Define a function to convert list items to a single string joined by a separator (a single space by default)
def list_to_string(x, sep=" "):
    """Join the string items of ``x`` into one string.

    Parameters
    ----------
    x : iterable of str
        Items to join.
    sep : str, optional
        Text placed between the items. Defaults to a single space; pass ``';'``
        to produce the semicolon-delimited form that ``hts_extract`` splits on.

    Returns
    -------
    str
        The joined string (``''`` for an empty ``x``).
    """
    return sep.join(x)

Function to extract the hashtags from the collected dataframe

In [12]:
def hts_extract(df):
    """Count how often each hashtag occurs in ``df['hashtags']``.

    Blank entries (``''``) are ignored, entries holding several hashtags
    separated by ``';'`` are split apart, and every hashtag is lowercased
    before counting.

    Returns a dict mapping hashtag -> number of mentions, ordered from the
    most to the least frequent.
    """
    #Keep only the non-blank entries of the hashtags column
    filled_entries = [entry for entry in df['hashtags'].tolist() if not entry == '']

    #Break multi-hashtag entries into individual hashtags
    single_tags = []
    for entry in filled_entries:
        single_tags.extend(entry.split(';') if ';' in entry else [entry])

    #Normalise case so that 'GME' and 'gme' are counted together
    lowered_tags = [tag.lower() for tag in single_tags]

    #most_common() orders the (hashtag, count) pairs from highest to lowest count
    return dict(Counter(lowered_tags).most_common())

Function to tokenize, lemmatize and remove stopwords from the tweet corpus. Custom stopwords can be added in order to remove additional noise

In [13]:
def process_tweet(df, extra_stopwords=None):
    """Tokenize, lemmatize and stopword-filter ``df['tweet']`` in place.

    Writes the result to a new ``processed_tweets`` column holding one list of
    tokens per tweet. ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet`` column of strings.
    extra_stopwords : iterable of str, optional
        Additional tokens to remove on top of the NLTK English stopwords and
        the built-in custom list below.
    """
    #Raw string: same pattern as before, without the invalid '\w', '\$', '\.' and '\S' string escapes.
    #NOTE(review): the second alternative only matches tokens that start with '$.', so a
    #cashtag such as '$GME' is tokenized as 'GME' - confirm whether that is intended.
    tokenizer = RegexpTokenizer(r'\w+|\$[\.]\S+')
    lem = WordNetLemmatizer()

    #The comparison is case-sensitive, hence the capitalised entries in the custom list
    custom_stopwords = ['co','http','The','RT','I','day','We','This','ha','like','A','amp','If','year',
                      'morning','since','1','2','3','4','5','6','7','8','9','That','It','right','know','Here']
    stop_tokens = set(stopwords.words('english')) | set(custom_stopwords)
    if extra_stopwords:
        stop_tokens |= set(extra_stopwords)

    def _clean(text):
        #Tokenize, lemmatize, then drop stopwords - one pass per tweet
        lemmas = (lem.lemmatize(token) for token in tokenizer.tokenize(text))
        return [lemma for lemma in lemmas if lemma not in stop_tokens]

    df['processed_tweets'] = df['tweet'].apply(_clean)

Function to scrape Twitter's front end using a list of keywords, a start date and an end date. This function uses the snscrape module. 

In [15]:
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_relevant_tweets(search_words,num_queries, start_date, end_date):
    """Scrape tweets matching each keyword inside a date window using snscrape.

    Parameters
    ----------
    search_words : iterable of str
        Keywords; one search is run per keyword.
    num_queries : int
        Maximum number of tweets collected per keyword.
    start_date, end_date : str
        Inserted into the search query as ``since:`` and ``until:``.

    Returns
    -------
    list of list
        One row per tweet: ``[id, username, date, content, keyword]``.
    """
    tweets_list = []

    #A negative limit behaves like zero
    per_word_limit = max(num_queries, 0)

    for word in search_words:
        scraper = sntwitter.TwitterSearchScraper(f'{word} since:{start_date} until:{end_date}')
        #islice stops after exactly per_word_limit tweets; the previous `i>num_queries`
        #check let one extra tweet through for every keyword
        for tweet in itertools.islice(scraper.get_items(), per_word_limit):
            tweets_list.append([tweet.id,tweet.username,tweet.date,tweet.content,word])

    return tweets_list

Function to scrape Twitter's front end using account names. This function can go back in time as far as needed, as it is not limited by tweepy's 7 day historical api limit. 

In [16]:
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_historical_tweets(s,num_queries):
    """Scrape the most recent tweets of each account using snscrape.

    Parameters
    ----------
    s : iterable of str
        Twitter screen names; one ``from:<screen_name>`` search is run per account.
    num_queries : int
        Maximum number of tweets collected per account.

    Returns
    -------
    list of list
        One row per tweet: ``[id, username, date, content]``.
    """
    tweets_list = []

    #A negative limit behaves like zero
    per_account_limit = max(num_queries, 0)

    for screen_name in s:
        scraper = sntwitter.TwitterSearchScraper(f'from:{screen_name}')
        #islice stops after exactly per_account_limit tweets; the previous `i>num_queries`
        #check let one extra tweet through for every account
        for tweet in itertools.islice(scraper.get_items(), per_account_limit):
            tweets_list.append([tweet.id,tweet.username,tweet.date,tweet.content])

    return tweets_list

Define a list of the top twitter accounts you would like to scrape. The list has been pre-populated with popular investment accounts from prominent investment professionals. 

In [8]:
#List of top twitter investmenet accounts
top_accounts = ['elonmusk','chamath','fundstrat','elerianm','CNBC','PeterLBrandt','SJosephBurns','IBDinvestors',
               'TheStalwart','jimcramer','bespokeinvest','steve_hanke','MarketWatch','wallstreetbets','WSBChairman']
s = pd.Series(top_accounts)

In [9]:
#Pull the follower count for each account and create a dictionary with this key value pair
follower_count = []
top_account_dic = {}
for account in s:
    follower_count.append(api.get_user(account).followers_count)

top_account_dic = {k:v for k,v in zip(s,follower_count)}

Load the word2vec model from the assets folder. Be sure to replace the file path with your own path. 

In [17]:
# Start timer.
t0 = time.time()

# Import word vectors into "model."
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/noahz/Desktop/Desktop/GA/project-6-capstone/assets/lexvec.commoncrawl.300d.W.pos.vectors')

# Print results of timer.
print(time.time() - t0)

396.2274160385132


## Data collection and transformation

SNS scrape pull used to get historical data for model training. 500 posts for every account. 

In [18]:
#Reset tweet list to empty
tweets_list = []

t0 = time.time()

#Set the number of posts per account to be 500
tweets_list = get_historical_tweets(s,500)
print(time.time() - t0)

#Put the tweets list pull into a dataframe
df_sns_1 = pd.DataFrame(tweets_list, columns=['id','screen_name','created_at','tweet'])
df_sns_1.dropna(inplace = True)
df_sns_1.drop_duplicates(inplace=True)
df_sns_1.reset_index(drop=True,inplace=True)

#Profcess the tweet text
process_tweet(df_sns_1)

69.44915318489075


### Keyword analysis

In [19]:
#Pullout list of most frequent words 
words=[]
for x in df_sns_1['processed_tweets']:
    words += x

words = [i.lower() for i in words]
    
words_dic_sorted = dict(sorted(Counter(words).items(), key=lambda item: item[1],reverse=True))
words_dic_sorted_filt = {k: v for k, v in words_dic_sorted.items() if v > 100}

#Assign most frequently occuring keywords (greater than 100 occurences) to a new list named key words 
key_words = []
for key,value in words_dic_sorted_filt.items():
    key_words.append(key)

#Using the word2vec model, create a list of the most similar 3 words to every key words on the keywords list
similar_words = []

for i in key_words: 
    try:
        x = model.most_similar(i, topn=3)
        for word in range(len(x)):
            similar_words.append(x[word][0])
    except:
        continue

#Remove duplicated by assigning the similar words list to a set
similar_words_set = set(similar_words)

### Extract all of the @s in order to build a list of accounts to conduct a 2nd layer network twitter scrape

In [22]:
#Extract mentions to use in 2nd level network search
matches = []
for word in df_sns_1['tweet']:
    i = re.findall(r"\s([@][\w_-]+)",word,re.MULTILINE)
    if i:
        matches.append(i)

merged = list(itertools.chain(*matches))
s_ats = set(merged)
s_ats_list = list(s_ats)
s_ats_list = [x.replace('@','') for x in s_ats_list]

Use the new list of accounts to conduct an additional front end scrape using snscrape to pull the last 500 posts for every account in the 2nd layer network

In [23]:
#Reset tweet list to empty
tweets_list = []

t0 = time.time()
tweets_list = get_historical_tweets(s_ats_list,500)
print(time.time() - t0)

#Put the tweets list pull into a dataframe
df_sns_2 = pd.DataFrame(tweets_list, columns=['id','screen_name','created_at','tweet'])
df_sns_2.dropna(inplace = True)
df_sns_2.drop_duplicates(inplace=True)
df_sns_2.reset_index(drop=True,inplace=True)

Error retrieving https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_composer_source=true&include_ext_alt_text=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweets=true&q=from%3APrestonPysh&tweet_search_mode=live&count=100&query_source=spelling_expansion_revert_click&cursor=scroll%3AthGAVUV0VFVBaAgLP14IT47SUWgMCwsZa_7_olEnEVnPl5FYCJehgHREVGQVVMVDUBFQAVAAA%3D&pc=1&spelling_corrections=1&ext=mediaStats%252CcameraMoment: ReadTimeout(ReadTimeoutError("HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=10)")), retrying


4846.928550004959


### Second layer content scrape using word2vec similar words and keywords
Combine the key words and word2vec similar words to scrape Twitter's front end between Jan 1, 2021 and March 5, 2021 

In [25]:
#Combine key words, similar words from word2vec and top hashtags + top_hts_list when doing live pull
search_words = list(set(key_words + list(similar_words_set)))

#Reset tweet list to empty
tweets_list = []

#Call the get relevant tweets function to query all twitter posts for content matching our word2vec keywords
tweets_list = get_relevant_tweets(search_words = search_words, num_queries=50, 
                                  start_date = '2021-1-01', end_date = '2021-3-05')

#Put the tweets list pull into a dataframe
df_sns_3 = pd.DataFrame(tweets_list, columns=['id','screen_name','created_at','tweet','key_word'])

### Final data pre-processing

Combine all of the scraped dataframes into a single master dataframe named tweets

In [29]:
#Process each dataframe to be combined
df_sns_1.drop(['processed_tweets'],axis=1,inplace=True)
df_sns_3.drop(['key_word'],axis=1,inplace=True)

#Combine all dataframes into a single one
tweets = df_sns_1.append(df_sns_2).append(df_sns_3)
tweets.drop_duplicates(inplace=True)

Populate a column in the master dataframe with the account follower values from the created dictionary

In [33]:
#Map screen name -> follower count, for accounts with more than 10,000 followers only
follower_count_dic = {}

for screen_name in tweets['screen_name'].unique():
    try:
        followers_count = api.get_user(screen_name).followers_count
    except tp.TweepError:
        #The lookup failed for this account: leave it unmapped and move on.
        #Only tweepy errors are skipped; anything else (including KeyboardInterrupt)
        #now surfaces instead of being silently swallowed.
        continue
    if followers_count > 10000:
        follower_count_dic[screen_name] = followers_count

#Accounts missing from the dictionary get NaN in the followers column
tweets['followers'] = tweets['screen_name'].map(follower_count_dic)

NameError: name 'df' is not defined

Extract and isolate any mention of a stock symbol within a tweet

In [43]:
#Breakout the full tweets into a list of lematized and tokenized words
process_tweet(tweets)

#Extract mentions of stock symbols into a standardized new column using regex
tweets['stocks'] = tweets['tweet'].str.extract('([$][A-Za-z][\S]*)')
tweets['stocks'] = tweets['stocks'].str.replace(r'[^A-Za-z0-9]+', '')
tweets['stocks'] = tweets['stocks'].str.lower()

### Data export

Export processed data to a csv in the data folder

In [62]:
#Write the combined tweet dataframe to the data folder (the dataframe index is written as the first column)
tweets.to_csv('data/twitter_historical_05.03.21.csv')
#Write the merged stock symbol/name table to the data folder
stock_names.to_csv('data/stock_names.csv')