## Live Data Collection

Pre-load these packages on your device in order to run this code

In [1]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [41]:
#Load packages from tweepy
import tweepy as tp

#Load package from snscrape to scrape twitters frontend
import snscrape.modules.twitter as sntwitter

#NLP Modules
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Genism module
import gensim

#Load additional packages
import datetime
import time
import numpy as np
import pandas as pd
import requests
from collections import Counter
import itertools
import re
import sys
from retry import retry
import os
import dotenv

pd.set_option('display.max_colwidth', None)

### Tweey API Setup

This step must be repeated for each new key:value pair you'd like to store as local varaible. For the purpose of this repository, only four are required. Your twitter api key, api secret key, access token and access token secret. 

Below is an empty dictionary for you to copy paste your twitter api keys into

In [3]:
#key = 'value'
#twtr_api_key = ''
#twtr_api_secret_key = ''
#twtr_bearer_token = ''
#twtr_access_token = ''
#twtr_access_token_secret = ''

Copy paste each key value pair into the code below and run it for each key, value pair in order to save these as local environment variables

In [4]:
#Look for .env file and assign file path
dotenv_file = dotenv.find_dotenv()
#Load the .env file for editing
dotenv.load_dotenv(dotenv_file)
#Overwrite the os.environ dict with new key:value pair
os.environ['key'] = 'value'
#Update the new dict 
dotenv.set_key(dotenv_file,'key',os.environ['key'])

(True, 'key', 'value')

Set api and secret key variables for use in this script

In [5]:
api_key = os.environ['twtr_api_key']
api_secret_key = os.environ['twtr_api_secret_key']
bearer_token = os.environ['twtr_bearer_token']
access_token = os.environ['twtr_access_token']
access_token_secret = os.environ['twtr_access_token_secret']

Authorize api keys in order to use api

In [6]:
#Authorization using api keys
auth = tp.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tp.API(auth)

### Define functions

Function to loop through a list of twitter account handles and pull their last num_posts using the tweepy API

In [28]:
#Initilization
array = [[]]

#Pass in any series list of twitter account names
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_top_tweets(s,num_posts):
    for screen_name in s:
        try:
            #Only pull the last __ number of tweets from each account 
            for tweet in api.user_timeline(screen_name = screen_name, count = num_posts):
                
                #Initialize temp lists as empty
                data = []
                
                #Append tweet metadata to temp lists
                data.append(f'{screen_name}')
                data.append(tweet.created_at) 
                data.append(tweet.text)
                data.append(tweet.retweet_count)
                data.append(tweet.favorite_count)
                
                #Append lists to array
                array.append(data)
                
                #Reset temp lists to empty
                data = []
        
        #Pass over an account name if it is no longer active
        except tp.TweepError:
            time.sleep(60*15)
            continue
    return array

Function to convert a list of items into a single string

In [29]:
#Define a function to convert list items to a single string joined by semicolon 
def list_to_string(x):
    lis = x
    string = ''
    string = " ".join(lis)
    return string

Function to tokenize, lemmatize and remove stopwords from the tweet corpus. Custom stopwords can be added in order to remove additional noise

In [30]:
def process_tweet(df):
    tokenizer = RegexpTokenizer('\w+|\$[\.]\S+')
    lem = WordNetLemmatizer()
    STOPWORDS = set(stopwords.words('english'))
    custom_stopwords = ['co','http','The','RT','I','day','We','This','ha','like','A','amp','If','year',
                      'morning','since','1','2','3','4','5','6','7','8','9','That','It','right','know','Here']
    custom_stop = set(custom_stopwords)
    df['processed_tweets'] = df['tweet'].apply(tokenizer.tokenize)
    df['processed_tweets'] = df['processed_tweets'].apply(lambda row: list([lem.lemmatize(i) for i in row]))
    df['processed_tweets'] = df['processed_tweets'].apply(lambda x:[i for i in x if i not in STOPWORDS] )
    df['processed_tweets'] = df['processed_tweets'].apply(lambda x:[i for i in x if i not in custom_stop] )

Function to scrape twitters front end using a list of keywords,start and end date. This function uses the snscrape module. 

In [31]:
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_relevant_tweets(search_words,num_queries, start_date, end_date):
    
    #Intitilize empty list 
    tweets_list = [] 

    #For loop to go through each of the relevant keyword 
    for word in search_words:

        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'{word} since:{start_date} until:{end_date}').get_items()):
            if i>num_queries:
                break
            tweets_list.append([tweet.id,tweet.username,tweet.date,tweet.content,word])
    
    return tweets_list

Function to scrate twitters front end using account name. This function can go back in time as far as needed as it is nto limited by tweepy's 7 day historical api limit. 

In [32]:
@retry(tries=10,delay=2,backoff=4,max_delay=42)
def get_historical_tweets(s,num_queries):
    
    #Intitilize empty list 
    tweets_list = [] 

    #For loop to go through each of the relevant keyword 
    for screen_name in s:

        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(f'from:{screen_name}').get_items()):
            if i>num_queries:
                break
            tweets_list.append([tweet.id,tweet.username,tweet.date,tweet.content])
    
    return tweets_list

Define a list of the top twitter accounts you would like to scrape. The list has been pre-populated with popular investment accounts from primentn investment professionals. 

In [33]:
#List of top twitter investmenet accounts
top_accounts = ['elonmusk','chamath','fundstrat','elerianm','CNBC','PeterLBrandt','SJosephBurns','IBDinvestors',
               'TheStalwart','jimcramer','bespokeinvest','steve_hanke','MarketWatch','wallstreetbets','WSBChairman']
s = pd.Series(top_accounts)

In [34]:
#Pull the follower count for each account and create a dictionary with this key value pair
follower_count = []
top_account_dic = {}
for account in s:
    follower_count.append(api.get_user(account).followers_count)

top_account_dic = {k:v for k,v in zip(s,follower_count)}

Load the word2vec model from the assets folder. Ensure to replace file path with your own path. 

In [35]:
# Start timer.
t0 = time.time()

# Import word vectors into "model."
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/noahz/Desktop/Desktop/GA/project-6-capstone/assets/lexvec.commoncrawl.300d.W.pos.vectors')

# Print results of timer.
print(time.time() - t0)

423.77575492858887


## Data collection and transformation

Tweepy API Scrape

In [36]:
#Run the get top tweets function for historical tweets 
t0 = time.time()
array = get_top_tweets(s,10)
print(time.time() - t0)

df_1 = pd.DataFrame(array,columns=['screen_name','created_at','tweet','retweets','likes'])
df_1.dropna(inplace = True)

#Remove any dduplicate values in case accounts have not posted anything new
df_1.drop_duplicates(inplace=True)
df_1.reset_index(drop=True,inplace=True)

#Pre-process the tweets
process_tweet(df_1)

5.617655038833618


### Keyword analysis

In [68]:
#Pullout list of most frequent words 
words=[]
for x in df_1['processed_tweets']:
    words += x

words = [i.lower() for i in words]
    
words_dic_sorted = dict(sorted(Counter(words).items(), key=lambda item: item[1],reverse=True))
words_dic_sorted_filt = {k: v for k, v in words_dic_sorted.items() if v > 5}

#Assign most frequently occuring keywords (greater than 100 occurences) to a new list named key words 
key_words = []
for key,value in words_dic_sorted_filt.items():
    key_words.append(key)

#Using the word2vec model, create a list of the most similar 3 words to every key words on the keywords list
similar_words = []

for i in key_words: 
    try:
        x = model.most_similar(i, topn=3)
        for word in range(len(x)):
            similar_words.append(x[word][0])
    except:
        continue

#Remove duplicated by assigning the similar words list to a set
similar_words_set = set(similar_words)

  return (m / dist).astype(REAL)


### Extract all of the @s in order to build a list of accounts to conduct a 2nd layer network twitter scrape

In [38]:
#Extract mentions to use in 2nd level network search
matches = []
for word in df_1['tweet']:
    i = re.findall(r"\s([@][\w_-]+)",word,re.MULTILINE)
    if i:
        matches.append(i)

merged = list(itertools.chain(*matches))
s_ats = set(merged)
s_ats_list = list(s_ats)
s_ats_list = [x.replace('@','') for x in s_ats_list]

Use the new list of accounts to conduct an additional front end scrape using snscrape to pull the last 500 posts for every account in the 2nd layer network

In [39]:
#Reset tweet list to empty
tweets_list = []

t0 = time.time()
tweets_list = get_top_tweets(s_ats_list,10)
print(time.time() - t0)

#Put the tweets list pull into a dataframe
df_2 = pd.DataFrame(tweets_list, columns=['screen_name','created_at','tweet','retweets','likes'])

df_2.dropna(inplace = True)

#Remove any dduplicate values in case accounts have not posted anything new
df_2.drop_duplicates(inplace=True)
df_2.reset_index(drop=True,inplace=True)

#Pre-process the tweets
process_tweet(df_2)

18.15925407409668


### Second layer content scrape using word2vec similar words and keywords
Combine the key words and word2vec similar words to scrape twitters front end yesterday and today

In [69]:
#Pull in today and yesterdays date 
end_date = datetime.date.today()
start_date = end_date - datetime.timedelta(days = 1) 
end_date_str = end_date.strftime("%Y-%-m-%d")
start_date_str = start_date.strftime("%Y-%-m-%d")

In [70]:
#Combine key words, similar words from word2vec 
search_words = list(set(key_words + list(similar_words_set)))

#Reset tweet list to empty
tweets_list = []

#Call the get relevant tweets function to query all twitter posts for content matching our word2vec keywords
tweets_list = get_relevant_tweets(search_words = search_words, num_queries=50, 
                                  start_date = start_date_str, end_date = end_date_str)

#Put the tweets list pull into a dataframe
df_3 = pd.DataFrame(tweets_list, columns=['id','screen_name','created_at','tweet','key_word'])

#Pre-process the tweets
process_tweet(df_3)

Combine all of the scraped dataframes into a single master dataframe named tweets

In [80]:
#Process each dataframe to be combined
df_3['retweets'] = 0
df_3['likes'] = 0
df_3.drop(['key_word'],axis=1,inplace=True)
df_3.drop(['id'],axis=1,inplace=True)
df_3.drop(['processed_tweets'],axis=1,inplace=True)
df_2.drop(['processed_tweets'],axis=1,inplace=True)
df_1.drop(['processed_tweets'],axis=1,inplace=True)


#Combine all dataframes into a single one
tweets = df_1.append(df_2).append(df_3)
tweets.drop_duplicates(inplace=True)

KeyError: "['key_word'] not found in axis"

Populate a column in the master dataframe with the account follower values from the created dictionary

In [84]:
follower_count_dic = {}

for screen_name in tweets['screen_name'].unique():
    try:
        user = api.get_user(screen_name) 
        followers_count = user.followers_count
        time.sleep(0.001)
        if followers_count > 10000:
            follower_count_dic[screen_name] = followers_count
        else:
            continue
    except:
        continue
        
tweets['followers'] = tweets['screen_name'].map(follower_count_dic)

Extract and isolate any mention of a stock symbol within a tweet

In [85]:
#Breakout the full tweets into a list of lematized and tokenized words
process_tweet(tweets)

#Extract mentions of stock symbols into a standardized new column using regex
tweets['stocks'] = tweets['tweet'].str.extract('([$][A-Za-z][\S]*)')
tweets['stocks'] = tweets['stocks'].str.replace(r'[^A-Za-z0-9]+', '')
tweets['stocks'] = tweets['stocks'].str.lower()

Export processed data to a csv in the data folder

In [86]:
tweets.to_csv(f'data/twitter_live_{end_date}.csv')