# Todo List

#### Minimum Requirements

[x] Return top ticker names from a given time interval

[x] Filter comments for tickers

[X] Filter posts for tickers

[x] Remove stop words

#### Recommended

[X] Move API credentials into a separate file

[X] Bot detection

[X] Have threshold for karma for posts

[X] Have threshold for karma for comments

[X] Have threshold for age of Redditor

[X] Handle repeat tickers in the same post/comment instance

[ ] Visualization of ticker names per day (stacked bar chart)

[ ] Visualization of sectors

[ ] Determine percent change of ticker frequency (requires Future To-Do #1)

#### Future

[ ] Write the data to a file so that we can build data over time

[ ] Auto-rerun for live-stream of data

    [ ] Live ticker dashboard

    [ ] Live line graph for popular tickers

    [ ] Correlation between current market performance and ticker mentions

[ ] Sentiment analysis of tickers

### Useful Links

https://praw.readthedocs.io/en/latest/code_overview/models/comment.html?highlight=comment
https://api.pushshift.io/reddit/search/submission/?subreddit=learnpython&sort=desc&sort_type=created_utc&after=1523588521&before=1523934121&size=1000

In [1]:
!pip install praw



In [26]:
import string
import praw
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from datetime import datetime
from collections import defaultdict, OrderedDict
import json

# Get API credentials from local file. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open.
with open('config.json') as file:
    config = json.load(file)

# Initialize praw with the client_id / client_secret / user_agent entries
# read from config.json
reddit = praw.Reddit(client_id=config['client_id'],
                     client_secret=config['client_secret'],
                     user_agent=config['user_agent'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ken\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Extra words (compared in lower case) that the ticker extraction skips in
# addition to the NLTK English stop words. Kept in alphabetical order; the
# empty string is part of the set as well.
moKenFilter = {
    '',
    'ago', 'big', 'buy',
    'cash', 'ceo', 'cfo', 'coo', 'cto',
    'dd',
    'edit', 'ev',
    'gild', 'go', 'good',
    'hold',
    'imo', 'ipo', 'ive',
    'know',
    'lol',
    'man',
    'need', 'new', 'next', 'nice',
    'one',
    'play', 'post', 'pump',
    'real', 'rh',
    'see', 'sub',
    'td', 'term',
    'usa', 'usd',
    'want', 'well',
}

In [28]:
def timeDiff(unixTime, threshold):
    '''
    Check whether a Unix timestamp lies within the last ``threshold`` hours.

    :param input unixTime: seconds since the Unix epoch (read as UTC)
    :param type: int or float

    :param input threshold: maximum allowed age, in hours
    :param type: int or float

    :return: True if time difference within threshold, False otherwise
    :return type: bool
    '''
    # Subtract the datetimes directly; formatting the timestamp to a string and
    # parsing it back only discarded the sub-second part.
    elapsed = datetime.utcnow() - datetime.utcfromtimestamp(unixTime)
    age_hours = elapsed.total_seconds() / 3600
    return age_hours <= threshold

In [29]:
def debugPrint(body, tickersDictGood, tickersDictFair, tickersDictPoor):
    '''
    Print the combined ticker count, then one line per account tier.

    :param input body: label printed in front of the combined total
    :param type: str

    :param input tickersDictXXXX: ticker -> count mapping whose values are summed
    :param type: dict
    '''
    # Sum each tier once and reuse the figures for all four lines.
    good_total, fair_total, poor_total = (
        sum(counts.values())
        for counts in (tickersDictGood, tickersDictFair, tickersDictPoor)
    )
    grand_total = good_total + fair_total + poor_total

    print(body + str(grand_total))
    print("Tickers from good accounts: " + str(good_total))
    print("Tickers from fair accounts: " + str(fair_total))
    print("Tickers from poor accounts: " + str(poor_total))

In [35]:
# Function to extract tickers from dataframe column
def get_ticker_from_col(dictGood, dictFair, dictPoor, data_col, karma_col, age_col):
    '''
    Count the ticker names that appear in a text column, split by author quality.

    A ticker is counted at most once per row. Only words that are already keys
    of dictGood are treated as tickers. The three dictionaries are updated in
    place; nothing is returned.

    :param input dictGood: ticker counts from users with old accs and high karma
    :param type: dict

    :param input dictFair: ticker counts from either old acc OR high karma users
    :param type: dict

    :param input dictPoor: ticker counts from users with new acc and no karma (probably bots)
    :param type: dict

    :param input data_col: text column to be read, one post/comment per row
    :param type: pd.Series

    :param input karma_col: per-row flag, truthy when the author passed the karma check
    :param type: pd.Series

    :param input age_col: per-row flag, truthy when the author passed the age check
    :param type: pd.Series
    '''
    punct_table = str.maketrans(dict.fromkeys(string.punctuation)) # Holds all punctuation
    # Build the ignore set once per call: asking NLTK for the stop-word list on
    # every single word re-creates the list and scans it linearly each time.
    ignored_words = set(stopwords.words('english')).union(moKenFilter)

    for text, karma_ok, age_ok in zip(data_col, karma_col, age_col):
        seen = set() # tickers already counted for this post/comment
        for word in text.split():
            # Remove punctuation; this also turns "$GME" into "GME"
            word = word.translate(punct_table)
            if not word or len(word) > 5 or word.isnumeric():
                continue # empty, too long for a ticker, or a plain number
            if word.lower() in ignored_words:
                continue
            if word in seen or word not in dictGood:
                continue
            seen.add(word)
            if karma_ok and age_ok:
                dictGood[word] += 1
            elif karma_ok or age_ok:
                dictFair[word] += 1
            else:
                dictPoor[word] += 1

In [36]:
def validateAuthorKarma(redditor, KARMA_THRESHOLD):
    '''
    Determine if redditor has more karma than some threshold

    :param input redditor: redditor to check karma of (may be None)
    :param type: praw.redditor

    :param input KARMA_THRESHOLD: threshold to use for karma
    :param type: int

    :return: True if comment karma plus link karma is greater than the
             threshold; False when the redditor or either karma value is missing
    :return type: bool
    '''
    if not redditor:
        return False

    comment_karma = getattr(redditor, 'comment_karma', None)
    link_karma = getattr(redditor, 'link_karma', None)
    # Compare with None explicitly: 0 is a legitimate karma value, and a
    # truthiness test would reject e.g. an author who only ever commented.
    if comment_karma is None or link_karma is None:
        return False
    return comment_karma + link_karma > KARMA_THRESHOLD

In [37]:
def validateAuthorAge(redditor, AGE_THRESHOLD):
    '''
    Determine if redditor is older than some threshold

    :param input redditor: redditor to check age of (may be None)
    :param type: praw.redditor

    :param input AGE_THRESHOLD: minimum account age, in hours
    :param type: int

    :return: True if the account is at least AGE_THRESHOLD hours old; False
             when the redditor or its creation time is missing
    :return type: bool
    '''
    if not redditor:
        return False

    # getattr with a default, so an account object that does not expose a
    # creation time is reported as "not old enough" instead of raising.
    created_utc = getattr(redditor, 'created_utc', None)
    if not created_utc:
        return False

    # Subtract the datetimes directly; the string round-trip was redundant.
    account_age = datetime.utcnow() - datetime.utcfromtimestamp(created_utc)
    return account_age.total_seconds() / 3600 >= AGE_THRESHOLD

In [38]:
# Function to extract top tickers from a subreddit
def extract_tickers(input_subreddit, threshold=168, limit=100, debug=False):
    '''
    Extract the most frequent tickers from a given subreddit for a desired timeframe

    Reads the newest posts of the subreddit, keeps those created within the
    last ``threshold`` hours, collects their comments, and counts ticker
    mentions in comment bodies, post bodies and post titles. Tickers are read
    from the 'Symbol' column of tickers.csv.

    :param input_subreddit: subreddit to be queried
    :param type: str

    :param threshold: how far back (in hours) in the past to retrieve data from
    :param type: int
    :param default: 1 week = 7 days * 24 hours = 168 hours

    :param limit: set upper bound of number of posts to read
    :param type: int

    :param debug: debug flag to print stuff
    :param type: bool

    :return: (good, fair, poor, posts, comments) where good/fair/poor are
             ticker -> count dictionaries sorted by descending count and
             distributed based on account legitimacy, and posts/comments are
             the DataFrames the counts were computed from
    :return type: tuple
    '''
    start = datetime.utcnow()
    if debug:
        print("Starting time: ", start.strftime('%Y-%m-%d %H:%M:%S'))
        print("Running in debug mode with following params: " + str(input_subreddit) + ", "
              + str(threshold) + ", "
              + str(limit))

    # Thresholds for account reliability
    AGE_THRESHOLD = 180 * 24 # number of days * hours in a day
    KARMA_THRESHOLD = 250 # minimum karma

    subreddit = reddit.subreddit(input_subreddit)

    # Gather all posts from subreddit that are within the threshold
    # NOTE(review): the time filter reads post.created, not created_utc - confirm
    # this is the intended field.
    posts = []
    for postsParsed, post in enumerate(subreddit.new(limit=limit), start=1):
        if debug and postsParsed % 100 == 0:
            print("Posts Parsed: ", postsParsed)
        if timeDiff(post.created, threshold):
            posts.append([post.title, post.author, validateAuthorKarma(post.author, KARMA_THRESHOLD),
                          validateAuthorAge(post.author, AGE_THRESHOLD), post.score,
                          post.id, post.subreddit, post.selftext, post.created])
    posts = pd.DataFrame(posts, columns=['title', 'author', 'authorKarmaValid', 'authorAgeValid',
                                         'score', 'id', 'subreddit', 'body', 'created'])

    if debug: print("Number of posts found in timeframe provided: " + str(len(posts)))

    # Gather all comments found in posts above
    comments = []
    index = 0
    for postId in posts['id']:
        submission = reddit.submission(id=postId)
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            index += 1
            if debug and index % 1000 == 0:
                print("Comments parsed: " + str(index))

            author = comment.author
            # Skip AutoModerator and anything that is not a real Redditor
            # object (e.g. None)
            if author == 'AutoModerator' or not isinstance(author, praw.models.reddit.redditor.Redditor):
                continue

            # Best effort, as before: any failure while reading the flag is
            # treated as "not suspended" rather than aborting the whole run.
            # NOTE(review): presumably the attribute is missing on ordinary
            # accounts - confirm and narrow the exception type.
            try:
                suspended = author.is_suspended
            except Exception:
                suspended = False
            if suspended:
                print("Suspended Author Name:", author)
                continue

            author_created = datetime.utcfromtimestamp(author.created_utc).strftime('%Y-%m-%d %H:%M:%S')
            comments.append([comment.score, author,
                             validateAuthorKarma(author, KARMA_THRESHOLD),
                             validateAuthorAge(author, AGE_THRESHOLD),
                             comment.body, comment.created,
                             author_created])
    comments = pd.DataFrame(comments, columns=['score', 'author', 'authorKarmaValid',
                                               'authorAgeValid', 'body', 'created', 'author_created'])

    if debug: print("Number of comments found in timeframe provided: " + str(len(comments)))

    # Set up structure to hold ticker counts, every known symbol starting at 0
    tickers = set(pd.read_csv("tickers.csv")['Symbol'])
    tickersDictGood = dict.fromkeys(tickers, 0) # Users that are legit (both old acc and decent karma)
    tickersDictFair = dict.fromkeys(tickers, 0) # Users that are lurkers or new (either old acc or high karma, not both)
    tickersDictPoor = dict.fromkeys(tickers, 0) # Users that are possibly bots (new acc and no karma)

    # Extract tickers from comments
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor,
                        comments['body'], comments['authorKarmaValid'], comments['authorAgeValid'])
    if debug: debugPrint("Tickers from comments: ", tickersDictGood, tickersDictFair, tickersDictPoor)

    # Extract tickers from post bodies
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor,
                        posts['body'], posts['authorKarmaValid'], posts['authorAgeValid'])
    if debug: debugPrint("Tickers from comments and posts: ", tickersDictGood,
                         tickersDictFair, tickersDictPoor)

    # Extract tickers from post titles
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor,
                        posts['title'], posts['authorKarmaValid'], posts['authorAgeValid'])
    if debug: debugPrint("Tickers from comments, posts, and post titles: ", tickersDictGood,
                         tickersDictFair, tickersDictPoor)
    if debug: print("Time elapsed: ", (datetime.utcnow()-start).total_seconds()/60)

    def by_count(counts):
        # Plain dicts keep insertion order, so sorting the items is enough
        return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

    return (by_count(tickersDictGood),
            by_count(tickersDictFair),
            by_count(tickersDictPoor),
            posts,
            comments,
           )

In [39]:
# extract_tickers returns five values (three count dicts plus the posts and
# comments DataFrames); unpacking into four names raises ValueError.
good, fair, poor, posts, comments = extract_tickers('wallstreetbets', threshold = 7*24, 
                                                     limit = 50, debug = True)

Starting time:  2021-02-17 03:38:23
Running in debug mode with following params: wallstreetbets, 168, 50
Number of posts found in timeframe provided: 50
Comments parsed: 1000
Comments parsed: 2000
Number of comments found in timeframe provided: 2114
Tickers from comments: 190
Tickers from good accounts: 163
Tickers from fair accounts: 25
Tickers from poor accounts: 2
Tickers from comments and posts: 218
Tickers from good accounts: 185
Tickers from fair accounts: 31
Tickers from poor accounts: 2
Tickers from comments, posts, and post titles: 239
Tickers from good accounts: 202
Tickers from fair accounts: 33
Tickers from poor accounts: 4
Time elapsed:  5.849374249999999


In [41]:
# Show the ten first (ticker, count) pairs of the "good accounts" dictionary.
# NOTE(review): pennystocks_good is not assigned in any cell shown here -
# presumably left over from an earlier extract_tickers run; confirm.
list(pennystocks_good.items())[:10]

[('GME', 39),
 ('AMC', 10),
 ('RIOT', 9),
 ('BP', 6),
 ('YOLO', 6),
 ('MVIS', 6),
 ('DNN', 5),
 ('AMD', 5),
 ('EPS', 4),
 ('CAT', 4)]

In [42]:
# Show the ten first (ticker, count) pairs of the "fair accounts" dictionary.
# NOTE(review): pennystocks_fair is not assigned in any cell shown here -
# presumably left over from an earlier extract_tickers run; confirm.
list(pennystocks_fair.items())[:10]

[('GME', 4),
 ('AMD', 2),
 ('MO', 2),
 ('ARKW', 1),
 ('TEAM', 1),
 ('MPC', 1),
 ('BAK', 1),
 ('EPS', 1),
 ('DNN', 1),
 ('RIOT', 1)]

In [43]:
# Show the ten first (ticker, count) pairs of the "poor accounts" dictionary.
# NOTE(review): pennystocks_poor is not assigned in any cell shown here -
# presumably left over from an earlier extract_tickers run; confirm.
list(pennystocks_poor.items())[:10]

[('AMD', 1),
 ('B', 1),
 ('YOLO', 1),
 ('GME', 1),
 ('GER', 0),
 ('CI', 0),
 ('AMED', 0),
 ('JPMF', 0),
 ('PBSM', 0),
 ('IEZ', 0)]

## Reference Code

def dateDiff(listOfUnixTime, threshold):
    '''
    Compute the age of each Unix timestamp, flagging the ones that are too old.

    :param listOfUnixTime: iterable of Unix timestamps (seconds, read as UTC)
    :param threshold: age limit in seconds
    :return: list with one entry per input - the age in whole seconds if it is
             below the threshold, otherwise -1
    '''
    # Take "now" once so every item is measured against the same instant
    # (the previous code referenced an undefined name here).
    now = datetime.utcnow()
    dateDiffs = []
    for item in listOfUnixTime:
        # total_seconds() includes the days component; .seconds alone wraps
        # around every 24 hours and would make old items look recent.
        age = int((now - datetime.utcfromtimestamp(item)).total_seconds())
        # Clamp timestamps slightly in the future so a negative age can never
        # be mistaken for the -1 marker.
        age = max(age, 0)
        dateDiffs.append(-1 if age >= threshold else age)
    return dateDiffs

# Load the known symbols from the 'Symbol' column of tickers.csv and start
# every counter at zero.
tickers = set(pd.read_csv("tickers.csv")['Symbol'])
tickersDict = dict.fromkeys(tickers, 0)

# Pull the 100 newest r/wallstreetbets posts straight into a DataFrame
wsb_subreddit = reddit.subreddit('wallstreetbets')
posts = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit,
         post.url, post.num_comments, post.selftext, post.created]
        for post in wsb_subreddit.new(limit=100)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)

# Drop every row that dateDiff flagged with -1 for the one-day threshold
dayInSeconds = 86400
postThreshold = dayInSeconds
posts['timeDiff'] = dateDiff(posts['created'], postThreshold)
posts = posts[posts['timeDiff'] > -1]

#set another threshold for post karma

# Collect the already-loaded comments of every remaining post, leaving out
# anything written by AutoModerator
comments = []
for postId in posts['id']:
    submission = reddit.submission(id=postId)
    submission.comments.replace_more(limit=0)
    comments.extend(
        [comment.score, comment.author, comment.body, comment.created]
        for comment in submission.comments.list()
        if comment.author != 'AutoModerator'
    )

commentsDF = pd.DataFrame(comments, columns=['score','author','body','created'])
commentsDF['timeDiff'] = dateDiff(commentsDF['created'], postThreshold)

# letTheBodiesHitTheFloor
punct_table = str.maketrans(dict.fromkeys(string.punctuation)) # Holds all punctuation
# Build the ignore set once: asking NLTK for the stop-word list on every word
# re-creates the list and scans it linearly each time.
ignored_words = set(stopwords.words('english')).union(moKenFilter)
for body in commentsDF['body']:
    for word in body.split():
        # Remove punctuation; this also turns "$GME" into "GME"
        word = word.translate(punct_table)
        if not word or len(word) > 5 or word.isnumeric():
            continue # empty, too long for a ticker, or a plain number
        if word.lower() in ignored_words:
            continue
        if word in tickersDict:
            tickersDict[word] += 1

# Plain dicts keep insertion order, so sorting the items by count is enough
tickersSorted = dict(sorted(tickersDict.items(), key=lambda t: t[1], reverse=True))