# Todo List

#### Minimum Requirements

[x] Return top ticker names from a given time interval

[x] Filter comments for tickers

[X] Filter posts for tickers

[x] Remove stop words

#### Recommended

[X] Separate API credentials to separate file

[X] Bot detection

[X] Have threshold for karma for posts

[X] Have threshold for karma for comments

[ ] Visualization of tickers name per day

[ ] Visualization of sectors

[ ] Determine percent change of ticker frequency (requires Future To-Do #1)

#### Future

[ ] Write the data to a file so that we can build data over time

[ ] Auto-rerun for live-stream of data

[ ] Sentimental analysis of tickers

### Useful Links

https://praw.readthedocs.io/en/latest/code_overview/models/comment.html?highlight=comment

In [None]:
!pip install praw

In [1]:
import string
import praw
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from datetime import datetime
from collections import defaultdict, OrderedDict
import json

# Get API credentials from local file
file = open('config.json')
config = json.load(file) 

# Initialize praw
reddit = praw.Reddit(client_id = config['client_id'],
                     client_secret = config['client_secret'],
                     user_agent = config['user_agent'])

[nltk_data] Downloading package stopwords to /Users/mo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
moKenFilter = {'rh','dd', 'ev', 'usd', 'lol', 'td', 'ceo', 'cto', 'coo', 'cfo', 'ipo',
              'gild', 'buy', 'imo', 'usa', 'good', 'one', 'go', 'know', 'see', 'well', 'new',
              'hold', 'want', 'need', 'next', 'post', 'play', 'sub', 'big', 'ive', 'term', 'real',
              'man', 'ago', 'cash', 'nice', 'pump', ''}

In [3]:
def timeDiff (unixTime, threshold):
    '''
    Calculate time difference between current time and input time.
    
    :return: True if time difference within threshold, False otherwise
    :return type: bool
    '''
    now = datetime.utcnow()
    timestamp = datetime.utcfromtimestamp(unixTime).strftime('%Y-%m-%d %H:%M:%S')
    timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    triangle = (now - timestamp).seconds
    if triangle <= (threshold * 3600):
        return True
    return False

In [26]:
def debugPrint(body, tickersDictGood, tickersDictFair, tickersDictPoor):
    '''
    Debug printing function to print out ticker values
    
    :param input body: the message to be printed initially
    :param type: str
    
    :param input tickersDictXXXX: the dataframe to print values from
    :param type: dict
    '''
    print(body + str(sum(tickersDictGood.values()) + 
                               sum(tickersDictFair.values()) +
                               sum(tickersDictPoor.values())
          ))
    print("Tickers from good accounts: " + str(sum(tickersDictGood.values())))
    print("Tickers from fair accounts: " + str(sum(tickersDictFair.values())))
    print("Tickers from poor accounts: " + str(sum(tickersDictPoor.values())))

In [23]:
# Function to extract tickers from dataframe column
def get_ticker_from_col(dictGood, dictFair, dictPoor, data_col, karma_col, age_col):
    '''
    Extract the ticker names that appear in a specific dataframe column
    
    :param input dictGood: ticker counts from users with old accs and high karma
    :param type: dict
    
    :param input dictGood: ticker counts from either old acc OR high karma users
    :param type: dict
    
    :param input dictGood: ticker counts from users with new acc and no karma (probably bots)
    :param type: dict
    
    :param input df_col: dataframe column to be read
    :param type: pd.DataFrame (single column only)
    '''
    # Filter comments for all tickers that show up
    punct_table = str.maketrans(dict.fromkeys(string.punctuation)) # Holds all punctuation
            
    for i in range(len(data_col)):
        for word in data_col.iloc[i].split():
            word = word.translate(punct_table) # Remove punctuation 
            if word == "": # if word is empty
                continue
            elif len(word) > 5: # if word is clearly not a ticker
                continue
            elif word.lower() in stopwords.words('english') or word.lower() in moKenFilter:
                continue
            elif word.isnumeric():
                continue
            elif word in dictGood: #handle if ticker starts with $
                if karma_col.iloc[i] and age_col.iloc[i]:
                    dictGood[word] += 1
                elif karma_col.iloc[i] or age_col.iloc[i]:
                    dictFair[word] += 1
                else:
                    dictPoor[word] += 1

In [19]:
def validateAuthorKarma(redditor, KARMA_THRESHOLD):
    '''
    Determine if redditor has more karma than some threshold
    
    :param input redditor: redditor to check age of
    :param type: praw.redditor
    
    :param input KARMA_THRESHOLD: threshold to use for karma
    :param type: int'''
    if not redditor or not redditor.comment_karma or not redditor.link_karma:
        return False
    return redditor.comment_karma + redditor.link_karma > KARMA_THRESHOLD

In [20]:
def validateAuthorAge(redditor, AGE_THRESHOLD):
    '''
    Determine if redditor is older than some threshold
    
    :param input redditor: redditor to check age of
    :param type: praw.redditor
    
    :param input AGE_THRESHOLD: threshold to use for age
    :param type: int'''
    if not redditor or not redditor.created_utc: return False
    return timeDiff(redditor.created_utc, AGE_THRESHOLD)

In [27]:
# Function to extract top tickers from
def extract_tickers(input_subreddit, threshold=3600, limit=100, debug=False):
    '''
    Extract the most frequent tickers from a given subreddit for a desired timeframe
    
    :param input_subreddit: subreddit to be queried
    :param type: str
    
    :param threshold: how far back (in seconds) in the past to retrieve data from
    :param type: int
    
    :param limit: set upper bound of number of posts to read
    :param type: int
    
    :param debug: debug flag to print stuff
    :param type: bool
    
    :return: sorted dictionaries for ticker appearances in a subreddit
             distributed based on account legitimacy
    :return type: dict
    '''
    if debug: print("Running in debug mode with following params: " + str(input_subreddit) + ", " 
                                                                    + str(threshold) + ", "
                                                                    + str(limit))
    # Thresholds for account reliability
    AGE_THRESHOLD = 180 * 24 # number of days * hours in a day (gets converted to seconds in timeDiff function)
    KARMA_THRESHOLD = 250 # minimum karma
    
    subreddit = reddit.subreddit(input_subreddit)
    
    # Gather all posts from subreddit that are within the threshold
    posts = []
    comments = []
    for post in subreddit.new(limit=limit):
        if timeDiff(post.created, threshold):
            posts.append([post.title, post.author, validateAuthorKarma(post.author, KARMA_THRESHOLD), 
                          validateAuthorAge(post.author, AGE_THRESHOLD), post.score, 
                          post.id, post.subreddit, post.selftext, post.created])
        else: break
    posts = pd.DataFrame(posts,columns=['title', 'author', 'authorKarmaValid', 'authorAgeValid', 
                                        'score', 'id', 'subreddit', 'body', 'created'])
    
    if debug: print("Number of posts found in timeframe provided: " + str(len(posts)))
        
    # Gather all comments found in posts above
    comments = []
    for postId in posts['id']:
        submission = reddit.submission(id=postId)
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            if comment.author != 'AutoModerator':
                comments.append([comment.score, comment.author, 
                                 validateAuthorKarma(comment.author, KARMA_THRESHOLD),
                                 validateAuthorAge(comment.author, AGE_THRESHOLD), 
                                 comment.body, comment.created])
    comments = pd.DataFrame(comments, columns=['score', 'author', 'authorKarmaValid', 
                                               'authorAgeValid', 'body','created'])
    
    if debug: print("Number of comments found in timeframe provided: " + str(len(comments)))
        
    # Set up structure to hold ticker counts
    tickers = pd.read_csv("tickers.csv") 
    tickers = set(tickers['Symbol'])
    tickersDictGood = dict() # Users that are legit (both old acc and decent karma)
    tickersDictFair = dict() # Users that are lurkers or new (either old acc or high karma, not both)
    tickersDictPoor = dict() # Users that are possibly bots (new acc and no karma)
    for ticker in tickers:
        tickersDictGood[ticker] = 0
        tickersDictFair[ticker] = 0
        tickersDictPoor[ticker] = 0
    
    # Extract tickers from comments
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor, 
                        comments['body'], comments['authorKarmaValid'], comments['authorAgeValid'])
    if debug: debugPrint("Tickers from comments: ", tickersDictGood, tickersDictFair, tickersDictPoor)
        
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor, 
                        posts['body'], posts['authorKarmaValid'], posts['authorAgeValid'])
    if debug: debugPrint("Tickers from comments and posts: ", tickersDictGood, 
                         tickersDictFair, tickersDictPoor)
        
    get_ticker_from_col(tickersDictGood, tickersDictFair, tickersDictPoor, 
                        posts['title'], posts['authorKarmaValid'], posts['authorAgeValid'])
    if debug: debugPrint("Tickers from comments, posts, and post titles: ", tickersDictGood, 
                         tickersDictFair, tickersDictPoor)
        
    return (dict(OrderedDict(sorted(tickersDictGood.items(), key = lambda t: t[1] ,reverse=True))), 
            dict(OrderedDict(sorted(tickersDictFair.items(), key = lambda t: t[1] ,reverse=True))), 
            dict(OrderedDict(sorted(tickersDictPoor.items(), key = lambda t: t[1] ,reverse=True)))
           )

In [32]:
pennystocks_good, pennystocks_fair, pennystocks_poor = extract_tickers('pennystocks', 
                                                                               threshold = 18 * 3600, 
                                                                               limit = 100,
                                                                               debug = True)

Running in debug mode with following params: pennystocks, 64800, 100
Number of posts found in timeframe provided: 100
Number of comments found in timeframe provided: 2266
Tickers from all comments: 334
Tickers from good accounts: 256
Tickers from fair accounts: 78
Tickers from poor accounts: 0
Tickers from all comments and posts: 480
Tickers from good accounts: 393
Tickers from fair accounts: 87
Tickers from poor accounts: 0
Tickers from all comments and posts and post titles: 502
Tickers from good accounts: 413
Tickers from fair accounts: 89
Tickers from poor accounts: 0


In [34]:
pennystocks_fair

{'PKG': 12,
 'PD': 6,
 'CDC': 5,
 'B': 3,
 'PS': 3,
 'ATOS': 3,
 'ZOM': 2,
 'CTRM': 2,
 'CBD': 2,
 'GEVO': 2,
 'FPX': 2,
 'DLNG': 1,
 'SYN': 1,
 'NVCN': 1,
 'UAMY': 1,
 'FNF': 1,
 'PT': 1,
 'CIDM': 1,
 'BBQ': 1,
 'BPY': 1,
 'DFFN': 1,
 'AZN': 1,
 'FLTR': 1,
 'DNN': 1,
 'ASYS': 1,
 'GTE': 1,
 'RIOT': 1,
 'FIS': 1,
 'PK': 1,
 'NAT': 1,
 'ETH': 1,
 'MJ': 1,
 'AQMS': 1,
 'TSLA': 1,
 'BNGO': 1,
 'DEF': 1,
 'BAM': 1,
 'XSPA': 1,
 'CY': 1,
 'SSL': 1,
 'AMC': 1,
 'EDIT': 1,
 'SNGX': 1,
 'MARA': 1,
 'ASRT': 1,
 'MXE': 1,
 'BMO': 1,
 'OBSV': 1,
 'KOD': 1,
 'IP': 1,
 'TBLT': 1,
 'OCGN': 1,
 'AI': 1,
 'SAVA': 1,
 'FAF': 1,
 'MSFT': 1,
 'STAR': 1,
 'MN': 1,
 'CATS': 0,
 'JJSF': 0,
 'HJLI': 0,
 'GNPX': 0,
 'PAG': 0,
 'APRN': 0,
 'CHIX': 0,
 'SFUN': 0,
 'FNV': 0,
 'TMUS': 0,
 'DBS': 0,
 'JMUB': 0,
 'HURN': 0,
 'IHI': 0,
 'FLWS': 0,
 'VTWV': 0,
 'JO': 0,
 'UNF': 0,
 'ECOL': 0,
 'AUSF': 0,
 'DVP': 0,
 'FDVV': 0,
 'AMPY': 0,
 'MMLP': 0,
 'JCI': 0,
 'CMRE': 0,
 'AVT': 0,
 'ANY': 0,
 'SMIN': 0,
 'MSN': 0,

## Reference Code

In [None]:
# returns a list of time difference of seconds if within threshold return delta, otherwise -1
def dateDiff(listOfUnixTime, threshold):
    dateDiffs = []
    for item in listOfUnixTime:
        dt = datetime.utcfromtimestamp(item).strftime('%Y-%m-%d %H:%M:%S')
        date_time_obj = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
        triangle = (ts - date_time_obj).seconds
        if( triangle >= threshold ):
            dateDiffs.append(-1)
        else:
            dateDiffs.append(triangle)
    return dateDiffs

In [None]:
tickers = pd.read_csv("tickers.csv") 
tickers = set(tickers['Symbol'])
tickersDict = dict()
for ticker in tickers:
    tickersDict[ticker] = 0

In [None]:
wsb_subreddit = reddit.subreddit('wallstreetbets')
posts = []
for post in wsb_subreddit.new(limit=100):
    posts.append([post.title, post.score, post.id, post.subreddit, 
                  post.url, post.num_comments, post.selftext, post.created])

In [None]:
posts = pd.DataFrame(posts,columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
dayInSeconds = 86400
postThreshold = dayInSeconds
posts['timeDiff'] = dateDiff(posts['created'], postThreshold)
posts = posts[posts.timeDiff > -1]

#set another threshold for post karma

In [None]:
comments = []
for postId in posts['id']:
    submission = reddit.submission(id=postId)
    submission.comments.replace_more(limit=0)
    for comment in submission.comments.list():
        if comment.author != 'AutoModerator':
            comments.append([ comment.score, comment.author, comment.body, comment.created])

commentsDF = pd.DataFrame(comments, columns=['score','author','body','created'])
commentsDF['timeDiff'] = dateDiff(commentsDF['created'], postThreshold)

In [None]:
# letTheBodiesHitTheFloor
punct_table = str.maketrans(dict.fromkeys(string.punctuation)) # Holds all punctuation
for body in commentsDF['body']:
    for word in body.split():
        word = word.translate(punct_table) # Remove punctuation 
        if word == "": # if word is empty
            continue
        elif len(word) > 5: # if word is clearly not a ticker
            continue
        elif word.lower() in stopwords.words('english') or word.lower() in moKenFilter:
            continue
        elif word.isnumeric():
            continue
        elif word in tickersDict: #handle if ticker starts with $
            tickersDict[word] += 1

In [None]:
tickersSorted = dict(OrderedDict(sorted(tickersDict.items(), key = lambda t: t[1] ,reverse=True)))