In [1]:
import pymongo
import pandas as pd
import numpy as np
import re
import time
import datetime
import nltk
import praw
from datetime import datetime, timedelta
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bson.objectid import ObjectId

In [2]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunkdf = pd.DataFrame(columns = ['id', 'Named Entity', 'Label'])
    currentChunk = []
    currentLabel = []
    for i in chunked:
        if type(i) == nltk.tree.Tree:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
            currentLabel.append(i.label())
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            label = " ".join(currentLabel)
            if namedEntity not in continuousChunkdf['Named Entity']:
                d = {'id': 0, 'Named Entity': namedEntity, 'Label': label}
                continuousChunkdf = continuousChunkdf.append(d, ignore_index = True)
                currentChunk = []
                currentLabel = []
        else:
            continue
    return continuousChunkdf

def mapResults(result, leftNames, rightNames, threadID, threshold):
    result[result < threshold] = 0
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['id'] = threadID
    return matchdf.drop_duplicates(subset = 'Right')

def pullTickers(string):
    dollarTicker = set(re.findall(r"\$\b[A-Z]{1,4}\b",string))
    manualTicker = re.findall(r"\b[A-Z]{2,4}\b",string)
    manualTicker = list(set(manualTicker).difference(notTickers))
    manualTicker = set(['$' + manualTicker for manualTicker in manualTicker])
    combined = manualTicker.union(dollarTicker)
    return combined.intersection(companydf['Manual'])

notTickers = {'DCF', 'IMO', 'CAN', 'MMS', 'ARE', 'CDC', 'NEW', 'LOVE', 'NYC', 'CASH', 'AI', 
'NAV', 'GOOD', 'DD', 'ATH', 'APPS', 'EDIT', 'WOW', 'PCB', 'UNIT', 'TA', 'VG', 'SELF', 'MR',
'RARE', 'ALEX', 'KEY', 'STIM', 'GO', 'SEE', 'CFO', 'CAL', 'REV', 'PE', 'CHI', 'EVE', 'PDT',
'CO', 'EV', 'TTM', 'EOD', 'AT', 'HUGE', 'ES', 'ONE', 'PT', 'CEO', 'ZEN', 'NOW', 'JAN', 'O',
'OR', 'PG', 'ROCK', 'FOUR', 'ONE', 'TWO', 'FIVE', 'SIX', 'NINE', 'TEN', 'ON', 'SU', 'XT',
'WELL', 'NOV', 'MAR', 'JAN', 'FUN', 'NOW', 'VERY', 'USA', 'POST'}

posts = {'fat4io', 'gaszeo', 'gogpfy', 'hhxaym', 'hnegmj', 'ifmc59', 'iq0jbk', 'j9obtn'}

In [3]:
client = MongoClient('localhost', 27017)
db = client.db
threads = db.threads
companies = db.companylist
secret = db.secret.find_one()['Secret']

reddit = praw.Reddit(client_id='-wpcPIbA7bhlpw', client_secret=secret, user_agent='sentiment')
subreddit = reddit.subreddit('wallstreetbets')
commentlist = []

In [4]:
for post in posts:
    thread = reddit.submission(post)
    print(thread.title)
    thread.comments.replace_more()
    for comment in thread.comments.list():
        commentlist.append(comment)

Daily Discussion Thread for September 10, 2020
Daily Discussion Thread - February 28, 2020
Daily Discussion Thread for July 08, 2020
Daily Discussion Thread for April 30, 2020
Daily Discussion Thread for May 22, 2020
Daily Discussion Thread for August 24, 2020
Daily Discussion Thread for October 12, 2020
Daily Discussion Thread for June 29, 2020


In [11]:
'''from praw.models import MoreComments
for post in postlist:    
    for comment in post.list():
        commentlist.append(comment)'''

In [14]:
'''commentlistlen = 0
currentlen = len(commentlist)
while(commentlistlen != currentlen):
    commentlistlen = len(commentlist)
    for comment in commentlist:
        if isinstance(comment, MoreComments):
            commentlist.remove(comment)
    currentlen = len(commentlist)'''

In [9]:
commentDf = pd.DataFrame(columns = ['Inserted Date','Body', 'Score', 'Stocks'])

In [10]:
created = [comment.created_utc for comment in commentlist]
bodies = [comment.body for comment in commentlist]
scores = [comment.score for comment in commentlist]

In [11]:
commentDf['Inserted Date'] = created
commentDf['Body'] = bodies
commentDf['Score'] = scores

In [12]:
commentDf

Unnamed: 0,Inserted Date,Body,Score,Stocks
0,1.599733e+09,My accounts wiped but I'm still waking up this...,89,
1,1.599740e+09,This is all you need to know about NKLA\n\n\nT...,219,
2,1.599742e+09,“Nikola had the truck towed to the top of a hi...,130,
3,1.599744e+09,Markets realized you can cram 3 months worth o...,63,
4,1.599743e+09,"god damn, the comment volume is so low after s...",105,
...,...,...,...,...
28934,1.593452e+09,Depends HOW it lands 🧐,9,
28935,1.593427e+09,Sometimes I think I never should have started ...,32,
28936,1.593428e+09,Puts on Denmark,8,
28937,1.593428e+09,WHADUPPIMPIN,6,


In [13]:
def cleanStrings(string):
    return re.sub("[^a-zA-Z0-9./$:,'&]+", ' ',string) #only include normal string characters
def cleanText(text):
    return re.sub("http[s]?://\S+", ' ', text) #Remove links

In [14]:
commentDf['Body'] = commentDf['Body'].apply(cleanStrings)
commentDf['Body'] = commentDf['Body'].apply(cleanText)

In [15]:
commentDf = commentDf.drop_duplicates(subset = ['Body']).reset_index(drop = True)

In [16]:
commentDf = commentDf.sample(frac = 1)[0:10000]

In [17]:
commentDf = commentDf.reset_index(drop=True)

In [18]:
companydf = pd.DataFrame.from_records(companies.find())

In [19]:
companyNames = companydf['Name'].unique()
companySymbols = companydf['Symbol'].unique()
nameVectorizer = TfidfVectorizer(min_df = 1)
symbolVectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = nameVectorizer.fit_transform(companyNames)
symbolMatrix = symbolVectorizer.fit_transform(companySymbols)

In [20]:
commentDf['Stocks'] = commentDf['Stocks'].astype(object)

In [21]:
for i in range(len(commentDf['Body'])):
    sent = commentDf['Body'][i]
    threadID = i
    chunks = getContinuousChunks(sent)
    chunkdf = chunks.loc[(chunks['Label'] == 'ORGANIZATION') | (chunks['Label'] == 'PERSON')].reset_index()
    stocklist = []

    if len(chunkdf['Label']) > 0:
        nerNameMatrix = nameVectorizer.transform(chunkdf['Named Entity'])
        nerSymbolMatrix = symbolVectorizer.transform(chunkdf['Named Entity'])
        nameResult = cosine_similarity(nerNameMatrix, companyMatrix)
        symbolResult = cosine_similarity(nerSymbolMatrix, symbolMatrix)
        namedf = mapResults(nameResult, chunkdf['Named Entity'], companyNames, threadID, 0.85)
        symboldf = mapResults(symbolResult, chunkdf['Named Entity'], companySymbols, threadID, 1)
        stocklist = list(set(namedf.loc[namedf['Similarity'] > 0.8, 'Right'].append(symboldf.loc[symboldf['Similarity'] > 0.999, 'Right'])))
    commentDf.at[threadID, 'Stocks'] = stocklist

In [26]:
#commentDf = commentDf.drop(commentDf.loc[commentDf['Stocks'].str.len() == 0].index).reset_index()

In [27]:
#commentDf = commentDf.drop(columns = ['index'])

In [22]:
commentDf

Unnamed: 0,Inserted Date,Body,Score,Stocks
0,1.598282e+09,That apple dip was a gift from Tim himself,11,[TIM S.A.]
1,1.588266e+09,tesla fraud,6,[]
2,1.599743e+09,Total lack of angry bears in the thread is bot...,9,[]
3,1.588242e+09,Jay Powell seems to have ignited a new round ...,4,[EMB]
4,1.593455e+09,BA.. ah shit he on x games mode,6,[]
...,...,...,...,...
9995,1.599749e+09,Is Apple fucking kidding me right now,6,[]
9996,1.582924e+09,that last second pump did not look normal,4,[]
9997,1.599755e+09,volume on these dumps too low to make any seri...,5,[]
9998,1.582900e+09,"I found my DOW 25,000 shirt.",7,[]
