In [1]:
import pandas as pd
import yfinance as yf
import pymongo
import time
import datetime
import re
import numpy as np
import nltk

from datetime import datetime, timedelta
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Pulling stock tickers

In [2]:
def getContinuousChunks(text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuousChunkdf = pd.DataFrame(columns = ['id', 'Named Entity', 'Label'])
    currentChunk = []
    currentLabel = []
    for i in chunked:
        if type(i) == nltk.tree.Tree:
            currentChunk.append(" ".join([token for token, pos in i.leaves()]))
            currentLabel.append(i.label())
        if currentChunk:
            namedEntity = " ".join(currentChunk)
            label = " ".join(currentLabel)
            if namedEntity not in continuousChunkdf['Named Entity']:
                d = {'id': 0, 'Named Entity': namedEntity, 'Label': label}
                continuousChunkdf = continuousChunkdf.append(d, ignore_index = True)
                currentChunk = []
                currentLabel = []
        else:
            continue
    return continuousChunkdf

def mapResults(result, leftNames, rightNames, threadID, threshold):
    result[result < threshold] = 0
    matchdf = pd.DataFrame(0, index = np.arange(len(result.nonzero()[0])), columns = ['id','Left', 'Right', 'Similarity'])
    for i in range(len(result.nonzero()[0])):
        matchdf.loc[i, 'Left'] = leftNames[result.nonzero()[0][i]]
        matchdf.loc[i, 'Right'] = rightNames[result.nonzero()[1][i]]
        matchdf.loc[i, 'Similarity'] = result[result.nonzero()[0][i]][result.nonzero()[1][i]]
    matchdf['id'] = threadID
    return matchdf.drop_duplicates(subset = 'Right')

def pullTickers(string):
    dollarTicker = list(set(re.findall(r"\$\b[A-Z]{1,4}\b",string)))
    manualTicker = re.findall(r"\b[A-Z]{2,4}\b",string)
    manualTicker = set(manualTicker).difference(notTickers)
    manualTicker = set(['$' + manualTicker for manualTicker in manualTicker])
    combined = manualTicker.union(dollarTicker)
    combined = list(combined.intersection(companydf['Manual']))
    combined = set([combined[1:] for combined in combined])
    return combined

notTickers = {'DCF', 'IMO', 'CAN', 'MMS', 'ARE', 'CDC', 'NEW', 'LOVE', 'NYC', 'CASH', 'AI', 
'NAV', 'GOOD', 'DD', 'ATH', 'APPS', 'EDIT', 'WOW', 'PCB', 'UNIT', 'TA', 'VG', 'SELF', 'MR',
'RARE', 'ALEX', 'KEY', 'STIM', 'GO', 'SEE', 'CFO', 'CAL', 'REV', 'PE', 'CHI', 'EVE', 'PDT',
'CO', 'EV', 'TTM', 'EOD', 'AT', 'HUGE', 'ES', 'ONE', 'PT', 'CEO', 'ZEN', 'NOW', 'JAN', 'O',
'OR', 'PG', 'ROCK', 'FOUR', 'ONE', 'TWO', 'FIVE', 'SIX', 'NINE', 'TEN', 'ON', 'SU', 'XT',
'WELL', 'NOV', 'MAR', 'JAN', 'FUN', 'NOW', 'VERY', 'USA', 'POST', 'ALL', 'IT', 'GDP', 'RH',
}

In [3]:
client = MongoClient('localhost', 27017)
db = client.db
comments = db.comments
companies = db.companylist

In [4]:
companydf = pd.DataFrame.from_records(companies.find())
commentdf = pd.DataFrame.from_records(comments.find({'$and': [{'stocks': {'$exists': False}},{'created_utc': {'$exists': True}}]}).limit(100000))

In [5]:
commentdf['sentiment'] = 0
commentdf['stocks'] = 0
commentdf['stocks'] = commentdf['stocks'].astype(object)

In [6]:
def cleanStrings(string):
    return re.sub("[^a-zA-Z0-9./$:,'&]+", ' ',string) #only include normal string characters
def cleanText(text):
    return re.sub("http[s]?://\S+", ' ', text) #Remove links

In [7]:
commentdf['body'] = commentdf['body'].apply(cleanStrings)
commentdf['body'] = commentdf['body'].apply(cleanText)

In [8]:
companyNames = companydf['Name'].unique()
companySymbols = companydf['Symbol'].unique()
nameVectorizer = TfidfVectorizer(min_df = 1)
symbolVectorizer = TfidfVectorizer(min_df = 1)
companyMatrix = nameVectorizer.fit_transform(companyNames)
symbolMatrix = symbolVectorizer.fit_transform(companySymbols)

In [9]:
for i in range(len(commentdf)):
    print(i)
    sent = commentdf['body'][i]
    threadID = i
    chunks = getContinuousChunks(sent)
    chunkdf = chunks.loc[(chunks['Label'] == 'ORGANIZATION') | (chunks['Label'] == 'PERSON')].reset_index()
    stocklist = set()

    if len(chunkdf['Label']) > 0:
        nerNameMatrix = nameVectorizer.transform(chunkdf['Named Entity'])
        nerSymbolMatrix = symbolVectorizer.transform(chunkdf['Named Entity'])
        nameResult = cosine_similarity(nerNameMatrix, companyMatrix)
        symbolResult = cosine_similarity(nerSymbolMatrix, symbolMatrix)
        namedf = mapResults(nameResult, chunkdf['Named Entity'], companyNames, threadID, 0.85)
        symboldf = mapResults(symbolResult, chunkdf['Named Entity'], companySymbols, threadID, 1)
        stocklist = set(namedf.loc[namedf['Similarity'] > 0.8, 'Right'].append(symboldf.loc[symboldf['Similarity'] > 0.999, 'Right']))
    commentdf.at[threadID, 'stocks'] = stocklist

6
96667
96668
96669
96670
96671
96672
96673
96674
96675
96676
96677
96678
96679
96680
96681
96682
96683
96684
96685
96686
96687
96688
96689
96690
96691
96692
96693
96694
96695
96696
96697
96698
96699
96700
96701
96702
96703
96704
96705
96706
96707
96708
96709
96710
96711
96712
96713
96714
96715
96716
96717
96718
96719
96720
96721
96722
96723
96724
96725
96726
96727
96728
96729
96730
96731
96732
96733
96734
96735
96736
96737
96738
96739
96740
96741
96742
96743
96744
96745
96746
96747
96748
96749
96750
96751
96752
96753
96754
96755
96756
96757
96758
96759
96760
96761
96762
96763
96764
96765
96766
96767
96768
96769
96770
96771
96772
96773
96774
96775
96776
96777
96778
96779
96780
96781
96782
96783
96784
96785
96786
96787
96788
96789
96790
96791
96792
96793
96794
96795
96796
96797
96798
96799
96800
96801
96802
96803
96804
96805
96806
96807
96808
96809
96810
96811
96812
96813
96814
96815
96816
96817
96818
96819
96820
96821
96822
96823
96824
96825
96826
96827
96828
96829
96830
96831
96832
96

In [10]:
commentdf['stocks'] = [set(a).union(b) for a,b in zip(commentdf['stocks'], commentdf['body'].apply(pullTickers))]

In [11]:
for i in range(len(commentdf['stocks'])):
    tickerset = commentdf.loc[i, 'stocks']
    named = tickerset.intersection(companydf['Name'])
    if len(named) > 0:
        tickerset = tickerset.difference(named)
        namedTickers = set()
        for j in named:
            namedTickers.add(companydf.loc[companydf['Name'] == j, 'Symbol'].iloc[0])
        commentdf.at[i,'stocks'] = tickerset.union(namedTickers)


In [12]:
commentdf['stocks'] = [list(tickers) for tickers in commentdf['stocks']]

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
s = commentdf['stocks']
mlb = MultiLabelBinarizer()
pd.DataFrame(mlb.fit_transform(s), columns = mlb.classes_, index = commentdf.index)

Unnamed: 0,A,AA,AAL,AAOI,AAP,AAPL,AAXN,ABBV,ABC,ABEV,...,YPF,YUM,YY,Z,ZEUS,ZM,ZNGA,ZS,ZTS,ZYXI
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
commentdf

Unnamed: 0,_id,id,created_utc,body,score,sentiment,stocks
0,5fac60f84f1d40506940af6e,f24h9o9,1.569952e+09,deleted,3,0,[]
1,5fac60f84f1d40506940af6f,f23y8fn,1.569941e+09,Donald J. Trump realDonaldTrump 1m As I p...,7,0,[]
2,5fac60f84f1d40506940af70,f24vyaa,1.569961e+09,I subscribe to this reality,3,0,[]
3,5fac60f84f1d40506940af71,f24dxww,1.569950e+09,nice,1,0,[]
4,5fac60f84f1d40506940af72,f243mu5,1.569944e+09,Na that s just me who farted after eating chip...,1,0,[]
...,...,...,...,...,...,...,...
99995,5facd11d4f1d40506942360a,f8rn6a5,1.574745e+09,Tradetalksgoingwell.exe followed by HSI drop,6,0,[]
99996,5facd11d4f1d40506942360b,f8qt9jk,1.574724e+09,I thought he said the stock market hated the i...,1,0,[]
99997,5facd11d4f1d40506942360c,f8r6qlp,1.574733e+09,futures mooning for no reason first time,3,0,[]
99998,5facd11d4f1d40506942360d,f8qi6ox,1.574718e+09,"Powell, please oh please break the market",2,0,[]


# Sentiment

In [26]:
from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('SavedModels/bestModel.pt')

2020-11-14 18:16:26,928 loading file SavedModels/bestModel.pt


In [27]:
sentence = Sentence("drill team six")
classifier.predict(sentence)
sentence.labels[0].to_dict()['value']

'bearish'

In [29]:
commentdf['sentiment'] = commentdf['sentiment'].astype(object)

In [32]:
for i in range(len(commentdf)):
    sent = Sentence(commentdf.loc[i,'body'])
    classifier.predict(sent)
    predictions = sent.labels[0].to_dict()
    commentdf.at[i,'sentiment'] = predictions['value']

Sentence: "deleted"   [− Tokens: 1]
Sentence: "Donald J. Trump realDonaldTrump 1m As I predicted , Jay Powell and the Federal Reserve have allowed the Dollar to get so strong , especially relative to ALL other currencies , that our manufacturers are being negatively affected . Fed Rate too high . They are their own worst enemies , they don t have a clue . Pathetic"   [− Tokens: 59]
Sentence: "I subscribe to this reality"   [− Tokens: 5]
Sentence: "nice"   [− Tokens: 1]
Sentence: "Na that s just me who farted after eating chipotle"   [− Tokens: 10]
Sentence: "same here bud , just a bump needed"   [− Tokens: 8]
Sentence: "Its run up too much puts"   [− Tokens: 6]
Sentence: "deleted"   [− Tokens: 1]
Sentence: "No way . Been bitten by that shit multiple times the past week or so . Power hour or nothing for me"   [− Tokens: 22]
Sentence: "Any plays for today Asides from hoping SPY isn t too gay ."   [− Tokens: 13]
Sentence: "That would be amazing . Cmon bear HFs , post a bs article aboit ho

IndexError: list index out of range

In [31]:
commentdf

Unnamed: 0,_id,id,created_utc,body,score,sentiment,stocks
0,5fac60f84f1d40506940af6e,f24h9o9,1.569952e+09,deleted,3,neutral,[]
1,5fac60f84f1d40506940af6f,f23y8fn,1.569941e+09,Donald J. Trump realDonaldTrump 1m As I p...,7,neutral,[]
2,5fac60f84f1d40506940af70,f24vyaa,1.569961e+09,I subscribe to this reality,3,neutral,[]
3,5fac60f84f1d40506940af71,f24dxww,1.569950e+09,nice,1,neutral,[]
4,5fac60f84f1d40506940af72,f243mu5,1.569944e+09,Na that s just me who farted after eating chip...,1,neutral,[]
...,...,...,...,...,...,...,...
99995,5facd11d4f1d40506942360a,f8rn6a5,1.574745e+09,Tradetalksgoingwell.exe followed by HSI drop,6,0,[]
99996,5facd11d4f1d40506942360b,f8qt9jk,1.574724e+09,I thought he said the stock market hated the i...,1,0,[]
99997,5facd11d4f1d40506942360c,f8r6qlp,1.574733e+09,futures mooning for no reason first time,3,0,[]
99998,5facd11d4f1d40506942360d,f8qi6ox,1.574718e+09,"Powell, please oh please break the market",2,0,[]


# Visualization