In [1]:
import pandas as pd
import yfinance as yf
import pymongo
import time
import datetime
import re
import numpy as np
import nltk
import altair as alt

from datetime import datetime, timedelta
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Open the local MongoDB server and pick the `comments` collection of database `db`.
client = MongoClient('localhost', 27017)
db = client.db
comments = db.comments

# Pull every document that has a `created_utc` field into a DataFrame.
commentdf = pd.DataFrame.from_records(
    comments.find({'created_utc': {'$exists': True}})
)

# Discard rows whose sentiment equals 0, then renumber the remaining rows from 0.
commentdf = commentdf[commentdf['sentiment'] != 0].reset_index(drop=True)

In [4]:
# Fetch SPY price history for the given start/end dates via yfinance,
# with the download progress bar turned off.
spydf = yf.download(
    'SPY',
    start='2019-10-01',
    end='2020-10-03',
    progress=False,
)

In [5]:
# Bin edges used to bucket comments: every value of SPY's 'Date' column, in order.
dayBins = spydf.reset_index()['Date'].tolist()

In [6]:
# Convert the epoch-second values in `created_utc` to naive UTC timestamps.
# A single vectorized call replaces the per-row Python loop over
# `datetime.utcfromtimestamp` (deprecated since Python 3.12) and no longer
# needs a loop variable named `time`, which read like the imported module.
commentdf['created_utc'] = pd.to_datetime(commentdf['created_utc'], unit='s')

In [7]:
# Bucket each comment between consecutive entries of dayBins and label the
# bucket with its left edge. pd.cut's default intervals are right-closed, so a
# comment lands in (edge_i, edge_{i+1}] and is tagged with edge_i.
# pd.cut requires exactly len(bins) - 1 labels, so they are derived from
# dayBins itself instead of a hard-coded row count (previously [0:254]),
# which would raise as soon as the downloaded date range changed.
commentdf['days'] = pd.cut(commentdf['created_utc'], bins=dayBins, labels=dayBins[:-1])

In [8]:
# Count comment ids for every (day bucket, sentiment) pair.
data = commentdf.pivot_table(
    values='id',
    index=['days', 'sentiment'],
    aggfunc='count',
)

In [9]:
# Overwrite the `days` column with its string representation.
commentdf['days'] = commentdf['days'].astype(str)

In [10]:
# Move the index levels back into ordinary columns so they can be used
# as fields by later cells.
data = data.reset_index(drop=False)

In [11]:
# Leave out the 'neutral' sentiment rows; every other sentiment is kept.
data = data[data['sentiment'] != 'neutral']

In [12]:
# Store the day buckets as plain strings.
data = data.astype({'days': str})

In [21]:
# Stacked bar chart of comment counts per day, normalised so each bar shows
# the share of every sentiment; colours follow ascending sentiment order.
bar = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X('days:T'),
        y=alt.Y('id:Q', stack='normalize'),
        color=alt.Color('sentiment', sort='ascending'),
    )
)

In [28]:
# Red line chart of SPY's 'Open' column against its 'Date' column.
line = (
    alt.Chart(spydf.reset_index())
    .mark_line(color="#E21D29")
    .encode(
        x=alt.X('Date:T'),
        y=alt.Y('Open:Q'),
    )
)

In [30]:
# Overlay the price line on the sentiment bars; each layer keeps its own y scale.
(bar + line).resolve_scale(y='independent')

In [25]:
# Render the SPY line chart on its own, without the sentiment bars.
line

In [5]:
# Create the `size` column with 0 in every row; it is filled with the real
# per-comment counts in the following cell.
commentdf['size'] = 0

In [6]:
# Record how many entries each comment's `stocks` value holds.
# The column is built in a single positional pass, so the result no longer
# depends on the index being exactly 0..n-1 (the previous label-based
# `.at[i]` / `.loc[i]` loop over range(len(...)) did) and avoids one scalar
# DataFrame write per row.
commentdf['size'] = [len(tickers) for tickers in commentdf['stocks']]

In [7]:
# Show the comment frame (the notebook truncates the rendering to the first and last rows).
commentdf

Unnamed: 0,_id,id,created_utc,body,score,sentiment,stocks,size
0,5fac60f84f1d40506940af6f,f23y8fn,1.569941e+09,[**Donald J. Trump**@realDonaldTrump](https://...,7,neutral,[],0
1,5fac60f84f1d40506940af70,f24vyaa,1.569961e+09,I subscribe to this reality,3,neutral,[],0
2,5fac60f84f1d40506940af71,f24dxww,1.569950e+09,nice,1,neutral,[],0
3,5fac60f84f1d40506940af72,f243mu5,1.569944e+09,Na that’s just me who farted after eating chip...,1,neutral,[],0
4,5fac60f84f1d40506940af73,f24ssqk,1.569960e+09,"same here bud, just a bump needed",1,bullish,[],0
...,...,...,...,...,...,...,...,...
2392628,5faf404fa6460ea208f39e70,g79oznl,1.601543e+09,Are you going to publish your research?,5,neutral,[],0
2392629,5faf404fa6460ea208f39e71,g79bjxq,1.601530e+09,Ah fucking boomer mentality,7,neutral,[],0
2392630,5faf404fa6460ea208f39e72,g77wunh,1.601500e+09,Yea for no reason at all,1,neutral,[],0
2392631,5faf404fa6460ea208f39e73,g780x3y,1.601502e+09,what are you looking to do? Hold long and exit...,1,neutral,[],0


In [8]:
# Keep only the comments whose `size` is non-zero.
stockdf = commentdf[commentdf['size'].ne(0)]

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the `stocks` lists: one row per comment in stockdf (same
# index), one column per distinct label learned by the binarizer.
mlb = MultiLabelBinarizer()
s = stockdf['stocks']
onehotdf = pd.DataFrame(
    data=mlb.fit_transform(s),  # must run before mlb.classes_ is read below
    index=stockdf.index,
    columns=mlb.classes_,
)

In [10]:
# Total each one-hot column (one total per label), ordered from smallest to largest.
stocklist = onehotdf.sum(axis='index').sort_values(ascending=True)

In [11]:
# Number of entries in stocklist, i.e. one per one-hot column.
len(stocklist)

3503

In [12]:
# Show the 100 largest totals. stocklist is sorted ascending, so they are the
# last 100 entries; `.tail` avoids hard-coding positions (3403:3503) that are
# only valid while the series has exactly 3503 entries.
stocklist.tail(100)

TLT       959
HAS       960
COST      968
AMC       976
LYFT      993
        ...  
AMD     13577
MSFT    17321
AAPL    18889
TSLA    43351
SPY     51261
Length: 100, dtype: int64

In [13]:
# Keep just the labels of the 25 largest totals, as a plain list in ascending
# order of total. `.tail(25)` replaces the hard-coded slice 3478:3503, which
# only selected the top 25 while the series had exactly 3503 entries, and the
# index is read directly instead of round-tripping through a dict.
stocklist = stocklist.tail(25).index.tolist()

In [14]:
# Rebuild stocklist as a fresh list holding the same items in the same order.
stocklist = [ticker for ticker in stocklist]

In [15]:
# Display the selected labels.
stocklist

['ATVI',
 'RH',
 'NIO',
 'AAL',
 'BABA',
 'PTON',
 'NFLX',
 'MGM',
 'ZM',
 'RKT',
 'NVDA',
 'QQQ',
 'DKNG',
 'WMT',
 'NKLA',
 'DIS',
 'FB',
 'AMZN',
 'SPCE',
 'BA',
 'AMD',
 'MSFT',
 'AAPL',
 'TSLA',
 'SPY']

In [16]:
# For each row of the one-hot frame, count how many columns are set; store
# the result as {row label: count}, ordered by ascending count.
chordDict = (
    onehotdf
    .sum(axis='columns')
    .sort_values()
    .to_dict()
)

In [17]:
# Drop every row whose count is exactly 1.
chordDict = {
    row_label: label_count
    for row_label, label_count in chordDict.items()
    if label_count != 1
}

In [18]:
# Number of rows remaining in chordDict.
len(chordDict)

61541

In [22]:
# Restrict the one-hot frame to the rows whose index label is a key of chordDict.
connectiondf = onehotdf.loc[onehotdf.index.isin(chordDict.keys())]

In [25]:
# Column totals over the restricted frame, as {column label: total} in
# ascending order of total.
connectionDict = (
    connectiondf
    .sum(axis='index')
    .sort_values()
    .to_dict()
)

In [26]:
# Remove labels whose total is 0.
connectionDict = {
    label: total
    for label, total in connectionDict.items()
    if total != 0
}

In [28]:
# Number of labels remaining in connectionDict.
len(connectionDict)

2704

In [36]:
# Keep only labels whose total is strictly greater than 100.
connectionDict = {
    label: total
    for label, total in connectionDict.items()
    if total > 100
}

In [38]:
# Display the filtered {label: total} mapping.
connectionDict

{'LOGI': 101,
 'BP': 104,
 'CMG': 105,
 'BOX': 105,
 'BB': 105,
 'GL': 107,
 'IQ': 107,
 'CLDR': 107,
 'PPT': 108,
 'PLAN': 108,
 'SPG': 108,
 'FL': 109,
 'DPZ': 110,
 'ZS': 110,
 'QCOM': 110,
 'CAT': 112,
 'JBLU': 113,
 'GRUB': 113,
 'SRNE': 117,
 'SWBI': 118,
 'TLRY': 119,
 'COTY': 120,
 'JP': 121,
 'PINS': 122,
 'DHT': 122,
 'STNG': 123,
 'LYV': 125,
 'HUYA': 126,
 'AONE': 126,
 'EA': 127,
 'VALE': 127,
 'UI': 128,
 'TTWO': 130,
 'APT': 130,
 'ANY': 133,
 'KIM': 134,
 'WWE': 135,
 'V': 136,
 'DTE': 137,
 'IBM': 137,
 'APHA': 138,
 'MTB': 139,
 'PLUG': 140,
 'BRK.B': 143,
 'ONE': 145,
 'ENPH': 145,
 'EVER': 146,
 'FOX': 147,
 'DBX': 147,
 'LIVE': 151,
 'FMCI': 151,
 'MRO': 152,
 'CSOD': 153,
 'FORD': 153,
 'TLT': 154,
 'DELL': 155,
 'TDOC': 155,
 'EURN': 160,
 'ADBE': 162,
 'DGLY': 162,
 'SNOW': 163,
 'BRK.A': 163,
 'SGU': 164,
 'NRZ': 165,
 'RUN': 166,
 'CVS': 166,
 'LIFE': 167,
 'STAR          ': 168,
 'SDC': 170,
 'OI': 172,
 'CZR': 172,
 'MS': 173,
 'NKE': 174,
 'LL': 174,
 'STAY

# Visualization