In [None]:
#!pip install py2neo
#!pip install nlp_rake
#!pip install pytextrank
#!python3 -m pip install -U pip
#!python3 -m pip install -r requirements.txt
#!python3 -m spacy download en_core_web_sm

In [1]:
import pandas as pd
from py2neo import Graph,Node,Relationship
from py2neo.bulk import create_nodes
import re
from py2neo.bulk import create_relationships
import nltk
import re
import collections 

In [2]:
# Read Reddit post on wallstreetbets subreddit
df_p = pd.read_csv('archive/wsb-aug-2021-posts.csv', usecols=None,names=['type', 'p_id', 'subid',
                    'name', 'nsfw','p_created','p_permalink','domain','url','selftext','p_title','p_score'],skiprows=1)

In [3]:
# Retain relevant columns
df_p = df_p[['p_id','p_created','p_permalink','p_title','p_score']]
df_p.shape

(25751, 5)

In [4]:
# Read Comments to reddit posts file
df_c = pd.read_csv('archive/wsb-aug-2021-comments.csv', usecols=None, names=['type', 'c_id', 'subid',
                    'name', 'nsfw','c_created','c_permalink','c_body','c_sentiment','c_score'],skiprows=1)

In [5]:
# Retain relevant columns
df_c=df_c[['c_id','c_created','c_permalink','c_body','c_score']]
df_c['parentid']=df_c.c_permalink.str.slice(49,55) # Extract original post id from permalink
df_c.shape

(1001160, 6)

In [6]:
df_p.head()

Unnamed: 0,p_id,p_created,p_permalink,p_title,p_score
0,pfi0x7,1630454321,https://old.reddit.com/r/wallstreetbets/commen...,Is BABA the next?,1
1,pfhz92,1630454157,https://old.reddit.com/r/wallstreetbets/commen...,$TELL- According to Wall Street Journal its a ...,1
2,pfhxzc,1630454028,https://old.reddit.com/r/wallstreetbets/commen...,IS BABA next?,1
3,pfhw6s,1630453851,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.K overnight on FIVN puts. Thanks ZM!,79
4,pfhtyf,1630453627,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.1K overnight on FIVN puts. Thanks ZM!,2


In [7]:
df_c.head()

Unnamed: 0,c_id,c_created,c_permalink,c_body,c_score,parentid
0,hb4hdm3,1630454394,https://old.reddit.com/r/wallstreetbets/commen...,What's updog,3,pfdkjw
1,hb4hdm8,1630454394,https://old.reddit.com/r/wallstreetbets/commen...,Don’t tell em,1,pfdkjw
2,hb4hdjc,1630454393,https://old.reddit.com/r/wallstreetbets/commen...,"I realize this, ive been losing thousands shoo...",2,pfdkjw
3,hb4hdgo,1630454392,https://old.reddit.com/r/wallstreetbets/commen...,then it tanks after earnings,4,pfgr1h
4,hb4hdeh,1630454391,https://old.reddit.com/r/wallstreetbets/commen...,Are you saying I should or shouldn’t yolo my l...,1,pf3xee


### Data Cleanup and merging

In [8]:
df_p.shape, df_c.shape

((25751, 5), (1001160, 6))

In [9]:
#Dropping null values
df_c.dropna(inplace=True)

In [10]:
df_p.shape, df_c.shape

((25751, 5), (1001158, 6))

In [11]:
# Dropping rows with body and title as [removed]
df_p=df_p[~df_p.p_title.str.match(pat='\[?removed\]')]
df_c=df_c[~df_c.c_body.str.match(pat='\[?removed\]')]
df_p.shape,df_c.shape

((25751, 5), (857417, 6))

In [12]:
# Dropping rows with body and title as [deleted]
df_p=df_p[~df_p.p_title.str.match(pat='\[?deleted\]')]
df_c=df_c[~df_c.c_body.str.match(pat='\[?deleted\]')]
df_p.shape,df_c.shape

((25751, 5), (813950, 6))

In [13]:
df_m=df_p.merge(df_c, left_on='p_id', right_on='parentid')

In [14]:
df_m.shape

(799702, 11)

In [15]:
# Dropping rows with body and title as containing string 'Your submission was removed'
df_m=df_m[~df_m.c_body.str.contains('Your submission was removed')]
df_m=df_m[~df_m.c_body.str.contains('Your submission was removed')]
print('Shape of final dataframe after data cleanup',df_m.shape)

Shape of final dataframe after data cleanup (783773, 11)


In [16]:
df_p.reset_index(inplace=True,drop=True)
df_c.reset_index(inplace=True,drop=True)
df_m.reset_index(inplace=True,drop=True)
df_m.head()

Unnamed: 0,p_id,p_created,p_permalink,p_title,p_score,c_id,c_created,c_permalink,c_body,c_score,parentid
0,pfhw6s,1630453851,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.K overnight on FIVN puts. Thanks ZM!,79,hb4grlq,1630454109,https://old.reddit.com/r/wallstreetbets/commen...,How do people do this?,4,pfhw6s
1,pfhw6s,1630453851,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.K overnight on FIVN puts. Thanks ZM!,79,hb4ga59,1630453885,https://old.reddit.com/r/wallstreetbets/commen...,\n**User Report**| | | |\n:--|:--|:--|:--\n**T...,1,pfhw6s
2,pfhtyf,1630453627,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.1K overnight on FIVN puts. Thanks ZM!,2,hb4ft1v,1630453666,https://old.reddit.com/r/wallstreetbets/commen...,\n**User Report**| | | |\n:--|:--|:--|:--\n**T...,1,pfhtyf
3,pfhtyf,1630453627,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.1K overnight on FIVN puts. Thanks ZM!,2,hb4fs9a,1630453655,https://old.reddit.com/r/wallstreetbets/commen...,I am a bot from /r/wallstreetbets. You submitt...,1,pfhtyf
4,pfhq3j,1630453246,https://old.reddit.com/r/wallstreetbets/commen...,Does anyone know what the first stock symbol i...,4,hb4gzf2,1630454210,https://old.reddit.com/r/wallstreetbets/commen...,"That company was delisted, the value is prob a...",3,pfhq3j


### Most popular Post

In [17]:
print(f'''Most popular posts is {df_p[df_p.p_score == df_p.p_score.max()].p_title.values} 
with the score of {df_p.p_score.max()}''')

Most popular posts is ['My portfolio after discovering wsb'] 
with the score of 45414


### Most popular Comment

In [18]:
print(f'''Most popular posts is {df_c[df_c.c_score == df_c.c_score.max()].c_body.values} 
      with the score of {df_c.c_score.max()}''')

Most popular posts is ['Ok so I\'d just remove the letter "k" from the post because if you do that then you\'ll only be down $15 instead of $15k, and that\'s a really manageable loss.'] 
      with the score of 21129


### Most commented post

In [19]:
a=df_m['parentid'].mode()
title=df_m.p_title[df_m.p_id == 'p3sv76']
print(f'Most commented post is: {title[title.index[0]]}')

Most commented post is: Weekend Discussion Thread for the Weekend of August 13, 2021


## Keyword Extraction

In [32]:
# We had a problem of scale. Running any line below was not possible for 1M+ records so we decided to curtail our dataset

v = df_m.p_id.value_counts()
df_m=df_m[df_m.p_id.isin(v.index[v.gt(15000)])]
df_m.reset_index(drop=True, inplace=True)

In [34]:
df_m.shape

(71732, 11)

### Using RAKE (We decided to use rack over spacy for better output and performance)

In [22]:
#Download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/asachan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nlp_rake import Rake
r = Rake(
    min_chars=2,
    max_words=5,
    min_freq=1
)

In [53]:
posts = df_m.p_title.unique()
titles = df_m.c_body.unique()
print('Length of unique posts: ',len(posts))
print('Length of unique titles: ',len(title))

Length of unique posts:  4
Length of unique titles:  19259


In [54]:
%%time
# Get all stock symbol from title of the post.
title_ent=[]
for i in range(len(posts)):   
    keywords = r.apply(posts[i])
    if len(keywords) > 0:
        result = re.search("\$(\w+)", keywords[0][0])
        if result:
            title_ent.append(result.group(1))
print('Number of keywords extracted:', len(title_ent))

Number of keywords extracted: 0
CPU times: user 30.8 ms, sys: 2.49 ms, total: 33.3 ms
Wall time: 31.7 ms


In [55]:
%%time
# Get all stock symbol from the body of the comments.
body_ent=[]
for i in range(len(titles)):    
    keywords = r.apply(titles[i])
    if len(keywords) > 0:
        result = re.search("\$(\w+)", keywords[0][0])
        if result:
            body_ent.append(result.group(1))
print('Number of keywords extracted:', len(body_ent))

Number of keywords extracted: 783
CPU times: user 6min 14s, sys: 2.38 s, total: 6min 17s
Wall time: 6min 18s


In [84]:
m_com=[]
for i in range(len(body_ent)):
    if body_ent[i].isalpha():
        m_com.append(body_ent[i])
m_com=set(m_com)

In [86]:
print('Number of companies mentioned',len(m_com))

Number of companies mentioned 124


### Using pytextrank and spacy (Output especially with '$' prefixed string is not as good as RACK)

In [28]:
#https://towardsdatascience.com/keyword-extraction-a-benchmark-of-7-algorithms-in-python-8a905326d93f - Keyword extraction comparison
# import pytextrank
# import spacy

In [None]:
# nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("textrank")

In [None]:
# %%time
# ## Extract keywords from title of the post
# p_ent=[]
# for i in range(len(df_p)):
#     doc=nlp(df_p.p_title[i])
#     for ent in doc.ents:
#         p_ent.append(ent.text) # named entity and noun chunk yields same result

In [None]:
# %%time
# ## Extract keywords from body of the comments
# c_ent=[]
# for i in range(len(df_c)):
#     doc=nlp(df_c.c_body[i])
#     for ent in doc.ents:
#         c_ent.append(ent.text) # named entity and noun chunk yields same result

## Graph Section

#### Project and dabase was manually created

In [63]:
g = Graph("bolt://localhost:11006", auth=('neo4j', 'admin'))

In [65]:
# # Create Post nodes
# data=[]
# posts=df_p
# for i in range(len(posts)):
#     x=posts.id[i]
#     y=posts.title[i]
#     z=str(posts.score[i])
#     dict1 = {'id':x, 'title':y, 'score':z}
#     data.append(dict1)
# create_nodes(g.auto(), data, labels={"Post"})

# #Create comments nodes
# data=[]
# comments=df_c
# for i in range(len(comments)):
#     x=comments.id[i]
#     y=comments.body[i]
#     z=str(comments.score[i])
#     zz=comments.parentid[i]
#     dict1 = {'id':x, 'title':y, 'score':z,'parentid':zz}
#     data.append(dict1)

# from itertools import islice
# stream = iter(data)
# batch_size = 10000
# while True:
#     batch = islice(stream, batch_size)
#     if batch:
#         create_nodes(g.auto(), batch, labels={"Comment"})
#     else:
#         break


In [108]:
%%time
# This code creates nodes and relationship between nodes
for i in range(len(df_m)):

    a = Node("POST", p_id=df_m.p_id[i], title=df_m.p_title[i], score=str(df_m.p_score[i]))
    a.__primarylabel__ = "POST"
    a.__primarykey__ = "p_id"
    b = Node("COMMENT", c_id=df_m.c_id[i], body=df_m.c_body[i], score=str(df_m.c_score[i]), parentid=df_m.parentid[i])
    b.__primarylabel__ = "COMMENT"
    b.__primarykey__ = "c_id"
    HAS = Relationship.type("HAS")
    g.merge(HAS(a, b)) # Relations between post and comment
    keywords = r.apply(df_m.c_body[i])
    m_com=[]
    if len(keywords) > 0:
        result = re.search("\$(\w+)", keywords[0][0])
        if result:
            m_com.append(result.group(1))

    for i in m_com:
        c = Node("Company", name=i)
        c.__primarylabel__ = "Company"
        c.__primarykey__ = "name"
        MENTIONS = Relationship.type("MENTIONS")
        g.merge(MENTIONS(b, c)) # Relations between comment and companies mentioned
        

In [111]:
df_m.p_id.unique()

array(['pcuv2j', 'p8cqpr', 'p3sv76', 'ozebic'], dtype=object)

In [101]:
# # Using Bulk method
# from py2neo import Graph
# from py2neo.bulk import create_relationships
# d=data[0:5]
# from itertools import islice
# stream = iter(d)
# batch_size = 10000
# while True:
#     batch = islice(stream, batch_size)
#     if batch:
#         create_relationships(g.auto(), batch, "HAS", \
#     start_node_key=("Post", "id", "title","score"), end_node_key=("Comment", "id","body","score","parentid"))
#     else:
#         break


In [106]:
g.delete_all()

In [107]:
g.nodes.match("POST").count()

0

In [None]:
g.nodes.match("Comment").count()

### Pulling stock price data from finance.yahoo.com

In [None]:
companies = pd.read_csv('Companies.csv')
comps = companies.iloc[:,0]

In [None]:
import yfinance as yf
comp_prices = pd.DataFrame()
for comp in comps:
    tick = yf.Ticker(comp)
    dat = tick.history(start='2021-08-01', end='2021-08-31')
    dat['Delta'] = dat['Close']-dat['Open']
    dat['Ticker'] = comp
    comp_prices = comp_prices.append(dat)
comp_prices = comp_prices.reset_index(drop=False)
df_prices=comp_prices[['Ticker','Date','Open','Close','Delta']]

In [None]:
df_prices