In [None]:
#!pip install py2neo
#!pip install nlp_rake
#!pip install pytextrank
#!python3 -m pip install -U pip
#!python3 -m pip install -r requirements.txt
#!python3 -m spacy download en_core_web_sm

In [9]:
import os
import re

import pandas as pd
from py2neo import Graph,Node,Relationship
from py2neo.bulk import create_nodes

In [10]:
# Load the r/wallstreetbets posts export. The file's first row is skipped and
# the columns are named explicitly instead.
post_columns = ['type', 'id', 'subid', 'name', 'nsfw', 'created',
                'permalink', 'domain', 'url', 'selftext', 'title', 'score']
df_p = pd.read_csv('archive/wsb-aug-2021-posts.csv', names=post_columns, skiprows=1)

In [11]:
# Keep only the post fields that the rest of the notebook reads
kept_post_columns = ['id', 'permalink', 'title', 'score']
df_p = df_p[kept_post_columns]

In [12]:
# Preview the first rows of the trimmed posts frame
df_p.head()

Unnamed: 0,id,permalink,title,score
0,pfi0x7,https://old.reddit.com/r/wallstreetbets/commen...,Is BABA the next?,1
1,pfhz92,https://old.reddit.com/r/wallstreetbets/commen...,$TELL- According to Wall Street Journal its a ...,1
2,pfhxzc,https://old.reddit.com/r/wallstreetbets/commen...,IS BABA next?,1
3,pfhw6s,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.K overnight on FIVN puts. Thanks ZM!,79
4,pfhtyf,https://old.reddit.com/r/wallstreetbets/commen...,1.4K to 7.1K overnight on FIVN puts. Thanks ZM!,2


In [13]:
# Report the (rows, columns) size of the posts frame
posts_shape = df_p.shape
print('Shape of Posts File: ', posts_shape)

Shape of Posts File:  (25751, 4)


In [14]:
# Load the comments export. The file's first row is skipped and the columns
# are named explicitly instead.
comment_columns = ['type', 'id', 'subid', 'name', 'nsfw', 'created',
                   'permalink', 'body', 'sentiment', 'score']
df_c = pd.read_csv('archive/wsb-aug-2021-comments.csv', names=comment_columns, skiprows=1)

In [15]:
# Retain relevant columns
df_c = df_c[['id', 'permalink', 'body', 'sentiment', 'score']]
# Extract the original post id from the permalink: it is the path segment that
# follows "/comments/". Matching on the URL structure rather than on fixed
# character offsets (49:55) keeps the result correct if the host prefix or the
# id length differs; permalinks without that segment yield NaN.
df_c = df_c.assign(
    parentid=df_c['permalink'].str.extract(r'/comments/([^/]+)', expand=False)
)

In [16]:
# Preview the first comments, including the derived parentid column
df_c.head()

Unnamed: 0,id,permalink,body,sentiment,score,parentid
0,hb4hdm3,https://old.reddit.com/r/wallstreetbets/commen...,What's updog,,3,pfdkjw
1,hb4hdm8,https://old.reddit.com/r/wallstreetbets/commen...,Don’t tell em,,1,pfdkjw
2,hb4hdjc,https://old.reddit.com/r/wallstreetbets/commen...,"I realize this, ive been losing thousands shoo...",0.6369,2,pfdkjw
3,hb4hdgo,https://old.reddit.com/r/wallstreetbets/commen...,then it tanks after earnings,0.0,4,pfgr1h
4,hb4hdeh,https://old.reddit.com/r/wallstreetbets/commen...,Are you saying I should or shouldn’t yolo my l...,0.2732,1,pf3xee


In [17]:
# Print shape of comments file (the label previously said "Posts" although
# df_c holds the comments)
print('Shape of Comments File: ', df_c.shape)

Shape of Posts File:  (1001160, 6)


### Most popular Post

In [18]:
# Highest post score, and the title of every post that reaches it
top_post_score = df_p.score.max()
top_post_titles = df_p[df_p.score == top_post_score].title.values
print(f'Most popular posts is {top_post_titles} with the score of {top_post_score}')

Most popular posts is ['My portfolio after discovering wsb'] with the score of 45414


### Most popular Comment

In [19]:
# Highest comment score, and the body of every comment that reaches it.
# The message previously said "posts" although it reports a comment.
top_comment_score = df_c.score.max()
top_comment_bodies = df_c.loc[df_c.score == top_comment_score, 'body'].values
print(f'Most popular comment is {top_comment_bodies} with the score of {top_comment_score}')

Most popular posts is ['Ok so I\'d just remove the letter "k" from the post because if you do that then you\'ll only be down $15 instead of $15k, and that\'s a really manageable loss.'] with the score of 21129


### Most commented post

In [20]:
# Join each comment onto its parent post (post id == comment parentid).
# Column names present in both frames get pandas' _x (post) / _y (comment) suffixes.
df_m = pd.merge(df_p, df_c, left_on='id', right_on='parentid')

In [21]:
# Post id that occurs most often among the joined comment rows. mode() returns
# a Series (several entries on a tie), so the first entry is used.
a = df_m['parentid'].mode()
most_commented_id = a.iloc[0]
# Look the title up from the computed id instead of a hard-coded one, so the
# answer always follows the data that was loaded.
title = df_m.title[df_m.id_x == most_commented_id]
print(f'Most commented post is: {title.iloc[0]}')

Most commented post is: Weekend Discussion Thread for the Weekend of August 13, 2021


## Keyword Extraction

In [22]:
# Small sample of posts (first 100 rows) for trying out the extraction scripts
posts = df_p.iloc[:100]
postids = posts['id'].values

In [23]:
# Small sample of comments (first 1000 rows) for trying out the extraction scripts
comments = df_c.iloc[:1000]

### Using RAKE

In [24]:
import nltk
# Fetch NLTK's stop-word corpus into the local nltk_data directory
# (reports "already up-to-date" when it is present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/asachan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nlp_rake import Rake

# RAKE keyword extractor shared by the cells below, configured with
# min_chars=2, max_words=5 and min_freq=1.
rake_settings = {'min_chars': 2, 'max_words': 5, 'min_freq': 1}
r = Rake(**rake_settings)

In [29]:
import re

# "$" followed by word characters, e.g. "$amc". Written as a raw string:
# "\$" and "\w" are invalid escape sequences in a normal string literal and
# produce warnings on recent Python versions.
cashtag_pattern = re.compile(r"\$(\w+)")

# Iterate over the title values themselves instead of posts.title[i]; the
# label-based lookup only works while the index happens to be 0..n-1.
for post_title in posts.title:
    if not isinstance(post_title, str):
        # Missing text is read by pandas as NaN (a float); nothing to extract.
        continue
    keywords = r.apply(post_title)
    if len(keywords) > 0:
        # Only the first keyword returned by RAKE is inspected.
        result = cashtag_pattern.search(keywords[0][0])
        if result:
            print(result.group(1))
    


geni
geni
geni
sklz
prog
sklz
amc
yndx
bbig
bb
sava


In [30]:
import re

# "$" followed by word characters, e.g. "$tsla". Written as a raw string:
# "\$" and "\w" are invalid escape sequences in a normal string literal and
# produce warnings on recent Python versions.
cashtag_pattern = re.compile(r"\$(\w+)")

# Iterate over the comment bodies themselves instead of comments.body[i]; the
# label-based lookup only works while the index happens to be 0..n-1.
for comment_body in comments.body:
    if not isinstance(comment_body, str):
        # Missing text is read by pandas as NaN (a float); nothing to extract.
        continue
    keywords = r.apply(comment_body)
    if len(keywords) > 0:
        # Only the first keyword returned by RAKE is inspected.
        result = cashtag_pattern.search(keywords[0][0])
        if result:
            print(result.group(1))
    


200
12
80
500k
20
clf
85
tsla
sklz
4c
150
30
80
50k
24b
11
shekelz


In [None]:
# for i in range(len(posts)):
    
#     keywords = r.apply(posts.title[i])
#     print(keywords)

### Using pytextrank and spacy

In [31]:
import pytextrank
import spacy

In [32]:
# Load spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")
# Append the "textrank" component (made available by importing pytextrank);
# the following cell reads its ranked phrases from doc._.phrases.
nlp.add_pipe("textrank")


<pytextrank.base.BaseTextRankFactory at 0x7f83dd994760>

In [33]:
# Print every TextRank phrase found in each post title, followed by its rank,
# its count and the text chunks it was built from.
# The titles are iterated directly instead of via posts.title[i], which only
# works while the index happens to be 0..n-1.
for post_title in posts.title:
    if not isinstance(post_title, str):
        # Missing titles are read by pandas as NaN (a float); skip them.
        continue
    doc = nlp(post_title)
    for phrase in doc._.phrases:
        print(phrase.text)
        print(phrase.rank, phrase.count)
        print(phrase.chunks)

BABA
0.2961745101860818 1
[BABA]
Meme Stock
0.33755886011192726 1
[Meme Stock]
Wall Street Journal
0.25576983140197523 2
[Wall Street Journal, Wall Street Journal]
its a Meme Stock
0.12992646583530962 1
[its a Meme Stock]
BABA
0.5 1
[BABA]
FIVN puts
0.33459732664029557 1
[FIVN puts]
7.K
0.17610381003176306 1
[7.K]
Thanks
0.1554531024627926 1
[Thanks]
1.4K
0.07180071244260462 1
[1.4K]
1.4
0.0 1
[1.4]
overnight
0.0 1
[overnight]
FIVN puts
0.33459732664029557 1
[FIVN puts]
Thanks
0.1554531024627926 1
[Thanks]
7.1K
0.08133886570172572 1
[7.1K]
1.4
0.0 1
[1.4]
overnight
0.0 1
[overnight]
today
0.23854050690690223 1
[today]
the first stock symbol
0.1542371933953662 1
[the first stock symbol]
first
0.1341387595867109 1
[first]
the market
0.07018637238888584 1
[the market]
my account
0.06195603916492002 1
[my account]
It
0.0 1
[It]
anyone
0.0 1
[anyone]
what
0.0 1
[what]
YOLO
0.11705139662830938 1
[YOLO]
a retard
0.10891410804690982 1
[a retard]
260K position
0.09880270937004297 1
[260K positi

In [None]:
# Print the named entities (text and label) spaCy finds in each post title.
# The loop previously read from `df1`, which is not defined anywhere in the
# visible notebook and so raised NameError on a fresh kernel. It now uses
# `posts`, the same sample frame the sibling keyword cells iterate over.
for post_title in posts.title:
    if not isinstance(post_title, str):
        # Missing titles are read by pandas as NaN (a float); skip them.
        continue
    doc = nlp(post_title)
    for ent in doc.ents:
        print(ent.text, ent.label_)

In [None]:
# Preview the posts sample used by the keyword-extraction cells
posts.head()

## Graph Section

#### The Neo4j project and database were created manually

In [4]:
# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks reproduce the previous
# hard-coded local values, so an unconfigured environment behaves as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:11003")
NEO4J_USER = os.environ.get("NEO4J_USER", "admin")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "admin")
g = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [16]:
# Build one property dict per post from the full posts frame.
# The score is stored as its string representation.
posts = df_p
data = [
    {'id': post_id, 'title': post_title, 'score': str(post_score)}
    for post_id, post_title, post_score in zip(posts.id, posts.title, posts.score)
]

In [17]:
# Create Post nodes: one node labelled "Post" per dict in `data`
# NOTE(review): presumably re-running this cell adds a second copy of every node — confirm before re-executing
create_nodes(g.auto(), data, labels={"Post"})

In [19]:
# Build one property dict per comment from the full comments frame.
# The comment body is stored under the 'title' key and the score is stored as
# its string representation.
comments = df_c
data = [
    {'id': comment_id, 'title': comment_body, 'score': str(comment_score), 'parentid': parent_id}
    for comment_id, comment_body, comment_score, parent_id
    in zip(comments.id, comments.body, comments.score, comments.parentid)
]

In [None]:
#Create comments nodes in batches of `batch_size`

from itertools import islice

batch_size = 10000
stream = iter(data)
while True:
    # Materialise the next chunk as a list. An islice object is always truthy,
    # even when the underlying iterator is exhausted, so testing it directly
    # never ended the loop.
    batch = list(islice(stream, batch_size))
    if not batch:
        break
    create_nodes(g.auto(), batch, labels={"Comment"})


In [None]:
# Create data for relationships


In [None]:
# WARNING: removes every node and relationship from the connected graph.
# NOTE(review): this cell sits after the node-creation cells, so a top-to-bottom
# run would wipe what was just loaded — confirm the intended placement.
g.delete_all()

In [5]:
# Number of nodes labelled "Post" currently in the graph
g.nodes.match("Post").count()

25751

In [6]:
# Number of nodes labelled "Comment" currently in the graph
g.nodes.match("Comment").count()

1001160