In [46]:
import praw
from psaw import PushshiftAPI

import datetime as dt
from tqdm import tqdm

import pandas as pd
import numpy as np
from IPython.display import display, Markdown, Latex

from os import listdir
from os.path import isfile, join
import csv

from tqdm.notebook import tqdm,tnrange
import pickle

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS

In [47]:
import import_ipynb
from reddit_access import client_id, client_secret, password, user_agent, username

#PSAW TO RETRIEVE IDs OF SUBMISSIONS
api = PushshiftAPI()

#PRAW TO RETRIEVE ACTUAL CONVERSATIONS
reddit = praw.Reddit(client_id=client_id, 
                     client_secret=client_secret,
                     password=password, 
                     user_agent=user_agent,
                     username=username)

reddit.user.me()

Redditor(name='micheledinelli')

In [49]:
# DEFINE YEAR-MONTH RANGES TO LOOK UP 
today = dt.datetime.today()

today_month = today.month
today_year = today.year

start_year = 2019
start_month = 6
end_year = 2020
dates = []
while start_year <= end_year:
    
    if start_year == today_year and start_month > today_month:
        break
        
    dates.append((start_year, start_month))
    
    start_month += 1
    if start_month > 12:
        start_year += 1
        start_month = 1
        
dates.pop();

[(2019, 6),
 (2019, 7),
 (2019, 8),
 (2019, 9),
 (2019, 10),
 (2019, 11),
 (2019, 12),
 (2020, 1),
 (2020, 2),
 (2020, 3),
 (2020, 4),
 (2020, 5),
 (2020, 6),
 (2020, 7),
 (2020, 8),
 (2020, 9),
 (2020, 10),
 (2020, 11)]

In [50]:
def retrieve_ids_in_range(year, month, SR, verbose=True):
    print("Scraping", "r/" + SR, "for:", year, "-", month)
    
    # Seconds in a day/hour
    DAY = 60 * 60 * 24
    HOUR = 60 * 60
    
    start = int(dt.datetime(year, month, 1).timestamp())
    if month < 12:
        end = int(dt.datetime(year, month + 1, 1).timestamp())
    else: 
        end = int(dt.datetime(year + 1, 1, 1).timestamp())
    
    start_epoch = start
    end_epoch = start + DAY 
    
    # Search IDs on a weekly basis
    ids = []
    while end_epoch <= end:
        res = list(api.search_submissions(
                            after = start_epoch,
                            before= end_epoch + HOUR, 
                            subreddit = SR,
                            limit = 100))
        
        print(dt.datetime.fromtimestamp(start_epoch))
        print(dt.datetime.fromtimestamp(end_epoch), '\n')
       
        if verbose:
            print("FROM: ", np.intc(start_epoch).astype("datetime64[s]"), 
                  "TO:", np.intc(end_epoch).astype("datetime64[s]"))        
            print("FIRST: ", np.intc(res[-1].created_utc).astype("datetime64[s]"), 
                  "LAST: ", np.intc(res[0].created_utc).astype("datetime64[s]"))        
            print("number of posts: ", len(res))
        
        for r in res:
            ids.append(r.id)
        
        start_epoch = end_epoch
        end_epoch = start_epoch + DAY
    
    print("SAVING...", SR + "_" + str(year) + "-" + str(month) + "_submissions.csv")
    print(len(ids), " POSTS", "\n")
    pd.DataFrame(ids).drop_duplicates().to_csv("./submissions_previous/" + SR + "_" + str(year) + "-" + str(month) + "_submissions.csv")
    
    return ids

In [51]:
subreddits = ['CrohnsDisease', 'UlcerativeColitis', 'IBD', 'ibs']

In [None]:
for subreddit in subreddits:
    for year, month in dates:
        retrieve_ids_in_range(year = year, month = month, SR = subreddit, verbose = False)

In [52]:
mypath = "./submissions_previous/"
files = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]

merged_file = []
for filename in files:
    with open(filename, 'r') as csv_file:
        file = csv.reader(csv_file)
        for row in file:
            merged_file.append(row)
            
merged_df = pd.DataFrame(merged_file, columns=['index', 'id'])
merged_df.drop(columns=['index'], inplace=True)
merged_df.to_csv('all_ids_previous.csv')

In [53]:
import os 

all_ids = pd.read_csv('all_ids_previous.csv')
all_ids.drop(columns=['Unnamed: 0'], inplace=True)

y = []
for subreddit in subreddits:
    x = []
    for fname in os.listdir('./submissions_previous/'):
        if subreddit in fname:
            x.append(pd.read_csv(join('./submissions_previous/', fname),index_col=0))
    
    x = pd.concat(x)
    x['subreddit'] = subreddit
    y.append(x)

y = pd.concat(y)
y.to_csv('all_ids_withsr_previous.csv')

In [56]:
all_ids_withsr = pd.read_csv('all_ids_withsr_previous.csv')
all_ids_withsr.drop(columns = ['Unnamed: 0'], inplace = True)
all_ids_withsr.drop_duplicates(inplace = True)
all_ids_withsr.groupby('subreddit').count()

Unnamed: 0_level_0,0
subreddit,Unnamed: 1_level_1
CrohnsDisease,13234
IBD,1609
UlcerativeColitis,6974
ibs,18906


In [57]:
ibd_ids = all_ids_withsr[all_ids_withsr['subreddit'] == 'IBD'].reset_index()
ibd_ids.rename(columns = {'0': 'id'}, inplace = True)

ibd_posts = []
for id_ in tqdm (ibd_ids['id'], desc="Loading from subreddit IBD"):
    post = reddit.submission(id_)
    ibd_posts.append([post.title, post.author, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
    
df_ibd = pd.DataFrame(ibd_posts, columns = ['title', 'author', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
df_ibd['created'] = df_ibd['created'].apply(lambda x: dt.datetime.fromtimestamp(x))

df_ibd.to_csv('./submissions_scraped_previous/ibd_submissions.csv')
df_ibd.head(10)

Loading from subreddit IBD:   0%|          | 0/1609 [00:00<?, ?it/s]

Unnamed: 0,title,author,score,id,subreddit,url,num_comments,body,created
0,Coronoavirus and traveling with Humira?,caroljohn15,16,fc00oo,IBD,https://www.reddit.com/r/IBD/comments/fc00oo/c...,11,So I only recently started Humira a couple wee...,2020-03-01 21:24:28
1,Coronovirus: What Is Your Plan?,StupidLullabies,1,fbzg4q,IBD,https://www.reddit.com/r/IBD/comments/fbzg4q/c...,0,[removed],2020-03-01 20:47:33
2,How long did it take for you to get diagnosed ...,darknesskiss,6,fbtjvx,IBD,https://www.reddit.com/r/IBD/comments/fbtjvx/h...,21,Sorry for the novel but I'm frustrated and suf...,2020-03-01 13:44:23
3,vu,matty62916,1,fbqgqu,IBD,https://www.reddit.com/r/IBD/comments/fbqgqu/vu/,0,[removed],2020-03-01 08:00:43
4,Would love tips for my colonoscopy,RaeNezL,5,fd2540,IBD,https://www.reddit.com/r/IBD/comments/fd2540/w...,23,"Hi all! So I finally got to the GI today and, ...",2020-03-03 22:47:24
5,Symptoms management!,ZuzuTheZooz,1,fd0i4q,IBD,https://www.reddit.com/r/IBD/comments/fd0i4q/s...,1,[removed],2020-03-03 21:02:54
6,Inflammatory Bowel Disease(IBD)- Ulcerative co...,drravibhatt80,1,fdhcd1,IBD,https://www.reddit.com/r/IBD/comments/fdhcd1/i...,0,[removed],2020-03-04 19:28:40
7,THE COLITIS BLOG: www.TheColitisBlog.com,ibdlife,1,fdhbll,IBD,https://www.reddit.com/r/IBD/comments/fdhbll/t...,0,[removed],2020-03-04 19:27:19
8,Inflammatory Bowel Disease (IBD)- Ulcerative c...,drravibhatt80,1,fdy0rk,IBD,https://www.drravibhatt.com/inflammatory-bowel...,0,,2020-03-05 17:46:14
9,Fatigue after eating sugar,qwertyzxcvbqwerty,19,fdnlz3,IBD,https://www.reddit.com/r/IBD/comments/fdnlz3/f...,6,Does anyone else get intense fatigue the day a...,2020-03-05 02:34:25


In [60]:
ulc_col_ids = all_ids_withsr[all_ids_withsr['subreddit'] == 'UlcerativeColitis'].reset_index()
ulc_col_ids.rename(columns = {'0': 'id'}, inplace = True)

ulc_col_posts = []
for id_ in tqdm (ulc_col_ids['id'], desc="Loading from subreddit Ulcerative Colitis"):
    post = reddit.submission(id_)
    ulc_col_posts.append([post.title, post.author ,post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
    
df_ulc_col = pd.DataFrame(ulc_col_posts, columns = ['title', 'author', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
df_ulc_col['created'] = df_ulc_col['created'].apply(lambda x: dt.datetime.fromtimestamp(x))

df_ulc_col.to_csv('./submissions_scraped_previous/ulc_col_submissions.csv')
df_ulc_col.head(10)

Loading from subreddit Ulcerative Colitis:   0%|          | 0/6974 [00:00<?, ?it/s]

Unnamed: 0,title,author,score,id,subreddit,url,num_comments,body,created
0,Brown rice is a trigger but white rice isn't?,ulcerativebolitis,2,ex9ugj,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,8,"I had never had brown rice before, and recentl...",2020-02-01 18:52:08
1,My GI would disagree,breathelovesit,78,ex8nfe,UlcerativeColitis,https://i.redd.it/3v13yvdqsae41.jpg,0,,2020-02-01 17:35:07
2,Starting Remicade soon,WowItsCharles,4,ex5d85,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,12,Hi everyone. Just looking for advice or though...,2020-02-01 13:19:41
3,Anybody taking entyvio every 4 weeks?,,4,ex3msg,UlcerativeColitis,,8,[deleted],2020-02-01 10:21:03
4,Doctor can't make a definitive diagnosis. Anyo...,,6,ex2rle,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,3,Quick backstory: out of nowhere I started havi...,2020-02-01 08:45:33
5,A gem my gf found the other day:90's satire Ja...,lyydia76,12,ex1cqy,UlcerativeColitis,https://youtu.be/CKjaFG4YN6g,2,,2020-02-01 06:18:38
6,Do probiotics help?,Mazdino,1,ex05ep,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,5,I'm in a semi-remission state meaning my sever...,2020-02-01 04:32:34
7,Is UC a underlying illness?,,0,ewyy3v,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,5,I live less than 5mins away from a new outbrea...,2020-02-01 02:54:00
8,How long should I give Entyvio to work?????,aridgway,6,ewy8y8,UlcerativeColitis,https://www.reddit.com/r/UlcerativeColitis/com...,10,How long should I give a Entyvio to see if it’...,2020-02-01 01:58:54
9,When you feel the cramps kicking in,The_Noodle_Dragon,71,ewxxj0,UlcerativeColitis,https://i.redd.it/v4w04mxqj7e41.png,6,,2020-02-01 01:34:30


In [61]:
crohns_disease_ids = all_ids_withsr[all_ids_withsr['subreddit'] == 'CrohnsDisease'].reset_index()
crohns_disease_ids.rename(columns = {'0': 'id'}, inplace = True)

crohns_disease_posts = []
for id_ in tqdm (crohns_disease_ids['id'], desc="Loading from subreddit CrohnsDisease"):
    post = reddit.submission(id_)
    crohns_disease_posts.append([post.title, post.author, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
    
df_crohns_disease = pd.DataFrame(crohns_disease_posts, columns = ['title', 'author', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
df_crohns_disease['created'] = df_crohns_disease['created'].apply(lambda x: dt.datetime.fromtimestamp(x))

df_crohns_disease.to_csv('./submissions_scraped_previous/crohns_disease_submissions.csv')
df_crohns_disease.head(10)

Loading from subreddit CrohnsDisease:   0%|          | 0/13234 [00:00<?, ?it/s]

Unnamed: 0,title,author,score,id,subreddit,url,num_comments,body,created
0,Recently diagnosed. What are the key problems ...,,4,gusifm,CrohnsDisease,,13,[deleted],2020-06-01 22:39:00
1,Anyone else think the can actively feel their ...,ApexIsGangster,43,guqn9c,CrohnsDisease,https://www.reddit.com/r/CrohnsDisease/comment...,7,I love crawling into my bed after an hour of c...,2020-06-01 21:02:48
2,Wish this Disease would have killed me by now 🤞,Chipochapi,109,guqd0m,CrohnsDisease,https://www.reddit.com/r/CrohnsDisease/comment...,47,It’s taken everything my promising Sporting ca...,2020-06-01 20:48:27
3,Any results with Imuran?,burntsquid,1,guqboc,CrohnsDisease,https://www.reddit.com/r/CrohnsDisease/comment...,4,Hey guys I’m a 25(M) with Crohns and I’ll be s...,2020-06-01 20:46:30
4,"Normal bloodtest, worsening symptoms [vent]",ShaariAmairi,7,guq7i5,CrohnsDisease,https://www.reddit.com/r/CrohnsDisease/comment...,5,I was called minuted ago to be told my results...,2020-06-01 20:40:39
5,Burning feet during flare ups,theindian_90sguy,3,gup2qp,CrohnsDisease,https://www.reddit.com/r/CrohnsDisease/comment...,3,Randomly the underside of my feet get excrucia...,2020-06-01 19:41:47
6,Volumetric Analysis of Small Bowel Motility in...,Robert_Larsson,2,gumfw2,CrohnsDisease,https://pubmed.ncbi.nlm.nih.gov/32469111/,1,,2020-06-01 17:17:25
7,"need some support, feeling alienated",,23,gul9xw,CrohnsDisease,,29,[deleted],2020-06-01 16:08:38
8,Heart Disease Awareness Survivor,,1,gul8b3,CrohnsDisease,,1,[deleted],2020-06-01 16:05:54
9,Getting ready for my MRE in about 30 mins. Thi...,,123,gukm3m,CrohnsDisease,https://i.redd.it/9c7ds6isva251.jpg,32,,2020-06-01 15:26:28


In [62]:
ibs_ids = all_ids_withsr[all_ids_withsr['subreddit'] == 'ibs'].reset_index()
ibs_ids.rename(columns = {'0': 'id'}, inplace = True)

ibs_posts = []
for id_ in tqdm (ibs_ids['id'], desc="Loading from subreddit ibs"):
    post = reddit.submission(id_)
    ibs_posts.append([post.title, post.author, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
    
    
df_ibs = pd.DataFrame(ibs_posts, columns = ['title', 'author', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])
df_ibs['created'] = df_ibs['created'].apply(lambda x: dt.datetime.fromtimestamp(x))

df_ibs.to_csv('./submissions_scraped_previous/ibs_submissions.csv')
df_ibs.head(10)

Loading from subreddit ibs:   0%|          | 0/18906 [00:00<?, ?it/s]

Unnamed: 0,title,author,score,id,subreddit,url,num_comments,body,created
0,Does anyone else get pain so bad your brain go...,,6,i215vo,ibs,https://www.reddit.com/r/ibs/comments/i215vo/d...,4,"Whenever I eat anything remotely triggering, w...",2020-08-02 00:28:11
1,[deleted by user],,10,i208uy,ibs,,5,[removed],2020-08-01 23:31:19
2,Help what’s happening?,Caleabfronk,5,i202m0,ibs,https://www.reddit.com/r/ibs/comments/i202m0/h...,6,So apparently I was diagnosed with IBS 2 month...,2020-08-01 23:20:44
3,Went from having large normal shaped bms to ra...,Nirvana12345678,3,i1zqx6,ibs,https://www.reddit.com/r/ibs/comments/i1zqx6/w...,6,"I'm a 34 year old male, about 220 lbs, I went ...",2020-08-01 23:00:58
4,Dark patch in stool. Any ideas? 21M,,3,i1zkbn,ibs,,1,[deleted],2020-08-01 22:50:13
5,Dark patch in stool. Any ideas why?,,1,i1z8ht,ibs,,0,[deleted],2020-08-01 22:32:52
6,Gassiness?,holographicum,2,i1ytbs,ibs,https://www.reddit.com/r/ibs/comments/i1ytbs/g...,4,"Hi - I'm fairly new to this community, I just ...",2020-08-01 22:12:41
7,"If my abdomen feels inflamed, is it a digestiv...",fadedguy41,1,i1ylm5,ibs,https://www.reddit.com/r/ibs/comments/i1ylm5/i...,2,There are many times where my entire abdomen f...,2020-08-01 22:03:03
8,Post Gallbladder Life,BourbonSipping,2,i1y8al,ibs,https://www.reddit.com/r/ibs/comments/i1y8al/p...,2,So last week things came to a head and I was r...,2020-08-01 21:40:27
9,This has definitely been a struggle lately,SpookyKnees,14,i1xshw,ibs,https://v.redd.it/npksddhgxfe51,2,,2020-08-01 21:14:32
