In [1]:
import datetime
from collections import deque

import pandas as pd
import praw

from keys import client_id, client_secret

### Collecting the posts for our topic

Initializing a Reddit Instance

In [2]:
# Reddit API client, built from the app credentials imported from the local `keys` module
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent='android:my_app:v1 (by /u/CosmicGoose98)',
)

Calling the API and building a dataframe from it

In [3]:
bitcoin = reddit.subreddit('Bitcoin')

# Gather the subreddit's top posts (limit=30) with their title, URL, body, upvotes and
# creation time. 'Key' is the post's position in the listing; it links each post to the
# comments we collect later.
post_rows = [
    [post.title, "https://www.reddit.com" + post.permalink, post.selftext,
     post.score, post.created_utc, index]
    for index, post in enumerate(bitcoin.top(limit=30))
]

# Converting into DataFrame
posts = pd.DataFrame(post_rows, columns=['Title', 'URL', 'Body', 'Upvotes', 'Time', 'Key'])

# created_utc is seconds since the Unix epoch. Convert the whole column in one vectorised
# call; the result is expressed in UTC, so it does not depend on the timezone of the
# machine running the notebook (datetime.fromtimestamp would yield local time).
posts['Time'] = pd.to_datetime(posts['Time'], unit='s')

# The first post in the listing is a sticky, so we drop it by position
# NOTE(review): this relies on the sticky always being returned first - confirm on re-run
posts = posts.iloc[1:]

In [4]:
# Preview the first three rows of the posts table
posts.head(3)

Unnamed: 0,Title,URL,Body,Upvotes,Time,Key
1,The last 3 months in 47 seconds.,https://www.reddit.com/r/Bitcoin/comments/7v43...,,48468,2018-02-04 15:42:03,1
2,It's over 9000!!!,https://www.reddit.com/r/Bitcoin/comments/7fkq...,,42441,2017-11-26 17:55:02,2
3,Everyone who's trading BTC right now,https://www.reddit.com/r/Bitcoin/comments/7olr...,,42052,2018-01-07 09:38:56,3


In [5]:
# (rows, columns) of the posts table
posts.shape

(29, 6)

### Collecting the comments for each of our posts

We want to get all the comments for the posts we collected

In [6]:
def collect_replies(key, url):
    '''
    Collect every comment of one Reddit submission into a DataFrame.

    params key: identifier of the parent post; copied into the 'Key' column of every row
    params url: URL of the submission whose comments are fetched
    Returns a pandas DataFrame with the columns 'Reply', 'Upvote', 'Time' and 'Key', where
    each row represents an individual comment. 'Time' holds the raw created_utc value.
    Top-level comments come first, followed by their replies level by level.
    '''
    submission = reddit.submission(url=url)
    # limit=None asks PRAW to resolve every "load more comments" placeholder, so the
    # traversal below only meets real comments (which have body/score/replies)
    submission.comments.replace_more(limit=None)

    # Breadth-first walk over the comment tree. A deque pops from the left in O(1),
    # whereas list.pop(0) shifts the whole remaining list on every call.
    comment_queue = deque(submission.comments)
    rows = []
    while comment_queue:
        comment = comment_queue.popleft()
        rows.append((comment.body, comment.score, comment.created_utc, key))
        comment_queue.extend(comment.replies)

    return pd.DataFrame(rows, columns=['Reply', 'Upvote', 'Time', 'Key'])

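Now that the function has been defined, we can build our DataFrame of comments by calling it once per post and concatenating the results. A list comprehension keeps this compact and is slightly faster than an explicit loop, though most of the run time is likely spent waiting on the Reddit API.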

In [7]:
#Let us first generate a list of (key, url) tuples, one per row of posts - the first value
#of each tuple is the key, and the second value is the url
keys = posts.Key.tolist()
urls = posts.URL.tolist()
tupules = list(zip(keys, urls))

#Now we generate our comments dataframe: one DataFrame per post via a list comprehension,
#stacked into a single frame. ignore_index=True gives the combined frame one unique
#0..n-1 index instead of repeating each per-post index.
comments = pd.concat(
    [collect_replies(key, url) for key, url in tupules],
    ignore_index=True,
)

In [8]:
#Converting the epoch-seconds timestamp into a pandas datetime, expressed in UTC so the
#result does not depend on the local timezone of the machine running the notebook.
#The conversion is vectorised, and re-running the cell is harmless: a column that is
#already datetime is returned unchanged.
comments['Time'] = pd.to_datetime(comments['Time'], unit='s')

In [9]:
# Preview the first 20 rows of the comments table
comments.head(20)

Unnamed: 0,Reply,Upvote,Time,Key
0,"Brilliant.\n\nEdit: \n\nWell, as I’m here I ma...",4543,2018-02-04 15:49:03,1
1,"Awesome, except all the other coins fell in af...",744,2018-02-04 17:17:05,1
2,Best Simpsons ever,858,2018-02-04 16:05:41,1
3,"Simpsons predicted Bitcoin, again",369,2018-02-04 16:42:58,1
4,The things that fall on Homers head are 1) Alt...,971,2018-02-04 17:39:30,1
5,Remember Bitcoin? It’s back! In POG form!,139,2018-02-04 19:01:52,1
6,This is gold.,117,2018-02-04 16:31:44,1
7,proof that /r/HighQualityGifs could be funny ...,1826,2018-02-04 18:14:30,1
8,Is this why my friends on social media stopped...,33,2018-02-04 21:14:00,1
9,Bitconeeeeeeeeeeeeeeeect👍,27,2018-02-04 18:53:36,1


In [10]:
# (rows, columns) of the comments table
comments.shape

(33101, 4)

In [13]:
# Save the comments table to disk; index=False leaves the row index out of the CSV
comments.to_csv('Comments_.csv', index=False)

In [14]:
# Save the posts table to disk; index=False leaves the row index out of the CSV
posts.to_csv('Posts.csv', index=False)