In [45]:
import datetime
import praw
import pandas as pd

from keys import client_id, secret

from tqdm.auto import tqdm

### Collecting posts

In [46]:
reddit = praw.Reddit(client_id=client_id, client_secret=secret, user_agent='postscrape by /u/kejimuna')

In [51]:
nintendo = reddit.subreddit('nintendoswitch')

posts = []
for index, post in enumerate(nintendo.top(limit=40)):
    posts.append([post.title, "https://reddit.com" + post.permalink, post.selftext, post.score, post.created_utc, index, post.num_comments])

# dataframe for posts
posts = pd.DataFrame(posts, columns=['Title', 'URL', 'Body', 'Upvotes', 'Time', 'Key', 'TotalComments'])

# changing from utc time to standard timestamp
posts.Time = posts.Time.apply(lambda x: pd.to_datetime(datetime.datetime.fromtimestamp(x)))


In [52]:
sum(posts.TotalComments)

102812

In [53]:
posts.head()

Unnamed: 0,Title,URL,Body,Upvotes,Time,Key,TotalComments
0,My kid was so adamant I put in the lunchables ...,https://reddit.com/r/NintendoSwitch/comments/d...,,99966,2019-10-18 05:08:57,0,2313
1,Nintendo delayed Animal Crossing because it di...,https://reddit.com/r/NintendoSwitch/comments/b...,,76590,2019-06-13 01:31:03,1,2353
2,Join the Battle for Net Neutrality! Net neutra...,https://reddit.com/r/NintendoSwitch/comments/7...,,69715,2017-11-21 19:21:08,2,1247
3,NES Switch dock (build pics in comments),https://reddit.com/r/NintendoSwitch/comments/g...,,62776,2020-05-26 06:21:24,3,988
4,Nintendo is really missing a huge opportunity ...,https://reddit.com/r/NintendoSwitch/comments/i...,GameCube was the second lowest selling console...,61594,2020-08-03 00:16:38,4,4896


In [54]:
posts = posts.drop(posts.index[2])
posts.head()

Unnamed: 0,Title,URL,Body,Upvotes,Time,Key,TotalComments
0,My kid was so adamant I put in the lunchables ...,https://reddit.com/r/NintendoSwitch/comments/d...,,99966,2019-10-18 05:08:57,0,2313
1,Nintendo delayed Animal Crossing because it di...,https://reddit.com/r/NintendoSwitch/comments/b...,,76590,2019-06-13 01:31:03,1,2353
3,NES Switch dock (build pics in comments),https://reddit.com/r/NintendoSwitch/comments/g...,,62776,2020-05-26 06:21:24,3,988
4,Nintendo is really missing a huge opportunity ...,https://reddit.com/r/NintendoSwitch/comments/i...,GameCube was the second lowest selling console...,61594,2020-08-03 00:16:38,4,4896
5,Super Smash Bros. Ultimate brings back every c...,https://reddit.com/r/NintendoSwitch/comments/8...,,60855,2018-06-13 04:23:27,5,4266


In [55]:
posts.shape

(39, 7)

**Collecting comments on each post**

In [56]:
def collect_replies(key, url):
    '''
    Args:
        key: the key of the post
        url (str): the url of the post

    Returns:
        pd.dataframe: comments
    '''

    submission = reddit.submission(url=url)
    submission.comments.replace_more(limit=None)
    comment_queue = submission.comments[:]

    table = {'Reply':[], 'Upvote':[], 'Time':[], 'Key':[]}

    while comment_queue:
        comment = comment_queue.pop(0)
        table['Reply'].append(comment.body)
        table['Time'].append(comment.created_utc)
        table['Upvote'].append(comment.score)
        table['Key'].append(key)
        comment_queue.extend(comment.replies)

    return pd.DataFrame.from_dict(table)

In [57]:
keys = posts.Key.tolist()
urls = posts.URL.tolist()
tuples = list(zip(keys, urls))

comments = pd.concat([collect_replies(x[0], x[1]) for x in tuples])

In [67]:
comments.Time = comments.Time.apply(lambda x: pd.to_datetime(datetime.datetime.fromtimestamp(x)))

TypeError: an integer is required (got type Timestamp)

In [65]:
comments.shape

(101826, 4)

In [68]:
comments.to_csv('./data/raw/Comments.csv', index=False)

In [69]:
posts.to_csv('./data/raw/Posts.csv', index=False)