# ASOIAF/GoT Reddit Posts - Pt. 1

## Data Collection

###### Import Libraries

In [1]:
import requests
import time
import pandas as pd
import json

###### Import Data

In [2]:
## creating a user-agent to access Reddit's content
headers = {'User-agent': 'JObot'}

In [15]:
## setting our subreddit links to loop through
subreddits = ['r/asoiaf/new/.json', 'r/gameofthrones/top/.json?t=all']

### Day 1 scraping

In [73]:
## setting empty posts list to collect our json 'data'
posts = []

## looping through the two different subreddits
for i in range(len(subreddits)):
    
    ## setting our after to None to start at the beginning of the thread
    after = None
    
    ## looping through 20 times and collecting 25 posts per loop
    for posts_25 in range(20):
        
        print(posts_25)  ## printing the computed iteration
        
        if after == None:  ## if statement to check value of 'after'
            params = {}    ## setting params to empty dictionary if 'None'
        
        else:
            params = {'after': after}  ## setting our params to the after at the end of each series of posts
        
        ## creating the url for given subreddit by attaching the looped through subreddit
        url = 'https://www.reddit.com/' + subreddits[i]  
        
        ## using our get request based on url and current params
        res = requests.get(url, params = params, headers=headers)
        
        ## if statement run, if our status code is 200 (good)
        if res.status_code == 200:
            asoiaf = res.json() ## gathering/assigning json info from our get request

            ## extending our list of posts from data in 'children'
            posts.extend(asoiaf['data']['children'])

            ## setting our after value to
            after = asoiaf['data']['after']

        ## if error with status code, we will break our loop and print error code
        else:
            print(res.status_code)  
            break

        time.sleep(1) ## slowing our loop by one second for each iteration

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [75]:
## showing that we have 1004 unique posts
len(set([p['data']['name'] for p in posts]))

1004

In [100]:
## looking at our keys, so that we can isolate our text from the post
posts[0]['data'].keys()

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'banned_by', 'author_flair_type', 'domain', 'allow_live_comments', 'selftext_html', 'likes', 'suggested_sort', 'banned_at_utc', 'view_count', 'archived', 'no_follow', 'is_crosspostable', 

In [150]:
## showing our first post in posts
posts[0]['data']

{'approved_at_utc': None,
 'subreddit': 'asoiaf',
 'selftext': "Welcome to the Weekly Q &amp; A! Feel free to ask any questions you may have about the world of ASOIAF. No need to be bashful. Book and show questions are welcome; please say in your question if you would prefer to focus on the BOOKS, the SHOW, or BOTH.  And if you think you've got an answer to someone's question, feel free to lend them a hand!",
 'author_fullname': 't2_6l4z3',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': '(Spoilers Main) Weekly Q and A',
 'link_flair_richtext': [{'e': 'text', 't': 'MAIN'}],
 'subreddit_name_prefixed': 'r/asoiaf',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': 'main',
 'downs': 0,
 'thumbnail_height': None,
 'hide_score': False,
 'name': 't3_c8man6',
 'quarantine': False,
 'link_flair_text_color': 'light',
 'author_flair_background_color': '',
 'subreddit_type': 'public',
 'ups': 11,
 'total_awards_received': 0,
 'media_embed': {},
 'thumbn

In [131]:
subreddit_lists = []
for post in range(len(posts)):
    post_dict = {}
    post_dict['subreddit'] = posts[post]['data']['subreddit']
    post_dict['post_text'] = posts[post]['data']['selftext']
    
    if post_dict['post_text'] != '':
        subreddit_lists.append(post_dict)
    else:
        pass

In [138]:
subreddit_lists[-1]

{'subreddit': 'gameofthrones',
 'post_text': '[SPOILERS] I just remembered there was a scene in seson 7 where john told all the lords to train there women in combat. Because they cant win if only half the population is fighting... where were these women on the battlefield? If i remember well they went to hide in the cripts!!! How could they forget about this??? they the ones writing this line in the first place.'}

In [134]:
df = pd.DataFrame(subreddit_lists)

In [135]:
df.shape

(620, 2)

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620 entries, 0 to 619
Data columns (total 2 columns):
post_text    620 non-null object
subreddit    620 non-null object
dtypes: object(2)
memory usage: 9.8+ KB


In [166]:
df.to_csv('day_1_reddit_scrapping.csv', index=False)

In [167]:
df.to_csv('day_1_reddit_scraping_indexed.csv')

### Day 2 scraping

In [175]:
posts = []
afters = []
for i in range(len(subreddits)):
    after = None
    for posts_25 in range(25):
        print(posts_25)
        if after == None:
            params = {}
        else:
            params = {'after': after}
        url = 'https://www.reddit.com/' + subreddits[i]
        res = requests.get(url, params = params, headers=headers)
        if res.status_code == 200:
            asoiaf = res.json()
            
            ## append will do a list of list, but extend will add to the list
            posts.extend(asoiaf['data']['children'])

            ## this is where we overwrite it, we start at zero on the first hit
            ## then we will set it to a value after
            ## now we can hit this API with this 'after' paramater and get 25 more hits every time
            after = asoiaf['data']['after']

        else:
            print(res.status_code)
            break

        ## slow your loop down intentionally, so you're not hitting the server hard
        time.sleep(1) ## slows the loop down by one second for each iteration
        afters.append(after) ## collecting our after values to start each successive loop with

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [176]:
len(posts)

1244

In [187]:
recent_afters = [afters[24], afters[49]]

In [188]:
recent_afters

['t3_c1bzp6', 't3_c8205r']

In [16]:
def sub_collection(subreddit_list):
    posts = []
    for i in range(len(subreddit_list)):
        after = None
        for posts_25 in range(40):
            if after == None:
                params = {}
            else:
                params = {'after': after}
            url = 'https://www.reddit.com/' + subreddit_list[i] 
            res = requests.get(url, params = params, headers=headers)
            if res.status_code == 200:
                asoiaf = res.json()

                ## append will do a list of list, but extend will add to the list
                posts.extend(asoiaf['data']['children'])

                ## this is where we overwrite it, we start at zero on the first hit
                ## then we will set it to a value after
                ## now we can hit this API with this 'after' paramater and get 25 more hits every time
                after = asoiaf['data']['after']

            else:
                print(res.status_code)
                break

            ## slow your loop down intentionally, so you're not hitting the server hard
            time.sleep(2) ## slows the loop down by two seconds for each iteration            
    
     ## creating a function that will separate the text and the respective subreddit
#     def text_parsing(posts):
    subreddit_lists = []
    for post in range(len(posts)):
        post_dict = {}
        post_dict['subreddit'] = posts[post]['data']['subreddit']
        post_dict['post_text'] = posts[post]['data']['selftext']

        if post_dict['post_text'] != '':
            subreddit_lists.append(post_dict)
        else:
            pass

    return subreddit_lists

In [195]:
new_posts, new_afters = sub_collection(subreddits, recent_afters)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [199]:
## afters for #3
new_afters

['t3_c1ohlg', 't3_c7y596']

In [212]:
## creating a function that will separate the text and the respective subreddit
def text_parsing(posts):
    subreddit_lists = []
    for post in range(len(posts)):
        post_dict = {}
        post_dict['subreddit'] = posts[post]['data']['subreddit']
        post_dict['post_text'] = posts[post]['data']['selftext']

        if post_dict['post_text'] != '':
            subreddit_lists.append(post_dict)
        else:
            pass
    return subreddit_lists

In [216]:
posts_2 = text_parsing(posts)

In [219]:
## showing the length of collected posts
len(posts_2)

770

In [220]:
df2 = pd.DataFrame(posts_2)
df2.head()

Unnamed: 0,post_text,subreddit
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,Please remember:\n\n1. You must submit the ori...,asoiaf
2,I’ll start - “as useless as nipples on a breas...,asoiaf
3,It's been pointed out by Youtuber 'The Dragon ...,asoiaf
4,In [a recent thread](https://www.reddit.com/r...,asoiaf


In [225]:
df2.to_csv('day_2_reddit_scrapping.csv')

### Day 3 Scraping

In [38]:
posts_3 = sub_collection(subreddits)

In [39]:
len(posts_3)

1225

In [41]:
posts_3[-1]

{'subreddit': 'gameofthrones',
 'post_text': ' We know what D&amp;D did, we know roughly what GRRM would have done, but precisely how would you have made it so, and how would the end result and impact for the viewer have been different? Would the impact really be that different, if Daenerys is still the same in the end?'}

In [43]:
df3 = pd.DataFrame(posts_3)
df3.head()

Unnamed: 0,post_text,subreddit
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,Please remember:\n\n1. You must submit the ori...,asoiaf
2,For me:\n\nA Game of Thrones: Ned Stark and *E...,asoiaf
3,Let's say fAegon and his army successfully tak...,asoiaf
4,"One time , the king was feasting the queen's f...",asoiaf


In [45]:
df3.subreddit.value_counts()

asoiaf           881
gameofthrones    344
Name: subreddit, dtype: int64

In [44]:
df3.to_csv('day_3_reddit_scraping.csv')

### Day 4 scraping

In [6]:
posts_4 = sub_collection(subreddits)
len(posts_4)

1243

In [7]:
df4 = pd.DataFrame(posts_4)
df4.head()

Unnamed: 0,post_text,subreddit
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,"As you may know, we have a policy against sill...",asoiaf
2,Here I'm not referring to minor gripes about A...,asoiaf
3,"I've been rereading the series again, and came...",asoiaf
4,In the books kills multiple highly skilled kni...,asoiaf


In [8]:
df4.subreddit.value_counts()

asoiaf           894
gameofthrones    349
Name: subreddit, dtype: int64

In [10]:
df4.to_csv('day_4_reddit_scraping.csv')

### Day 5 scraping

In [8]:
## changed the subreddit list to include all new posts (no long 'hot')

posts_5 = sub_collection(subreddits)
len(posts_5)

1267

In [10]:
df5 = pd.DataFrame(posts_5)
df5.head()

Unnamed: 0,post_text,subreddit
0,Just read this in AGOT. I figured the oldest o...,asoiaf
1,) for an instant it looked as though he might ...,asoiaf
2,IIRC either varys or littlefinger suggested to...,asoiaf
3,"As we all know, in asoiaf it is commonly belie...",asoiaf
4,So we're all aware at this point of how Bronn ...,asoiaf


In [11]:
df5.subreddit.value_counts()

asoiaf           886
gameofthrones    381
Name: subreddit, dtype: int64

In [12]:
df5.to_csv('day_5_reddit_scraping.csv')

### Day 6 scraping

In [5]:
posts_6 = sub_collection(subreddits)
len(posts_6)

1271

In [6]:
df6 = pd.DataFrame(posts_6)
df6.head()

Unnamed: 0,post_text,subreddit
0,"Actually, I have this hunch that somehow Georg...",asoiaf
1,"Could be an entire era like the Age of Heroes,...",asoiaf
2,Westeros has had it's fair share of good and b...,asoiaf
3,While re-listening to ACOK much is made of how...,asoiaf
4,"I think it is Sansa, she said to Olenna about ...",asoiaf


In [7]:
df6.subreddit.value_counts()

asoiaf           886
gameofthrones    385
Name: subreddit, dtype: int64

In [8]:
df6.to_csv('day_6_reddit_scraping.csv')

In [10]:
## scraping the hots section of 
posts_6 = sub_collection(subreddits)
len(posts_6)

1217

In [11]:
df6_hot = pd.DataFrame(posts_6)
df6_hot.head()

Unnamed: 0,post_text,subreddit
0,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
1,Welcome to the Weekly Q &amp; A! Feel free to ...,asoiaf
2,While re-listening to ACOK much is made of how...,asoiaf
3,[This comment](https://www.reddit.com/r/asoiaf...,asoiaf
4,So I was reading Fire and Blood today and duri...,asoiaf


In [19]:
df6 = pd.concat([df6, df6_hot], ignore_index=True)
df6.tail()

Unnamed: 0,post_text,subreddit
11002,Bran just got yeeted out of a fucking wondowsi...,gameofthrones
11003,[https://youtu.be/WpeMPIL-mfg](https://youtu.b...,gameofthrones
11004,What did people think of Arya’s ending?\n\nShe...,gameofthrones
11005,It seems like every great house goes extinct s...,gameofthrones
11006,So one thing has always bothered me when i wat...,gameofthrones


In [28]:
## scraping the top for the month
posts_6t = sub_collection(subreddits)
len(posts_6t)

1265

In [31]:
## scraping the controversial for the past month
posts_6c = sub_collection(subreddits)
len(posts_6c)

1284

In [52]:
df6t = pd.DataFrame(posts_6t)
df6c = pd.DataFrame(posts_6c)

df6 = pd.concat([df6, df6_hot, df6t, df6c], ignore_index=True)
df6.shape

(18535, 2)

In [53]:
df6.to_csv('day_6_reddit_scraping.csv')

### Day 7 scraping

In [17]:
## pulling our final days worth of subreddits
posts_7 = sub_collection(subreddits)
df7 = pd.DataFrame(posts_7)
print(len(posts_7))

999


In [18]:
df7.subreddit.value_counts()

asoiaf           888
gameofthrones    111
Name: subreddit, dtype: int64

In [19]:
no_asoiaf = df7[df7['subreddit'] == 'gameofthrones']
no_asoiaf.subreddit.value_counts()

gameofthrones    111
Name: subreddit, dtype: int64

In [20]:
no_asoiaf.to_csv('day_7_reddit_scraping.csv')