# Subreddit Scraping Function

In [6]:
import pandas as pd
import requests
import time
import praw
import json

### Creating a scraping function

In [7]:
def scraper(subreddit, posts, sorter, age, n, pause=2, timeout=30):
    """Fetch up to ``n`` listing pages of a subreddit and append the posts to ``posts``.

    Pages are read from the subreddit's ``.json`` listing URL and chained
    through the ``after`` token returned with each page. Paging stops as soon
    as a response has a non-200 status, the listing reports no further
    ``after`` token, or ``n`` pages have been requested.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears after ``/r/`` in the URL.
    posts : list
        Extended in place with the ``data`` dict of every child of each page.
    sorter : str
        Listing segment of the URL (the callers in this notebook pass
        ``'hot'``, ``'new'``, ``'top'`` and ``'controversial'``).
    age : str or int
        Sent as the ``t`` query parameter; pass ``0`` to leave it out.
    n : int
        Maximum number of pages to request.
    pause : float, optional
        Seconds to wait between two consecutive requests. Defaults to 2.
    timeout : float, optional
        Seconds after which a stalled request is abandoned. Defaults to 30.

    Returns
    -------
    None
        Results are delivered through ``posts``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on network failure or timeout.
    """
    if age == 0:
        url = 'https://www.reddit.com/r/{}/{}.json'.format(subreddit, sorter)
        base_params = {}
    else:
        url = 'https://www.reddit.com/r/{}/{}/.json'.format(subreddit, sorter)
        base_params = {'t': age}

    after = None
    for page in range(n):
        # Fresh dict per request so requests handles the query-string encoding.
        params = dict(base_params)
        if after is not None:
            params['after'] = after

        res = requests.get(
            url,
            params=params,
            headers={'User-agent': 'Reddit Scrape'},
            timeout=timeout,
        )
        if res.status_code != 200:
            print('Status Error', res.status_code)
            break

        listing = res.json()['data']
        posts.extend(child['data'] for child in listing['children'])

        after = listing['after']
        if after is None:
            # Last page of the listing: nothing more to request, so no pause.
            break

        if (page + 1) % 25 == 0:
            print(page + 1)

        # Only pause when another request is actually going to follow.
        if page + 1 < n:
            time.sleep(pause)

In [8]:
def main_scraper(subreddit, posts, sorter_list, age_list, n):
    """Run ``scraper`` for every requested listing of one subreddit.

    For the ``'top'`` and ``'controversial'`` listings, ``scraper`` is called
    once per entry of ``age_list``; every other listing is scraped a single
    time with ``age`` set to ``0``. A "Finished ..." line is printed after
    each call. All results are accumulated in ``posts``; nothing is returned.
    """
    time_filtered = ('top', 'controversial')

    for listing in sorter_list:
        if listing not in time_filtered:
            # Listings without a time window: one pass, age sentinel 0.
            scraper(subreddit, posts, listing, 0, n)
            print("Finished", listing)
            continue

        for window in age_list:
            scraper(subreddit, posts, listing, window, n)
            print('Finished', listing, window)

### Scraping for the subreddit /r/netflix

In [56]:
# Accumulator that main_scraper/scraper extend in place with raw post dicts.
netflix_posts = []

In [57]:
# n=1000 is only an upper bound on pages per listing: scraper stops early
# once a response carries no `after` token or a non-200 status.
main_scraper('netflix', netflix_posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)

25
Finished hot
Finished controversial week
25
Finished controversial month
25
Finished controversial year
25
Finished controversial all
25
Finished new
Finished top week
25
Finished top month
25
Finished top year
25
Finished top all


In [58]:
# Raw count of collected post dicts (posts seen in several listings are counted each time).
len(netflix_posts)

7713

In [59]:
# One row per scraped post dict; the dict keys become the columns.
df_netflix = pd.DataFrame(netflix_posts)
df_netflix.head()


Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],True,,,False,Elainasha,,,,[],...,,List of some upcoming new Netflix Original ser...,0,596,https://www.reddit.com/r/netflix/comments/btys...,[],,False,all_ads,6
1,[],True,,,False,FPS_Yusuf1999,,,,[],...,,Stranger Things 4 | Official Announcement,0,1377,https://youtu.be/cIiDY4WA0oo,[],,False,all_ads,6
2,"[{'count': 1, 'is_enabled': True, 'subreddit_i...",False,,,False,infodawg,,,,[],...,,Anyone else addicted to subtitles?,1,704,https://www.reddit.com/r/netflix/comments/djd1...,[],,False,all_ads,6
3,[],False,,,False,ThisisWiretap,,,,[],...,,"Netflix Q3 earnings exceed estimates, despite ...",0,508,https://techcrunch.com/2019/10/16/netflix-q3-e...,[],,False,all_ads,6
4,[],False,,,False,maskedhero,,,,[],...,,Atypical Season 3 Trailer,0,55,https://youtu.be/bSbF25mXDyM,[],,False,all_ads,6


In [60]:
# (rows, columns) before removing duplicate post ids.
df_netflix.shape

(7713, 103)

In [61]:
# Keep the first row seen for every post id and renumber the index from 0.
# `subset='id'` restricts the comparison to the id column, which also avoids
# hashing the list-valued columns (e.g. all_awardings). Unlike a row-by-row
# `.loc` build, this is a single vectorised pass and keeps the column dtypes.
df_netflix_unique = (
    df_netflix
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)

In [62]:
# Preview of the de-duplicated netflix posts.
df_netflix_unique.head()

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],True,,,False,Elainasha,,,,[],...,,List of some upcoming new Netflix Original ser...,0,596,https://www.reddit.com/r/netflix/comments/btys...,[],,False,all_ads,6
1,[],True,,,False,FPS_Yusuf1999,,,,[],...,,Stranger Things 4 | Official Announcement,0,1377,https://youtu.be/cIiDY4WA0oo,[],,False,all_ads,6
2,"[{'count': 1, 'is_enabled': True, 'subreddit_i...",False,,,False,infodawg,,,,[],...,,Anyone else addicted to subtitles?,1,704,https://www.reddit.com/r/netflix/comments/djd1...,[],,False,all_ads,6
3,[],False,,,False,ThisisWiretap,,,,[],...,,"Netflix Q3 earnings exceed estimates, despite ...",0,508,https://techcrunch.com/2019/10/16/netflix-q3-e...,[],,False,all_ads,6
4,[],False,,,False,maskedhero,,,,[],...,,Atypical Season 3 Trailer,0,55,https://youtu.be/bSbF25mXDyM,[],,False,all_ads,6


In [63]:
# (rows, columns) after keeping one row per post id.
df_netflix_unique.shape

(4128, 103)

In [64]:
# Persist the de-duplicated posts. The index is written as the first (unnamed)
# CSV column because index=False is not passed.
df_netflix_unique.to_csv('netflix.csv')

### Scraping for the subreddit /r/hulu

In [65]:
# Accumulator that main_scraper/scraper extend in place with raw post dicts.
hulu_posts = []

In [66]:
# Same listings and time windows as the netflix scrape; n=1000 is an upper
# bound on pages per listing, not the number actually fetched.
main_scraper('hulu', hulu_posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)

25
Finished hot
Finished controversial week
Finished controversial month
25
Finished controversial year
25
Finished controversial all
25
Finished new
Finished top week
Finished top month
25
Finished top year
25
Finished top all


In [67]:
# Raw count of collected post dicts (posts seen in several listings are counted each time).
len(hulu_posts)

6546

In [73]:
# One row per scraped post dict; the dict keys become the columns.
df_hulu = pd.DataFrame(hulu_posts)
df_hulu.head()

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],False,,,False,AutoModerator,,,mod,[],...,,What are you watching and what do you recommen...,0,3,https://www.reddit.com/r/Hulu/comments/dgdn28/...,[],,False,all_ads,6
1,[],True,,,False,TeamHulu,,,officialhulu,[],...,,Who's got two thumbs and loves feedback?,0,40,https://www.reddit.com/r/Hulu/comments/ditn22/...,[],,False,all_ads,6
2,[],True,,,False,zwis99,,,,[],...,140.0,AD’s over my content with no way to remove? Is...,0,49,https://i.redd.it/zogtzqptu4t31.jpg,[],,False,all_ads,6
3,[],False,,,False,donovan0313,,,,[],...,,Can't download premium content?,0,4,https://www.reddit.com/r/Hulu/comments/djgk25/...,[],,False,all_ads,6
4,[],True,,,False,ChocolateCherryCola,,,,[],...,,Hulu's App Upgrade New Logo Color &amp; Colore...,0,3,https://www.reddit.com/r/Hulu/comments/djcy6s/...,[],,False,all_ads,6


In [74]:
# Keep the first row seen for every post id and renumber the index from 0.
# `subset='id'` restricts the comparison to the id column, which also avoids
# hashing the list-valued columns (e.g. all_awardings). Unlike a row-by-row
# `.loc` build, this is a single vectorised pass and keeps the column dtypes.
df_hulu_unique = (
    df_hulu
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)

In [75]:
# Preview of the de-duplicated hulu posts.
df_hulu_unique.head()

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],False,,,False,AutoModerator,,,mod,[],...,,What are you watching and what do you recommen...,0,3,https://www.reddit.com/r/Hulu/comments/dgdn28/...,[],,False,all_ads,6
1,[],True,,,False,TeamHulu,,,officialhulu,[],...,,Who's got two thumbs and loves feedback?,0,40,https://www.reddit.com/r/Hulu/comments/ditn22/...,[],,False,all_ads,6
2,[],True,,,False,zwis99,,,,[],...,140.0,AD’s over my content with no way to remove? Is...,0,49,https://i.redd.it/zogtzqptu4t31.jpg,[],,False,all_ads,6
3,[],False,,,False,donovan0313,,,,[],...,,Can't download premium content?,0,4,https://www.reddit.com/r/Hulu/comments/djgk25/...,[],,False,all_ads,6
4,[],True,,,False,ChocolateCherryCola,,,,[],...,,Hulu's App Upgrade New Logo Color &amp; Colore...,0,3,https://www.reddit.com/r/Hulu/comments/djcy6s/...,[],,False,all_ads,6


In [76]:
# (rows, columns) after keeping one row per post id.
df_hulu_unique.shape

(3309, 106)

In [77]:
# Persist the de-duplicated posts. The index is written as the first (unnamed)
# CSV column because index=False is not passed.
df_hulu_unique.to_csv('hulu.csv')

In [None]:
#In future, could create one shared list, e.g. posts = [], and have both scrapes append to it.
#This would look like:
#posts = []
#main_scraper('hulu', posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)
#main_scraper('netflix', posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)
#This would save time at the next step: with the scraping approach above I had to concatenate both datasets
#instead of already having them in one file

In [9]:
# Accumulator that main_scraper/scraper extend in place with raw post dicts.
makeup_posts = []

In [10]:
# Same listings and time windows as the earlier scrapes; n=1000 is an upper
# bound on pages per listing, not the number actually fetched.
main_scraper('makeupaddiction', makeup_posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)

25
Finished hot
Finished controversial week
25
Finished controversial month
25
Finished controversial year
25
Finished controversial all
25
Finished new
Finished top week
25
Finished top month
25
Finished top year
25
Finished top all


In [11]:
# Raw count of collected post dicts; no de-duplication has been applied.
len(makeup_posts)

8778

In [12]:
# Accumulator that main_scraper/scraper extend in place with raw post dicts.
skincare_posts = []

In [13]:
# Same listings and time windows as the earlier scrapes; n=1000 is an upper
# bound on pages per listing, not the number actually fetched.
main_scraper('skincareaddiction', skincare_posts, ['hot', 'controversial', 'new', 'top'], ['week','month', 'year', 'all'], n=1000)

25
Finished hot
25
Finished controversial week
25
Finished controversial month
25
Finished controversial year
25
Finished controversial all
25
Finished new
25
Finished top week
25
Finished top month
25
Finished top year
25
Finished top all


In [14]:
# One row per scraped post dict; the dict keys become the columns.
df_makeup = pd.DataFrame(makeup_posts)
df_makeup.head()

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],True,,,False,TheNewPoetLawyerette,True,,,"[{'e': 'text', 't': 'No Compliments Allowed!'}]",...,,"Recent Changes – Rule 4, Post Review Flair, Da...",0,23,https://www.reddit.com/r/MakeupAddiction/comme...,[],,False,all_ads,6
1,[],False,,,False,AutoModerator,,,,[],...,,Daily Discussion and Simple Questions: Selfie ...,0,1,https://www.reddit.com/r/MakeupAddiction/comme...,[],,False,all_ads,6
2,[],True,,,False,Glambykimmyfab,,,,[],...,140.0,Client pic CCW! Help me improve or Ask me Anyt...,0,4830,https://i.redd.it/zebl17b4qot31.jpg,[],,False,all_ads,6
3,[],True,,,False,dailyanna_unfiltered,,,,[],...,140.0,Loving dewy natural skin with peachy tones at ...,0,558,https://i.redd.it/hpdy5eslort31.jpg,[],,False,all_ads,6
4,[],False,,,False,luciphyrr,,,,[],...,140.0,blue and bronze cut crease,0,733,https://i.redd.it/yh1cdcxi6rt31.jpg,[],,False,all_ads,6


In [15]:
# One row per scraped post dict; the dict keys become the columns.
df_skincare = pd.DataFrame(skincare_posts)
df_skincare.head()

Unnamed: 0,all_awardings,allow_live_comments,approved_at_utc,approved_by,archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,...,thumbnail_width,title,total_awards_received,ups,url,user_reports,view_count,visited,whitelist_status,wls
0,[],False,,,False,AutoModerator,,,msgd,[],...,,NEW OR NEED HELP? Ask here! - ScA Daily Help T...,0,1,https://www.reddit.com/r/SkincareAddiction/com...,[],,False,all_ads,6
1,[],False,,,False,AutoModerator,,,msgd,[],...,,"[Review] Rants, Raves, &amp; New Purchases Oct...",0,4,https://www.reddit.com/r/SkincareAddiction/com...,[],,False,all_ads,6
2,[],False,,,False,ShabbyCashmere,,,notag,[],...,140.0,[misc] finally picked up my first cerave in th...,0,842,https://i.redd.it/xftzx7267st31.jpg,[],,False,all_ads,6
3,[],True,,,False,daza_b,,,notag,[],...,140.0,[Shelfie] After finally building up a collecti...,0,229,https://i.redd.it/l2efa96j2rt31.jpg,[],,False,all_ads,6
4,[],False,,,False,sisterlulu,True,,,[],...,140.0,[shelfie] Simple routine for a baby Differin user,0,95,https://i.redd.it/nfea5u10pqt31.jpg,[],,False,all_ads,6


In [16]:
# Write both frames to CSV (index included as the first, unnamed column).
# NOTE(review): unlike the netflix/hulu frames, these are saved without
# de-duplicating on 'id' — confirm duplicates are handled downstream.
df_makeup.to_csv("makeup.csv")
df_skincare.to_csv("skincare.csv")