# Table of Contents
- [Import Libraries](#import)      
- [Scraping Posts from Reddit](#scrape)
- [Cleaning Data to Title + Text and Subreddit Only](#clean)
- [Saving and Exporting CSV for EDA](#save)

Scraping is kept in its own notebook so that restarting kernels elsewhere does NOT trigger a re-pull of the data.

###  Importing Libraries <a id="import"></a>

In [None]:
# Import Libraries

import requests # Library enables requests from reddit
import time # Library enables lag to be added to request loop

import pandas as pd

###  Scraping Posts from Reddit <a id="scrape"></a>

In [None]:
# Set up source to scrape from

# JSON listing endpoints of the two subreddits that are scraped below.
# Each one is passed to pull_posts() as its url_base argument.
url_base_ww = 'https://www.reddit.com/r/thewestwing.json' # API Endpoint 1: r/thewestwing
url_base_nr = 'https://www.reddit.com/r/Thenewsroom.json' # API Endpoint 2: r/Thenewsroom

In [None]:
# Define a function that repeatedly requests listing pages from a subreddit
# until either about 1000 posts have been pulled or the entire subreddit has been pulled

def pull_posts(url_base, max_pulls=40, delay=5, timeout=30):
    """Pull posts from a subreddit's JSON listing, one page per request.

    Requests are repeated, following the listing's ``after`` cursor, until
    ``max_pulls`` requests have been made, the listing reports no further
    page, or a request fails. On failure the posts gathered so far are
    still returned.

    Parameters
    ----------
    url_base : str
        URL of the subreddit's ``.json`` listing endpoint.
    max_pulls : int, default 40
        Maximum number of requests to make. The default equals the
        previously hard-coded ``int(1000/25)``.
    delay : float, default 5
        Seconds to sleep after each successful pull before the next one.
    timeout : float, default 30
        Seconds to wait for a response before the request is abandoned.

    Returns
    -------
    list
        The ``children`` entries of every page pulled, in request order.
    """
    # Setting user agent to enable pulling from Reddit
    user_agent = {'User-agent': 'mags'}

    # Cursor for the next page; None means "start from the first page"
    after = None

    posts = []

    for _ in range(max_pulls):
        try:
            # requests leaves None-valued params out of the query string,
            # so the first pull hits url_base unchanged and later pulls
            # get "?after=<cursor>" appended.
            res = requests.get(
                url_base,
                headers=user_agent,
                params={'after': after},
                timeout=timeout,
            )
        except requests.RequestException as err:
            # Treat network failures like a bad status code: report and
            # stop, keeping whatever has been pulled already.
            print(f'There was an error : {err}.')
            break

        if res.status_code != 200:
            print(f'There was an error : {res.status_code}.')
            break

        listing = res.json()['data']
        posts.extend(listing['children'])

        # Move the cursor so the next pull fetches the next set of posts
        after = listing['after']
        if after is None:
            # No further page: the whole listing has been pulled
            break

        print(f'Pulled {len(posts)} posts so far...')
        time.sleep(delay)

    return posts

In [None]:
# Pull the West Wing subreddit and save the raw listing entries to disk.
# NOTE(review): this export goes to ./datasets/ while the other exports in this
# notebook go to ./ -- confirm the intended location and that ./datasets/ exists.
posts_ww = pull_posts(url_base_ww)
ww_raw_df = pd.DataFrame(posts_ww)
ww_raw_df.to_csv('./datasets/raw_ww.csv')

In [None]:
# Pull the Newsroom subreddit and save the raw listing entries to disk.
# NOTE(review): this export goes to ./ while the West Wing raw export goes to
# ./datasets/ -- confirm which location is intended.
posts_nr = pull_posts(url_base_nr)
nr_raw_df = pd.DataFrame(posts_nr)
nr_raw_df.to_csv('./raw_nr.csv')

###  Cleaning Data to Title + Text and Subreddit Only <a id="clean"></a> 

In [None]:
# Creating a function to take the posts and format them such that 
# the title and text can be used for NLP.

# Concatenating title and text because some posts do not have any text,
# and some posts have short titles
def clean_posts(posts):
    """Join each post's title and body text into a DataFrame of strings.

    Every post becomes a single-entry record keyed by that post's subreddit
    name, whose value is ``title + " " + selftext``. The resulting frame
    therefore has one column per distinct subreddit found in ``posts``
    (a single column when all posts come from the same subreddit).

    Parameters
    ----------
    posts : iterable of dict
        Listing entries, each holding a ``'data'`` dict with the keys
        ``'subreddit'``, ``'title'`` and ``'selftext'``.

    Returns
    -------
    pandas.DataFrame
        One row per post, in input order.
    """
    records = [
        {
            entry['data']['subreddit']:
                entry['data']['title'] + " " + entry['data']['selftext']
        }
        for entry in posts
    ]
    return pd.DataFrame(records)

In [None]:
# Build the title+text frame for the West Wing posts and save it.
# The CSV is written before the rename below, so the saved file keeps the
# column name produced by clean_posts rather than 'text'.
text_ww = clean_posts(posts_ww)
text_ww.to_csv('./titletext_ww.csv')
# Assigning a one-item list only works if the frame has exactly one column.
text_ww.columns = ['text']

In [None]:
# Build the title+text frame for the Newsroom posts and save it.
# The CSV is written before the rename below, so the saved file keeps the
# column name produced by clean_posts rather than 'text'.
text_nr = clean_posts(posts_nr)
text_nr.to_csv('./titletext_nr.csv')
# Assigning a one-item list only works if the frame has exactly one column.
text_nr.columns = ['text']

### Saving and Exporting CSV for EDA  <a id="save"></a> 

In [None]:
# Y = 1 for West Wing, Y = 0 for Newsroom
text_nr['subreddit'] = 0
text_ww['subreddit'] = 1

# Combining Newsroom and West Wing data frames (Newsroom rows first).
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append, keeps each frame's original index labels.
text_df = pd.concat([text_nr, text_ww])

# Export the numeric label under the column name 'class' instead of 'subreddit'
text_df['class'] = text_df['subreddit']
text_df.drop('subreddit', axis=1, inplace=True)

text_df.to_csv('./text_df.csv')