
# Part 1: Webscraping & Data Collection
***

## (1) Import packages

In [1]:
# Import libraries
import collections
import datetime as dt
import os
import random
import time
from collections import Counter

import pandas as pd
import requests

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## (2) Webscraping

Before any analysis can be done, we need access to Reddit data from the subreddit forums. This was achieved with the Pushshift API, which allows us to search Reddit data easily. For this project, the main endpoint used to search all publicly available submissions on Reddit is "/reddit/search/submission". The `scrape` function below collects submissions from a named subreddit: it requests pages of up to 100 posts, moving the `after` cutoff further back in multiples of `days` on each request, until roughly `n` posts have been gathered.

### Define function for webscraping

In [2]:
def scrape(subreddit, n, days = 30, timeout = 30):
    """Collect submissions from one subreddit through the Pushshift API.

    Pages of up to 100 submissions are requested one at a time. Request ``i``
    asks for submissions made after ``days * i`` days ago, so each request
    reaches further back in time. Scraping stops once more than ``n`` rows
    have been collected, or after ``n`` requests, whichever comes first.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (e.g. ``'Anxiety'``).
    n : int
        Target number of submissions; also the maximum number of requests.
    days : int, default 30
        Step, in days, by which the ``after`` cutoff moves back per request.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up on that page.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with the columns ``subreddit``, ``title`` and
        ``selftext`` and a fresh 0..k-1 index. The frame may hold slightly
        more than ``n`` rows (the last page is kept whole) or fewer (after
        duplicates are dropped). It is empty, with the same three columns,
        when no page could be retrieved.
    """
    # Url
    base_url = 'https://api.pushshift.io/reddit/search/submission'
    full_url = f'{base_url}?subreddit={subreddit}&size=100'

    # Columns kept in the returned dataframe
    features_of_interest = ['subreddit', 'title', 'selftext']

    # One dataframe per successfully retrieved page
    posts = []
    # Running row count, so the pages are not re-summed on every iteration
    total_scraped = 0

    # Modify the url after each iteration
    for i in range(1, n+1):
        urlmod = '{}&after={}d'.format(full_url, days*i)

        # A failed or non-200 request skips this page instead of aborting the
        # whole scrape; only request errors are caught, so Ctrl-C still works
        try:
            res = requests.get(urlmod, timeout=timeout)
        except requests.RequestException:
            continue
        if res.status_code != 200:
            continue

        # A body that is not JSON, or lacks the 'data' field, is skipped too
        try:
            extracted = res.json()['data']
        except (ValueError, KeyError, TypeError):
            continue

        # Keep the page only if it actually contains submissions
        if extracted:
            page_df = pd.DataFrame(extracted)
            posts.append(page_df)
            total_scraped += len(page_df)

        # Scrape only specific n amount of data
        if total_scraped > n:
            break

        # Generate a random sleep duration to simulate a human user
        sleep_duration = random.randint(1,9)
        time.sleep(sleep_duration)

    # Nothing retrieved: return an empty frame with the expected columns
    # rather than letting pd.concat fail on an empty list
    if not posts:
        return pd.DataFrame(columns=features_of_interest)

    # Combine all pages, keep only the columns of interest (reindex fills a
    # column with NaN if no page carried it), then drop duplicate rows
    final_df = (
        pd.concat(posts, sort=False)
        .reindex(columns=features_of_interest)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return final_df

### Scrape data & Store as df
Call the function to scrape data from each of the subreddits r/Anxiety and r/depression, and assign the results to their respective variables <i>submissions_anxiety_df</i> and <i>submissions_depression_df</i>.

In [3]:
# Target 3000 submissions per subreddit, moving the cutoff back 30 days per request
submissions_anxiety_df = scrape('Anxiety', n=3000, days=30)
submissions_depression_df = scrape('depression', n=3000, days=30)

# Report how many rows each scrape returned, one line per subreddit
print(
    f'Retrieved {len(submissions_anxiety_df)} submissions on \'Anxiety\' from Pushshift',
    f'Retrieved {len(submissions_depression_df)} submissions on \'depression\' from Pushshift',
    sep='\n',
)

Retrieved 3069 submissions on 'Anxiety' from Pushshift
Retrieved 3075 submissions on 'depression' from Pushshift


## (3) Data Collection

### Export as csv
Raw data collected from the r/Anxiety and r/depression subreddits are saved and exported via separate csv files.
Specify <i>index=False</i> to avoid the addition of an unnamed column or additional index when these csv files are read in again in later notebooks.

In [5]:
# Create the output folder if it is missing, so the export cannot fail on a
# fresh checkout after the (slow) scrape has already finished
os.makedirs('./data', exist_ok=True)

# index=False keeps the dataframe index out of the files, so no unnamed
# column appears when they are read back in later notebooks
submissions_anxiety_df.to_csv('./data/anxiety_data.csv', index=False)
submissions_depression_df.to_csv('./data/depression_data.csv', index=False)