# Scraping
This notebook uses PMAW to fetch archived comments, which Pushshift retrieves immediately after they are created, and PRAW to retrieve live data. This way, we capture comments before they can be deleted, and then complement the set with up-to-date information on the comments, their submissions, and their users.


#### Requirements
* [ ] praw_functions.py
* [ ] hate_terms.csv
* [ ] reddit_auth.py (your Reddit credentials in a Python script)

#### Generates
* [ ] comments_df.csv
* [ ] submissions_df.csv
* [ ] users_df.csv
* [ ] log.csv (temporary)
* [ ] comments_raw.csv (temporary)
* [ ] new_comments_stats.csv (temporary)

<br/>

##### Links and documentation
Pushshift API [here](https://reddit-api.readthedocs.io/en/latest/) <br/>
PRAW API [here](https://praw.readthedocs.io/en/stable/getting_started/quick_start.html)<br/>
PMAW documentation [here](https://github.com/mattpodolak/pmaw)


___

In [None]:
!pip install pmaw
!pip install praw

from pmaw import PushshiftAPI
import pandas as pd
import joblib
from praw_functions import *

from collections import defaultdict
from tqdm import tqdm

RANDOM_SEED=697

## 1. Comments

### Comments from Pushshift (via pmaw)

In [None]:
# Comment fields requested from Pushshift (passed as `filter=` to the search call)
our_filter = [
    'author',
    'author_flair_type',
    'author_fullname',
    'author_premium',
    'body',
    'body_sha1',
    'controversiality',
    'created_utc',
    'distinguished',
    'gilded',
    'id',
    'is_submitter',
    'link_id',
    'locked',
    'parent_id',
    'permalink',
    'retrieved_utc',
    'subreddit',
    'subreddit_id',
    'subreddit_name_prefixed',
    'subreddit_type',
]

In [None]:
# Search terms: read the term list from disk and join every entry of its
# `term` column with '|' into the single string passed as `q=` to the search.
hate_path = 'hate_terms.csv'
hate_terms = pd.read_csv(hate_path)

our_terms = '|'.join(hate_terms['term'])

In [None]:
# PMAW client; its search_comments() method is called by the scraping loop below
api = PushshiftAPI()

In [None]:
# Start timestamps of the one-hour scraping windows, as a list of pandas
# Timestamps. The frequency is given as a Timedelta rather than the 'H' string
# alias, which newer pandas versions deprecate; the generated values are the same.
# NOTE(review): date_range includes `end`, so the last entry is
# 2022-02-01 00:00:00 and the final window starts in February — confirm this is intended.
date_range = pd.date_range(start='2022-01-01 00:00:00',
                           end='2022-02-01 00:00:00',
                           freq=pd.Timedelta(hours=1)).to_list()

In [None]:
#---------------------------
# Creating variables once
# Don't run this cell again
#---------------------------

# comments_raw = pd.DataFrame()
# comments_raw.to_csv('comments_raw.csv')


#---------------------------------------------------------------------------
# Use the log below to verify the num_items retrieved per epoch.
# If it is 1000 or more, the scraper hit its limit:
# go back to that epoch, split it into smaller epochs, and scrape all the comments.
#---------------------------------------------------------------------------

# log = pd.DataFrame({'time':[], 'epoch':[], 'num_items':[]})
# log.to_csv('log.csv', index=False)

In [None]:
# Scrape comments one time window at a time, checkpointing to disk after each.
# Uncomment the two reads below to resume from a previous run; otherwise
# `comments_raw` must already exist in the session.
# log = pd.read_csv('log.csv')
# comments_raw = pd.read_csv('comments_raw.csv')

limit_ = 1000  # maximum number of comments requested per window
epoch_ = 3600  # window length in seconds (one hour, matching date_range)

for after_ in date_range:
    # date_range holds pandas Timestamps, which cannot be added to an int.
    # Convert to integer epoch seconds (naive Timestamps are read as UTC).
    window_start = int(after_.timestamp())
    data = api.search_comments(q=our_terms,
                               limit=limit_,
                               after=window_start,
                               before=window_start + epoch_,
                               filter=our_filter)
    df = pd.DataFrame(data)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    comments_raw = pd.concat([comments_raw, df], ignore_index=True)
    # Rewrite the checkpoint every window so an interrupted run loses little.
    comments_raw.to_csv('comments_raw.csv', index=False)
    # A full window means the limit was hit and comments may be missing:
    # record it so the window can be re-scraped in smaller pieces.
    if len(df) >= limit_:
        update_log(after_)

In [None]:
# (rows, columns) of the raw comments collected so far
comments_raw.shape

(161, 21)

In [1]:
# logged windows whose item count reached the limit and should be re-scraped in smaller epochs
log[log.num_items>999]

In [None]:
# largest gap between retrieval time and creation time of a comment
# (presumably in seconds, as both are *_utc epoch fields — confirm)
max(comments_raw.retrieved_utc - comments_raw.created_utc)

7889.0

### Comments directly from Reddit (using praw)

In [None]:
#---------------------------
# Creating variables once
# Don't run this cell again
#---------------------------

# new_comments_stats = pd.DataFrame()
# new_comments_stats.to_csv('new_comments_stats.csv')

In [None]:
# reload the raw comments checkpoint written by the scraping loop
c = pd.read_csv('comments_raw.csv')

In [None]:
# Refresh comment data in batches, checkpointing to disk after each batch.
step = 100  # number of comment ids passed to update_comments() per call
new_comments_stats = pd.read_csv('new_comments_stats.csv')
for i in range(0, c.shape[0], step):
    # assumes update_comments returns a DataFrame (or a list of row dicts) — TODO confirm
    batch_stats = pd.DataFrame(update_comments(c['id'][i:i+step]))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    new_comments_stats = pd.concat([new_comments_stats, batch_stats], ignore_index=True)
    new_comments_stats.to_csv('new_comments_stats.csv', index=False)

In [None]:
# sanity check: compare the row count with the number of distinct comment ids
print(new_comments_stats.shape)
print(new_comments_stats['id'].nunique())

#### Combine comments

In [None]:
# Join the raw comments with the refreshed data on the comment id
# (inner join: only ids present in both frames are kept).
comments_df = c.merge(new_comments_stats, on='id')
# to_csv returns None when given a path, so its result must not be assigned
# back to comments_df; the merged frame stays available after saving.
comments_df.to_csv('comments_df.csv')

## 2. Submissions

In [None]:
# Distinct submissions referenced by the scraped comments.
c = pd.read_csv('comments_raw.csv')
# Drop the first three characters of every unique link_id value.
id_list = [link_id[3:] for link_id in pd.unique(c['link_id'])]

In [None]:
#---------------------------
# Creating variables once
# Don't run this cell again
#---------------------------

# sub_df = pd.DataFrame()
# sub_df.to_csv('submissions_df.csv')

In [None]:
# Fetch submission data in batches, checkpointing to disk after each batch.
step = 25  # number of submission ids passed to get_submissions_data() per call
# Uncomment to resume from a previous run; otherwise `sub_df` must already
# exist in the session.
# sub_df = pd.read_csv('submissions_df.csv')

for i in range(0, len(id_list), step):
    # assumes get_submissions_data returns a DataFrame (or a list of row dicts) — TODO confirm
    batch = pd.DataFrame(get_submissions_data(id_list[i:i+step]))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    sub_df = pd.concat([sub_df, batch], ignore_index=True)
    sub_df.to_csv('submissions_df.csv', index=False)

In [None]:
# sanity check: compare the row count with the number of distinct submission ids
print(sub_df.shape)
print(sub_df['id'].nunique())

## 3. Users

In [None]:
# Distinct comment authors found in the raw comments file.
c = pd.read_csv('comments_raw.csv')
user_list = list(pd.unique(c['author']))

In [None]:
#---------------------------
# Creating variables once
# Don't run this cell again
#---------------------------

# users_df = pd.DataFrame()
# users_df.to_csv('users_df.csv')

In [None]:
# Fetch user data in batches, checkpointing to disk after each batch.
step = 50  # number of user names passed to get_users_data() per call
# Uncomment to resume from a previous run; otherwise `users_df` must already
# exist in the session.
# users_df = pd.read_csv('users_df.csv')

for i in range(0, len(user_list), step):
    # assumes get_users_data returns a DataFrame (or a list of row dicts) — TODO confirm
    batch = pd.DataFrame(get_users_data(user_list[i:i+step]))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    users_df = pd.concat([users_df, batch], ignore_index=True)
    users_df.to_csv('users_df.csv', index=False)

In [None]:
# sanity check: compare the row count with the number of distinct authors
print(users_df.shape)
print(users_df.author.nunique())

___
#### Generate samples to keep on Github

In [10]:
# Draw reproducible samples of each table and save them as small CSV files.
# NOTE(review): `c` is the comments frame already in memory (last read from
# 'comments_raw.csv'). `s` and `u` were not defined anywhere in this notebook,
# so they are loaded here from the files written by the submissions and users
# sections — confirm these are the intended sources.
s = pd.read_csv('submissions_df.csv')
u = pd.read_csv('users_df.csv')

sample_size = 1000

# DataFrame.sample raises when n exceeds the number of rows, so cap n per table.
comments_sample = c.sample(min(sample_size, len(c)), random_state=RANDOM_SEED)
submissions_sample = s.sample(min(sample_size, len(s)), random_state=RANDOM_SEED)
users_sample = u.sample(min(sample_size, len(u)), random_state=RANDOM_SEED)

comments_sample.to_csv('comments_sample.csv', index=False)
submissions_sample.to_csv('submissions_sample.csv', index=False)
users_sample.to_csv('users_sample.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b22dad3f-c925-4cd0-bb81-e22d83bd774f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>