# Step 1: Extracting Posts from r/anime
On popular anime subreddits, users often set their flair to a link to their anime list, usually on MyAnimeList (MAL) or AniList. Scraping posts from r/anime therefore provides a source of anime-list links in post and comment flairs. The ratings on those lists can then be gathered into a collection of user ratings for use in a collaborative filtering recommender system.

In [1]:
import praw
from praw.models import MoreComments
from psaw import PushshiftAPI

In [19]:
import os
from dotenv import load_dotenv

# Load settings with python-dotenv before reading them from the environment,
# so the Reddit credentials stay out of the notebook itself.
load_dotenv()

# Each value is None when the corresponding variable is not set.
reddit_client_id = os.environ.get('REDDIT_CLIENT_ID')
reddit_client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
reddit_user = os.environ.get('REDDIT_USER')

In [20]:
# User-agent string handed to praw.Reddit; it embeds the Reddit username
# read from the environment.
user_agent=f'jupyter:reddit.nlp.testproj: (by u/{reddit_user})'

In [21]:
# Pushshift client (PSAW), created with default settings; this is the client
# the submission search in this notebook goes through.
api = PushshiftAPI()

# PRAW client built from the app credentials and user agent defined above.
reddit = praw.Reddit(
    client_id=reddit_client_id, client_secret=reddit_client_secret, user_agent=user_agent
)

In [143]:
import re

In [31]:
import pandas as pd
import datetime as dt
import requests
import json

In [43]:
def get_subreddit_submissions(subreddit, limit, period_length, period_shift=0, min_score=-1, min_comments=-1, return_all=False):
    """
    Description:
        Retrieves submissions from a subreddit through the module-level PSAW
        client `api`, restricted to a window of days counted back from today.

    Input:
        subreddit - subreddit name, example: hololive
        limit - maximum number of posts to retrieve; zero or a negative value retrieves nothing.
                this value is ignored if return_all is set to True
        period_length - size of the date window in days, ends today if no period shift
        period_shift - number of days to shift the date window back in time (optional, default=0)
        min_score - exclusive lower bound on post score, sent to the API as '>min_score' (optional, default=-1)
        min_comments - exclusive lower bound on the number of comments, sent as '>min_comments' (optional, default=-1)
        return_all - if true will return all posts ignoring the limit (optional, default=False)

    Output:
        dataframe with one row per retrieved post (empty when nothing was retrieved)
    """
    today_date = dt.date.today()
    before_date = today_date - dt.timedelta(days=period_shift)
    after_date = today_date - dt.timedelta(days=period_shift+period_length)

    if not return_all:
        print(f'\n Retrieving {limit} posts from {after_date} to {before_date} with {min_score}+ score, and {min_comments}+ comments')
    else:
        print(f'\n Retrieving ALL posts from {after_date} to {before_date} with a {min_score} minimum score, and {min_comments}+ comments')

    gen = api.search_submissions(subreddit=subreddit,
                                 before=before_date,
                                 after=after_date,
                                 score=f'>{min_score}',
                                 num_comments=f'>{min_comments}'
                                 )

    cache = []
    if return_all:
        # Drain the generator; `limit` is never consulted on this path, so it
        # may be any placeholder value (including None).
        cache.extend(gen)
    elif limit > 0:
        # Check the limit right after each append so the generator is not
        # asked for a post beyond the ones that are kept.
        for post in gen:
            cache.append(post)
            if len(cache) >= limit:
                break

    # Each PSAW result exposes its raw fields as a dict in `d_`.
    return pd.DataFrame([post.d_ for post in cache])

In [136]:
subreddit = "anime"

num_days = 360        # length of the search window, in days
num_posts = 500_000   # upper bound on the number of posts to retrieve

# shift by 2 days as recent submissions (within 1 day) will not have comments saved
posts_df = get_subreddit_submissions(
    subreddit,
    limit=num_posts,
    period_length=num_days,
    period_shift=2,
)



In [137]:
# Notebook display of the retrieved submissions DataFrame.
posts_df

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,gilded,author_cakeday,distinguished,suggested_sort,crosspost_parent,crosspost_parent_list,category,top_awarded_type,poll_data,steward_reports
0,[],False,nf_hades,,[],,text,t2_hriq1b,False,False,...,,,,,,,,,,
1,[],False,MyLittleDeku,,[],,text,t2_7dj62vj2,False,False,...,,,,,,,,,,
2,[],False,lilirucaarde12,,[],,text,t2_6i04uaxw,False,False,...,,,,,,,,,,
3,[],False,[deleted],,,,,,,,...,,,,,,,,,,
4,[],False,sirdimpleton,,[],,text,t2_bznmn4i,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214871,[],False,Meliodas4TheMob,,[],,text,t2_3rmp6p2n,False,False,...,,,,,,,,,,[]
214872,[],False,theintrovert42,,[],,text,t2_3nq8o0hh,False,False,...,,,,,,,,,,[]
214873,[],False,0ri00n,,[],,text,t2_3l83r5j7,False,False,...,,,,,,,,,,[]
214874,[],False,Daniel2k03,,[],,text,t2_3q4nqunc,False,False,...,,,,,,,,,,[]


In [144]:
def save_posts(posts_df, file_name):
    """
    Description:
        Saves PSAW posts held in a DataFrame to a ';'-separated csv file

    Input:
        posts_df - a dataframe of PSAW Posts; it is not modified
        file_name - name of output csv, including relative path, without the
                    '.csv' extension (it is appended here)
    """
    cleaned_df = posts_df
    if 'title' in posts_df.columns:
        # Replace every run of ';' in the post titles with '.' so titles never
        # contain the csv separator. Assigning to the rows yielded by
        # iterrows() only changes per-row copies, so the replacement is done
        # on the whole column and stored in a copy of the frame.
        cleaned_df = posts_df.assign(
            title=posts_df['title'].str.replace(r';+', '.', regex=True)
        )

    csv_file = file_name + '.csv'

    # All columns are written in their existing order.
    cleaned_df.to_csv(csv_file, sep=';', header=True, index=False)
    print('Posts Saved Successfully')

In [335]:
# Write the submissions to disk; save_posts appends '.csv' to the path given here.
save_posts(posts_df, f'./data/{subreddit}_submissions_{num_posts}_{num_days}')