# How do mothers and fathers talk about parenting to different audiences? 

### Import modules

In [None]:
## Load needed modules
import re
import string
import nltk
import requests as rq
import json
import time
import sys ## for printing only
import tqdm ## This is for a progress bar
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import check_array
from sklearn.preprocessing import normalize
from sklearn.feature_extraction import text

# 1. Collecting Reddit data

In [None]:
def parse_date(date, format='human'):
    """
    Convert a timestamp between human-readable and epoch representations.

    Both directions use the local time zone of the machine running the code
    (``time.mktime`` / ``time.localtime``), so a value converted one way and
    back again yields the original value.

    Parameters:
    ===========
    date: str or int
        The timestamp to convert: a 'YYYY-MM-DD HH:MM:SS' string when
        ``format`` is 'human', or epoch seconds (anything ``int()`` accepts)
        when ``format`` is 'epoch'.
    format: str
        The format of the *input* value. By default it takes the value 'human'
        and the other option is 'epoch'.

    Returns:
    ========
    str
        Epoch seconds as a string of digits for 'human' input, or a
        'YYYY-MM-DD HH:MM:SS' string for 'epoch' input.

    Raises:
    =======
    ValueError
        If ``format`` is neither 'human' nor 'epoch', or if a 'human' date
        does not match the expected pattern.
    """
    pattern = '%Y-%m-%d %H:%M:%S'
    if format == 'human':
        return str(int(time.mktime(time.strptime(date, pattern))))
    if format == 'epoch':
        return time.strftime(pattern, time.localtime(int(date)))
    # Fail loudly instead of silently returning None for a mistyped format.
    raise ValueError(f"format must be 'human' or 'epoch', got {format!r}")
    
def collect_data(source_url, payload):
    """
    Send a GET request to a Pushshift endpoint and return the collected data.

    Depending on the status code of the response the function either returns
    the list of mappings found under the 'data' key, waits 60 seconds and
    tries again (status codes 429, 502 and 523), or returns a one-element list
    describing the error.

    Parameters:
    ===========
    source_url: str
        A string with url of the Pushshift endpoint.
    payload: dict
        A mapping with parameters passed to the Pushshift API. If it has no
        'after' key, the request uses '2005-06-23 00:00:00' as the lower
        bound. The mapping itself is not modified.

    Returns:
    ========
    list
        The list of mappings stored under 'data' in the JSON response, or
        ``[{'status': <status code>, 'message': <raw response body>}]`` when
        the request failed with a status code that is not retried.
    """
    # Work on a copy so that filling in the default does not alter the
    # caller's dictionary.
    params = dict(payload)
    if 'after' not in params:
        params['after'] = parse_date('2005-06-23 00:00:00')
    retry_statuses = (429, 502, 523)
    # Loop instead of recursing so that a long outage cannot exhaust the
    # recursion limit.
    while True:
        response = rq.get(source_url, params=params)
        if response.status_code == 200:
            # Short pause after every successful request before returning.
            time.sleep(1)
            return json.loads(response.text)['data']
        if response.status_code not in retry_statuses:
            return [{'status': response.status_code, 'message': response.content}]
        # Count down the 60-second break on a single console line.
        for i in range(60, 0, -1):
            print(f'\rThe compulsory break finishes in {str(i)} seconds', end ='', flush=True)
            time.sleep(1)
        # Blank out the countdown line before the next attempt.
        print('\r' + 100*' ')

In [None]:
# Collect the comments published on r/daddit in 2020 and store them as JSON lines.
# The file is opened in write mode, so earlier content is replaced on every run.
with open('daddit_comments_2020.jl', 'w') as file:
    source_url = 'https://api.pushshift.io/reddit/search/comment/'
    payload = { 'subreddit' : 'daddit',
                'after' : parse_date('2020-01-01 00:00:00'),
                'before' : parse_date('2021-01-01 00:00:00'),
                'fields' : ["author", "created_utc", "subreddit", "body"],
                'sort_type' : "created_utc",
                'size' : 500}
    ## Set the progress bar (it counts the comments written to the file)
    pbar = tqdm.tqdm(position=0, leave=True)
    ## Request page after page until the API returns no more comments
    # NOTE(review): moving 'after' to the last item of each page assumes the API
    # returns comments in ascending created_utc order - confirm, or add an
    # explicit sort direction to the payload.
    while True:
        daddit_comments = collect_data(source_url = source_url, payload = payload)
        if len(daddit_comments) == 0:
            break
        ## Check if we got data from Reddit or a strange status code
        if 'created_utc' not in daddit_comments[0]:
            ## Print out the strange status code and its message, then stop;
            ## repeating the loop with the same error payload would never end
            print(f"Something went wrong. The status code error was "
                  f"{daddit_comments[0].get('status')}: {daddit_comments[0].get('message')}.")
            break
        ## Remember the epoch time of the last collected comment before it is
        ## reformatted; the raw value avoids a lossy round trip via local time
        after = str(daddit_comments[-1]['created_utc'])
        ## Update the payload after field for the next request
        payload['after'] = after
        ## Write out the collected data to the file
        for line in daddit_comments:
            if 'created_utc' in line:
                line['created_utc'] = parse_date(line['created_utc'], format = 'epoch')
                file.write(json.dumps(line) + '\n')
        ## Update the progress bar
        pbar.update(len(daddit_comments))
    pbar.close()

In [None]:
# Load the collected r/daddit comments (one JSON object per line) into a DataFrame
daddit_comments_2020 = pd.read_json('daddit_comments_2020.jl', lines=True)

In [None]:
# Deduplicate the comment authors and order them alphabetically, ignoring case.
# A sorted list (rather than a set) fixes the order, so if something breaks
# the collection can be continued with the rest of the authors.
authors_daddit_2020 = sorted(set(daddit_comments_2020["author"]), key=str.lower)

In [None]:
#collect the comments published on r/Parenting in 2020 by each author who posted on r/Daddit in 2020
## Open the file in append mode, so results from an earlier (interrupted) run are kept
with open('authors_daddit_comments_2020.jl', 'a') as file:
    source_url = 'https://api.pushshift.io/reddit/search/comment/'
    ## Set the progress bar (one step per processed author)
    pbar = tqdm.tqdm(total=len(authors_daddit_2020), position=0, leave=True)
    #take each author of the list and collect their comments published on r/Parenting subreddit in 2020
    for author in authors_daddit_2020:
        ## 'before' is the first second of 2021 so that 31 December is included
        payload = { 'after' : parse_date('2020-01-01 00:00:00'),
                    'before' : parse_date('2021-01-01 00:00:00'),
                    'subreddit' : "Parenting",
                    'author' : author,
                    'fields' : ["author", "created_utc", "subreddit", "body"],
                    'sort_type' : "created_utc",
                    'size' : 100}
        ## Request page after page until the API returns no more comments
        # NOTE(review): moving 'after' to the last item of each page assumes the
        # API returns comments in ascending created_utc order - confirm, or add
        # an explicit sort direction to the payload.
        while True:
            daddit_comments = collect_data(source_url = source_url, payload = payload)
            if len(daddit_comments) == 0:
                break
            ## Check if we got data from Reddit or a strange status code
            if 'created_utc' not in daddit_comments[0]:
                ## Print out the strange status code and its message, then move
                ## on to the next author; repeating the loop with the same
                ## error payload would never end
                print(f"Something went wrong for author {author}. The status code error was "
                      f"{daddit_comments[0].get('status')}: {daddit_comments[0].get('message')}.")
                break
            ## Remember the epoch time of the last collected comment before it
            ## is reformatted; the raw value avoids a lossy round trip via local time
            after = str(daddit_comments[-1]['created_utc'])
            ## Update the payload after field for the next request
            payload['after'] = after
            ## Write out the collected data to the file
            for line in daddit_comments:
                if 'created_utc' in line:
                    line['created_utc'] = parse_date(line['created_utc'], format = 'epoch')
                    file.write(json.dumps(line) + '\n')
        ## Update the progress bar
        pbar.update(1)
    pbar.close()