In [2]:
import numpy as np
import pandas as pd
import syft as sy
import time
import os
import dotenv


In [None]:
# Connection settings. The credentials come from environment variables
# (dotenv.load_dotenv() is called first so a .env file can supply them),
# which keeps them out of the notebook itself.
dotenv.load_dotenv()
URL = "https://reddit-for-researchers.snooguts.net"
EMAIL = os.environ.get("EMAIL")
PASSWORD = os.environ.get("PASSWORD")

# Original author's note: the "password" keyword argument is optional; when
# it is not provided, the browser prompts you for input instead.
client = sy.login(url=URL, email=EMAIL, password=PASSWORD)

In [None]:
# Display `client.requests` as the cell output. Each item is later read for
# its `code.service_func_name` and `id` (see name_to_id_dict).
client.requests

In [7]:
def name_to_id_dict(client):
    """Map each request's service function name to its request id (as a string).

    Parameters
    ----------
    client
        Object whose ``requests`` attribute is iterable. Every item must
        expose ``code.service_func_name`` and ``id``.

    Returns
    -------
    dict
        ``{service_func_name: str(request.id)}`` in first-seen key order.
        When several requests share a function name, the last one iterated
        wins and a ``UserWarning`` listing the duplicated names is emitted,
        so the overwrite is no longer silent.
    """
    import warnings  # local import keeps this cell self-contained

    mapping = {}
    duplicates = []
    for request in client.requests:
        name = request.code.service_func_name
        if name in mapping and name not in duplicates:
            duplicates.append(name)
        # Re-assigning an existing key keeps its position but replaces the
        # value, which is exactly what a dict comprehension would do.
        mapping[name] = str(request.id)

    if duplicates:
        warnings.warn(
            "Multiple requests share a function name; keeping the last one for: "
            + ", ".join(map(str, duplicates)),
            stacklevel=2,
        )
    return mapping


def query_r4r(function_name, name_to_id_dict, client, max_retries=None, retry_delay=10):
    """Fetch the result of a request's code, re-logging in and retrying on failure.

    The request is looked up by UID, its ``code`` is called with
    ``blocking=False``, the returned job is waited on, and ``.get()`` of the
    job's result is returned.

    Parameters
    ----------
    function_name
        Key into ``name_to_id_dict`` identifying the request to run.
    name_to_id_dict
        Mapping of function name -> request id string (the output of the
        ``name_to_id_dict`` function).
    client
        Logged-in client used for the first attempt. After a failure a fresh
        client is created with ``sy.login`` using the notebook-level ``URL``,
        ``EMAIL`` and ``PASSWORD`` globals.
    max_retries
        Maximum number of retries after a failed attempt. ``None`` (the
        default) retries forever, matching the original behaviour.
    retry_delay
        Seconds to sleep before re-logging in and retrying (default 10).

    Returns
    -------
    Whatever ``results.get()`` returns; callers in this notebook treat it as
    a pandas DataFrame.

    Raises
    ------
    KeyError
        If ``function_name`` is not in ``name_to_id_dict``. This is raised
        immediately instead of being retried.
    Exception
        The last error, once ``max_retries`` is exhausted; also any error
        raised by the re-login itself.
    """
    # Resolve the request id once, outside the retry loop: an unknown name is
    # a caller error that no amount of re-logging in will fix. Inside the
    # catch-all handler it would be retried forever.
    try:
        request_id = name_to_id_dict[function_name]
    except KeyError:
        raise KeyError(f"No request found for function name {function_name!r}") from None
    uid = sy.UID(request_id)

    attempt = 0
    while True:
        try:
            request = client.api.services.request.get_by_uid(uid=uid)
            job = request.code(blocking=False)
            results = job.wait()
            return results.get()
        # KeyboardInterrupt derives from BaseException, not Exception, so it
        # propagates without needing its own handler.
        except Exception as ex:
            attempt += 1
            if max_retries is not None and attempt > max_retries:
                raise
            print(ex)
            time.sleep(retry_delay)
            # Start the next attempt from a fresh session.
            client = sy.login(url=URL, email=EMAIL, password=PASSWORD)

In [None]:
# Build the function-name -> request-id lookup that later cells pass to
# query_r4r, and show it as the cell output.
funcdict = name_to_id_dict(client)
funcdict

In [None]:
# Manual one-off run of the "avg_comment_score_pg" request: look it up by
# UID, call its code non-blocking, wait for the job, and display the fetched
# result. Same steps as query_r4r, but without the retry loop.
request = client.api.services.request.get_by_uid(uid=sy.UID(funcdict["avg_comment_score_pg"]))
job = request.code(blocking=False)
results = job.wait()
results.get()

In [None]:
# Posts
#
# For each year 2020-2023, load the per-year posts table from its HDF5 cache
# in data/, or query it via query_r4r when the cache is missing or unreadable,
# cast its columns, and write the cache. The yearly tables are then
# concatenated into posts_df and saved to data/posts.h5.

os.makedirs("data", exist_ok=True)  # to_hdf does not create the directory

posts_dict = {}
for year in range(2020, 2024):
    key = f"pg_wiki_{year}"
    print(key)
    cache_path = f"data/{key}.h5"

    try:
        posts_year = pd.read_hdf(cache_path)
    except FileNotFoundError:
        posts_year = None  # not cached yet: the normal first-run path
    except Exception as ex:
        # `except Exception` rather than a bare `except:` so Ctrl-C still
        # interrupts the cell; the reason for the re-query is reported.
        print(f"could not read {cache_path} ({ex!r}); re-querying")
        posts_year = None

    if posts_year is None:
        posts_year = query_r4r(key, funcdict, client)
        posts_year = posts_year.astype({
            'score': np.int64,
            'gildings': np.int64,
            'num_comments': np.int64,
            'nsfw': np.bool_,
            'self': np.bool_,
            'video': np.bool_,
            'locked': np.bool_,
            'spoiler': np.bool_,
            'sticky': np.bool_,
        })
        for c in ['created_at', 'updated_at']:
            posts_year[c] = pd.to_datetime(posts_year[c]).astype('datetime64[ns]')
        posts_year.to_hdf(cache_path, key='df', mode='w')

    posts_dict[key] = posts_year

posts_df = pd.concat(posts_dict.values())
posts_df.to_hdf("data/posts.h5", key='df', mode='w')


In [None]:
# Reload the combined posts table from data/posts.h5 (the file written by the
# posts cell). Presumably this lets the download cell be skipped when the
# file already exists; confirm.
posts_df = pd.read_hdf("data/posts.h5")


In [None]:
# Comments
#
# For each month of 2020-2023, load the monthly comments table from its HDF5
# cache in data/, or query it via query_r4r when the cache is missing or
# unreadable, cast its columns, and write the cache. The monthly tables are
# concatenated into comments_df, which is then written out as four
# consecutive row slices: data/comments_1.h5 .. data/comments_4.h5.

os.makedirs("data", exist_ok=True)  # to_hdf does not create the directory

comments_dict = {}
for year in range(2020, 2024):
    for month in range(1, 13):
        key = f"pg_wiki_comments_{year}{month:02d}"
        print(key)
        cache_path = f"data/{key}.h5"

        try:
            comments_month = pd.read_hdf(cache_path)
        except FileNotFoundError:
            comments_month = None  # not cached yet: the normal first-run path
        except Exception as ex:
            # `except Exception` rather than a bare `except:` so Ctrl-C still
            # interrupts the cell; the reason for the re-query is reported.
            print(f"could not read {cache_path} ({ex!r}); re-querying")
            comments_month = None

        if comments_month is None:
            comments_month = query_r4r(key, funcdict, client)
            comments_month = comments_month.astype({'score': np.int64, 'gilded': np.bool_})
            for c in ['created_at', 'last_modified_at']:
                comments_month[c] = pd.to_datetime(comments_month[c]).astype('datetime64[ns]')
            comments_month.to_hdf(cache_path, key='df', mode='w')

        comments_dict[key] = comments_month

comments_df = pd.concat(comments_dict.values())

# Slice boundaries are the same as before: [0, n//4, n//2, n//4*3, n].
n_comments = len(comments_df)
bounds = [0, n_comments // 4, n_comments // 2, n_comments // 4 * 3, n_comments]
for part, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:]), start=1):
    part_path = f"data/comments_{part}.h5"
    # Remove a stale file only if one exists; an unconditional os.remove
    # raises FileNotFoundError the first time this cell is run.
    if os.path.exists(part_path):
        os.remove(part_path)
    comments_df.iloc[start:stop].to_hdf(part_path, key='df', mode='w')

In [None]:
# Replies (single request)
#
# Fetch the whole replies table through the 'pg_wiki_replies' request, cast
# its columns, and store it in data/replies.h5.

replies_df = query_r4r('pg_wiki_replies', funcdict, client)
replies_df['score'] = replies_df['score'].astype(np.int64)
replies_df['gilded'] = replies_df['gilded'].astype(np.bool_)
for c in ('created_at', 'last_modified_at'):
    replies_df[c] = pd.to_datetime(replies_df[c]).astype('datetime64[ns]')
replies_df.to_hdf("data/replies.h5", key='df', mode='w')

In [None]:
# Replies (one request per year)
#
# For each year 2020-2023, load the per-year replies table from its HDF5
# cache in data/, or query it via query_r4r when the cache is missing or
# unreadable, cast its columns, and write the cache. The yearly tables are
# concatenated into replies_df, which replaces data/replies.h5.

os.makedirs("data", exist_ok=True)  # to_hdf does not create the directory

replies_dict = {}
for year in range(2020, 2024):
    key = f"pg_wiki_replies_{year}"
    print(key)
    cache_path = f"data/{key}.h5"

    try:
        replies_year = pd.read_hdf(cache_path)
    except FileNotFoundError:
        replies_year = None  # not cached yet: the normal first-run path
    except Exception as ex:
        # `except Exception` rather than a bare `except:` so Ctrl-C still
        # interrupts the cell; the reason for the re-query is reported.
        print(f"could not read {cache_path} ({ex!r}); re-querying")
        replies_year = None

    if replies_year is None:
        replies_year = query_r4r(key, funcdict, client)
        replies_year = replies_year.astype({'score': np.int64, 'gilded': np.bool_})
        for c in ['created_at', 'last_modified_at']:
            replies_year[c] = pd.to_datetime(replies_year[c]).astype('datetime64[ns]')
        replies_year.to_hdf(cache_path, key='df', mode='w')

    replies_dict[key] = replies_year

replies_df = pd.concat(replies_dict.values())
# Remove the previous file only if it exists; an unconditional os.remove
# raises FileNotFoundError when data/replies.h5 has not been written yet.
if os.path.exists("data/replies.h5"):
    os.remove("data/replies.h5")
replies_df.to_hdf("data/replies.h5", key='df', mode='w')

In [None]:
# Replies (one request per year), rebuilt from the per-year caches.
#
# This cell used to wrap the body in `for month in range(1, 13)`, but the key
# never included the month, so every year was read or queried 12 times with
# the same result. The redundant loop is removed; the output is unchanged.
# NOTE(review): if per-month requests were intended (as in the comments
# cell's `{year}{month:02d}` keys), the key needs a month suffix; confirm
# that such request names exist before changing it.

os.makedirs("data", exist_ok=True)  # to_hdf does not create the directory

replies_dict = {}
for year in range(2020, 2024):
    key = f"pg_wiki_replies_{year}"
    print(key)
    cache_path = f"data/{key}.h5"

    try:
        replies_year = pd.read_hdf(cache_path)
    except FileNotFoundError:
        replies_year = None  # not cached yet: the normal first-run path
    except Exception as ex:
        # `except Exception` rather than a bare `except:` so Ctrl-C still
        # interrupts the cell; the reason for the re-query is reported.
        print(f"could not read {cache_path} ({ex!r}); re-querying")
        replies_year = None

    if replies_year is None:
        replies_year = query_r4r(key, funcdict, client)
        replies_year = replies_year.astype({'score': np.int64, 'gilded': np.bool_})
        for c in ['created_at', 'last_modified_at']:
            replies_year[c] = pd.to_datetime(replies_year[c]).astype('datetime64[ns]')
        replies_year.to_hdf(cache_path, key='df', mode='w')

    replies_dict[key] = replies_year

replies_df = pd.concat(replies_dict.values())
# Remove the previous file only if it exists; an unconditional os.remove
# raises FileNotFoundError when data/replies.h5 has not been written yet.
if os.path.exists("data/replies.h5"):
    os.remove("data/replies.h5")
replies_df.to_hdf("data/replies.h5", key='df', mode='w')

In [None]:
# Create the SQLite database.
#
# Writes posts_df, comments_df and replies_df (built by the cells above) into
# wikireddit.db as the tables 'posts', 'comments' and 'replies'. Because of
# if_exists='replace', an existing table with the same name is replaced.
# NOTE(review): `conn` is never closed in this cell. Close it once nothing
# else needs the connection.

import sqlite3

conn = sqlite3.connect('wikireddit.db')
posts_df.to_sql('posts', conn, if_exists='replace')
comments_df.to_sql('comments', conn, if_exists='replace')
replies_df.to_sql('replies', conn, if_exists='replace')

In [1]:
# Standalone reload of the combined posts table from data/posts.h5. The cell
# imports pandas itself, so it does not depend on the import cell at the top.
# NOTE(review): it duplicates the earlier `posts_df = pd.read_hdf(...)` cell
# and its execution count (In [1]) is out of order. Presumably it was run on
# a fresh kernel; confirm whether it is still needed.
import pandas as pd
posts_df = pd.read_hdf("data/posts.h5")