In [1]:
import requests
from datetime import datetime
import traceback
import time
import json
import logging
from tqdm.notebook import tqdm
import pandas as pd

## Set up Logging

In [2]:
logger = logging.getLogger("my app")
logger.setLevel("DEBUG")
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

## Global Variables

In [3]:
subreddit = "dataengineering"

In [4]:
base_url = "https://api.pushshift.io/reddit/submission/search?\
&subreddit={subreddit}\
&limit={limit}\
&sort={sort}\
&sort_type={sort_type}\
&before={end_timestamp}\
&{start_timestamp}"

In [5]:
valid_columns = [
    'author',
    'author_flair_css_class',
    'author_flair_text',
    'author_premium',
    'created_utc',
    'domain',
    'full_link',
    'id',
    'is_robot_indexable',
    'is_self',
    'is_video',
    'link_flair_text',
    'no_follow',
    'num_comments',
    'permalink',
    'post_hint',
    'score',
    'selftext',
    'title',
    'url',
    'url_overridden_by_dest',
    'removed_by_category',
    'banned_by',
    'removed_by',
]

## helper functions

In [6]:
def _get_post(post):
    filtered_posts = {k:v for (k,v) in post.items() if k in valid_columns}
    return filtered_posts
                     
def get_posts(json_objects):
    posts = []
    for post in tqdm(json_objects, len(posts)):
        post_args = _get_post(post)
        posts.append(post_args)
    return posts

## Get all subreddit posts

In [7]:
def get_all_subreddit_posts(pushshift_args, full_refresh = True):
    all_posts = []
    start_time = datetime.utcnow()
    previous_epoch = int(start_time.timestamp())
    if not full_refresh:
        pushshift_args["start_timestamp"] = None
    while True: # loop through requests until empty
        pushshift_args["end_timestamp"] = previous_epoch
        request_url = base_url.format(**pushshift_args)
        json_text = requests.get(request_url) # get reddit data via request from pushshift
        time.sleep(1)
        try: # parse request to json
            json_request = json_text.json()
        except json.decoder.JSONDecodeError as e:
            logger.warning("Unable to Decode json request:", e)
            time.sleep(1)
            continue
            
        json_objects = json_request.get('data', [])
        if not json_objects: # check if result set is empty
            logger.warning("Empty json request")
            break
            
        previous_epoch = json_objects[-1]['created_utc'] - 1
        total_posts = len(json_objects)
            
        posts = get_posts(json_objects) # get posts from request
        all_posts.extend(posts)
        logger.info(f"Processed {len(all_posts)} posts")

    logger.info(f"Saved {len(all_posts)} posts")
    return all_posts

In [8]:
pushshift_args = {
    'subreddit': 'dataengineering',
    'limit': 1000,
    'sort': 'desc',
    'sort_type': 'created_utc',
    'end_timestamp': None,
    'start_timestamp': None
}

In [9]:
all_posts = get_all_subreddit_posts(pushshift_args)

  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:09,186 - my app - INFO - Processed 100 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:11,098 - my app - INFO - Processed 200 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:12,973 - my app - INFO - Processed 300 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:14,293 - my app - INFO - Processed 400 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:17,648 - my app - INFO - Processed 500 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:19,554 - my app - INFO - Processed 600 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:21,034 - my app - INFO - Processed 700 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:22,922 - my app - INFO - Processed 800 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:26,280 - my app - INFO - Processed 900 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:29,995 - my app - INFO - Processed 1000 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:32,008 - my app - INFO - Processed 1100 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:33,683 - my app - INFO - Processed 1200 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:35,317 - my app - INFO - Processed 1300 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:37,237 - my app - INFO - Processed 1400 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:38,865 - my app - INFO - Processed 1500 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:40,223 - my app - INFO - Processed 1600 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:41,961 - my app - INFO - Processed 1700 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:43,808 - my app - INFO - Processed 1800 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:45,751 - my app - INFO - Processed 1900 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:47,670 - my app - INFO - Processed 2000 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:49,100 - my app - INFO - Processed 2100 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:50,612 - my app - INFO - Processed 2200 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:53,661 - my app - INFO - Processed 2300 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:55,298 - my app - INFO - Processed 2400 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:56,874 - my app - INFO - Processed 2500 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:48:58,234 - my app - INFO - Processed 2600 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:00,008 - my app - INFO - Processed 2700 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:01,351 - my app - INFO - Processed 2800 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:03,302 - my app - INFO - Processed 2900 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:04,980 - my app - INFO - Processed 3000 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:06,445 - my app - INFO - Processed 3100 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:08,153 - my app - INFO - Processed 3200 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:09,798 - my app - INFO - Processed 3300 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:11,732 - my app - INFO - Processed 3400 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:13,642 - my app - INFO - Processed 3500 posts


  0%|          | 0/99 [00:00<?, ?it/s]

2021-12-26 14:49:15,563 - my app - INFO - Processed 3599 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:17,276 - my app - INFO - Processed 3699 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:19,137 - my app - INFO - Processed 3799 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:21,432 - my app - INFO - Processed 3899 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:23,404 - my app - INFO - Processed 3999 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:24,768 - my app - INFO - Processed 4099 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:26,203 - my app - INFO - Processed 4199 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:29,466 - my app - INFO - Processed 4299 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:31,355 - my app - INFO - Processed 4399 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:33,251 - my app - INFO - Processed 4499 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:34,624 - my app - INFO - Processed 4599 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:36,103 - my app - INFO - Processed 4699 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:37,993 - my app - INFO - Processed 4799 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:41,631 - my app - INFO - Processed 4899 posts


  0%|          | 0/99 [00:00<?, ?it/s]

2021-12-26 14:49:43,541 - my app - INFO - Processed 4998 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:45,434 - my app - INFO - Processed 5098 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:47,407 - my app - INFO - Processed 5198 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:48,831 - my app - INFO - Processed 5298 posts


  0%|          | 0/99 [00:00<?, ?it/s]

2021-12-26 14:49:50,556 - my app - INFO - Processed 5397 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:51,921 - my app - INFO - Processed 5497 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:53,790 - my app - INFO - Processed 5597 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:55,372 - my app - INFO - Processed 5697 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:57,344 - my app - INFO - Processed 5797 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:49:59,240 - my app - INFO - Processed 5897 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:01,258 - my app - INFO - Processed 5997 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:04,608 - my app - INFO - Processed 6097 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:06,714 - my app - INFO - Processed 6197 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:08,861 - my app - INFO - Processed 6297 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:10,782 - my app - INFO - Processed 6397 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:15,158 - my app - INFO - Processed 6497 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:16,480 - my app - INFO - Processed 6597 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:18,365 - my app - INFO - Processed 6697 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:20,274 - my app - INFO - Processed 6797 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:21,869 - my app - INFO - Processed 6897 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:23,432 - my app - INFO - Processed 6997 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:25,394 - my app - INFO - Processed 7097 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:27,076 - my app - INFO - Processed 7197 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:28,992 - my app - INFO - Processed 7297 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:30,409 - my app - INFO - Processed 7397 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:32,046 - my app - INFO - Processed 7497 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:33,440 - my app - INFO - Processed 7597 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:35,198 - my app - INFO - Processed 7697 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:36,505 - my app - INFO - Processed 7797 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:38,671 - my app - INFO - Processed 7897 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:40,632 - my app - INFO - Processed 7997 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:41,936 - my app - INFO - Processed 8097 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:44,023 - my app - INFO - Processed 8197 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:45,407 - my app - INFO - Processed 8297 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:47,378 - my app - INFO - Processed 8397 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:49,024 - my app - INFO - Processed 8497 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:52,710 - my app - INFO - Processed 8597 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:54,010 - my app - INFO - Processed 8697 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:55,678 - my app - INFO - Processed 8797 posts


  0%|          | 0/99 [00:00<?, ?it/s]

2021-12-26 14:50:57,074 - my app - INFO - Processed 8896 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:50:58,746 - my app - INFO - Processed 8996 posts


  0%|          | 0/100 [00:00<?, ?it/s]

2021-12-26 14:51:00,349 - my app - INFO - Processed 9096 posts


  0%|          | 0/10 [00:00<?, ?it/s]

2021-12-26 14:51:02,011 - my app - INFO - Processed 9106 posts
2021-12-26 14:51:03,224 - my app - INFO - Saved 9106 posts


### Create dataframe from results

In [10]:
df = pd.DataFrame(all_posts)

### Drop Duplicates

In [11]:
df_deduped = df.drop_duplicates(['author', 'title', 'created_utc'])

### Save Dataframe to CSV

In [12]:
df_deduped.to_csv("reddit_de_posts_filtered.csv", index = False)

## Save results to Postgres Database

### Connect to Postgres Database

In [13]:
import psycopg2
import os
from dotenv import load_dotenv

load_dotenv("./postgres/.env")

True

In [14]:
PG_USER = os.getenv('POSTGRES_USER')
PG_PW = os.getenv('POSTGRES_PASSWORD')
PG_DB = os.getenv('POSTGRES_DB')

In [15]:
postgres_conn_args = {
    "host": "localhost",
    "database": PG_DB,
    "user": PG_USER,
    "password": PG_PW,
    "port": 5432
}

In [16]:
pg_conn = psycopg2.connect(**postgres_conn_args)

### Create table

In [17]:
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS raw_data.reddit_de (
    id char(6) NOT NULL,
    author text NOT NULL,
    author_flair_css_class text,
    author_flair_text text,
    author_premium text,
    created_utc bigint NOT NULL,
    domain text,
    full_link text NOT NULL,
    is_robot_indexable bool,
    is_self bool,
    is_video bool,
    link_flair_text text,
    no_follow text,
    num_comments int,
    permalink text NOT NULL,
    post_hint text,
    score int,
    selftext text,
    title text NOT NULL,
    url text NOT NULL,
    url_overridden_by_dest text,
    removed_by_category text,
    banned_by text,
    removed_by text
);
"""

In [18]:
with pg_conn:
    with pg_conn.cursor() as pg_cur:
            pg_cur.execute(CREATE_TABLE_SQL)

### Copy Contents to table

In [19]:
COPY_SQL = """
COPY raw_data.reddit_de {columns}
FROM STDIN
WITH CSV HEADER
DELIMITER as ','
"""

In [20]:
column_names = f"({', '.join(list(df_deduped.columns))})"

In [21]:
with pg_conn:
    with pg_conn.cursor() as pg_cur:
        with open("reddit_de_posts_filtered.csv",  'rb') as tmp_file:
            pg_cur.copy_expert(COPY_SQL.format(columns=column_names), tmp_file)

### Clean Up

In [22]:
with pg_conn:
    with pg_conn.cursor() as pg_cur:
        pg_cur.execute("ANALYZE")

In [23]:
pg_conn.close()

In [24]:
!rm -f 'reddit_de_posts_filtered.csv'

In [25]:
with pg_conn:
    df = pd.read_sql("SELECT * FROM raw_data.reddit_de", pg_conn)

InterfaceError: connection already closed

In [None]:
df.shape

In [None]:
df.id.nunique()

In [None]:
df.post_hint.value_counts()

In [None]:
df.columns

In [None]:
test = df[~df.url_overridden_by_dest.isnull()]

In [None]:
test.full_link.equals(test.url_overridden_by_dest)

In [None]:
df.full_link.str[22:].equals(df.permalink)

In [None]:
df.permalink.iloc[0]

In [None]:
df.iloc[0].full_link

In [None]:
df.iloc[0].permalink

In [None]:
df.no_follow.value_counts()

In [None]:
import numpy as np

In [None]:
np.where(df[df.is_self].selftext.str.len() == 1)

In [None]:
df[df.is_self].selftext.iloc[231]

In [None]:
df[df.title.str.len() < 5].title

In [None]:
df.iloc[2397]

In [None]:
df.is_robot_indexable.value_counts()

In [None]:
pd.DataFrame(df.author.value_counts())

In [None]:
df[df.author == "[deleted]"]