# Reddit Crawler
Uses PRAW to comb through Reddit posts, then PIL to process the downloaded images.

In [2]:
import csv
import datetime
import os

import praw
from PIL import Image

Configuration for the crawler and the image downloads.

In [3]:
# Text file listing the subreddits to crawl, one name per line.
SUBREDDITS_LIST_PATH = "./subreddit_list.txt"
# Number of qualifying posts recorded per subreddit before moving to the next one.
MAX_IMG = 10
# A post is kept only if the last three characters of its URL are one of these.
ACCEPTABLE_EXTENSIONS = ["jpg", "png"]
# Bounding box handed to PIL's thumbnail() when downloaded images are shrunk.
MAX_RESOLUTION = (1024, 1024)

# Note: you will need your own praw.ini config file to use this command
reddit = praw.Reddit("cs4243")
# Headers sent with every image request.
req_header = { "User-Agent": "CS4243 crawler bot", "From": "insert email here" }

with open(SUBREDDITS_LIST_PATH, "r") as f:
    # Strip whitespace and drop blank lines: an empty string is not a valid
    # subreddit name and would otherwise be passed to reddit.subreddit("").
    sr_list = [name for name in (line.strip() for line in f) if name]

Get image metadata first, before deciding which images to download. Because post scores follow a long-tailed distribution, we collect metadata for up to `MAX_IMG` posts per subreddit (500 is the intended value; note that the configuration cell above currently sets `MAX_IMG = 10`) so that the calculated percentiles are representative and there are enough images in the popular class.

In [6]:
# CAUTION: THIS CODE SEGMENT CAN TAKE MORE THAN TEN MINUTES TO RUN!
#
# Walks the "new" listing of every subreddit in sr_list and records metadata
# (id, score, subreddit, url, creation time, upvote ratio) for up to MAX_IMG
# qualifying posts per subreddit into a dated CSV file.

# Posts must be at least one week old, for score stability.
MIN_POST_AGE_SECONDS = 7 * 24 * 60 * 60  # 604800

now = datetime.datetime.now()
unixnow = int(datetime.datetime.timestamp(now))
PRELIMINARY_PATH = f"crawl_{now.month:02}{now.day:02}.csv"

# newline="" hands line-ending control to the csv module, as its docs require.
with open(PRELIMINARY_PATH, 'w', newline="") as datafile:
    # csv.writer quotes any field containing a comma or quote (URLs can), so
    # the file stays parseable; lineterminator keeps the rows "\n"-terminated.
    writer = csv.writer(datafile, lineterminator="\n")
    writer.writerow(["ID", "SCORE", "SUBREDDIT", "URL", "UNIX TIME", "UPVOTE RATIO"])
    for sr in sr_list:
        count = 0
        for submission in reddit.subreddit(sr).new(limit=None):
            # Skip posts that are too recent.
            if (unixnow - submission.created_utc) <= MIN_POST_AGE_SECONDS:
                continue
            # Skip posts whose URL does not end in an accepted image extension.
            if submission.url[-3:] not in ACCEPTABLE_EXTENSIONS:
                continue

            srname = submission.subreddit.display_name.lower()
            writer.writerow([
                submission.id,
                submission.score,
                srname,
                submission.url,
                submission.created_utc,
                submission.upvote_ratio,
            ])
            count += 1

            # Flush periodically so a crash mid-crawl loses at most a few rows.
            if count % 10 == 0:
                datafile.flush()
                print(f"Sourcing images in {sr}: {count}/{MAX_IMG}")
            if count == MAX_IMG:
                datafile.flush()
                break

Sourcing images in ImaginaryArchers: 10/10
Sourcing images in ImaginaryAssassins: 10/10
Sourcing images in ImaginaryAstronauts: 10/10
Sourcing images in ImaginaryKnights: 10/10
Sourcing images in ImaginaryLovers: 10/10
Sourcing images in ImaginaryMythology: 10/10
Sourcing images in ImaginaryNobles: 10/10
Sourcing images in ImaginaryScholars: 10/10
Sourcing images in ImaginarySoldiers: 10/10
Sourcing images in ImaginaryWarriors: 10/10
Sourcing images in ImaginaryWitches: 10/10
Sourcing images in ImaginaryWizards: 10/10
Sourcing images in ImaginaryAngels: 10/10
Sourcing images in ImaginaryDwarves: 10/10
Sourcing images in ImaginaryElves: 10/10
Sourcing images in ImaginaryFaeries: 10/10
Sourcing images in ImaginaryHumans: 10/10
Sourcing images in ImaginaryImmortals: 10/10
Sourcing images in ImaginaryMerfolk: 10/10
Sourcing images in ImaginaryOrcs: 10/10
Sourcing images in ImaginaryBattlefields: 10/10
Sourcing images in ImaginaryCityscapes: 10/10
Sourcing images in ImaginaryHellscapes: 10/

# Data processing to download selected images based on class

In [7]:
import numpy as np
import pandas as pd
import queue
import requests
import threading

# Where the table of selected posts is written, and how many download
# threads are started.
CSV_PATH = "../data/reddit/processed_data.csv"
MAX_WORKERS = 2

# Upper edges of the score-percentile bins, and the maximum number of posts
# kept for each (subreddit, bin) pair.
PERCENTILE_BINS = [0.5, 0.9, 1.0]
NUM_ROWS_PER_SUB_PER_PERCENT = 10

# Load the metadata table produced by the crawl step.
with open(PRELIMINARY_PATH, "r") as crawl_file:
    data = pd.read_csv(crawl_file)

In [16]:
# Percentile rank of every post's score across the whole crawl.
data['PERCENTILE'] = data['SCORE'].rank(pct=True)

# np.digitize(..., right=True) gives the index of the first bin edge that is
# >= the percentile; translate that index back to the edge value so each row
# is labelled with the upper bound of its bin.
data['PERCENTILE BIN'] = np.digitize(
    data['PERCENTILE'], PERCENTILE_BINS, right=True)
data['PERCENTILE BIN'] = data['PERCENTILE BIN'].map(
    dict(enumerate(PERCENTILE_BINS)))

# Keep at most NUM_ROWS_PER_SUB_PER_PERCENT rows for each (subreddit, bin)
# pair, and make sure each subreddit's image directory exists.
filtered_rows = []
for sr in sr_list:
    sr_key = sr.lower()
    os.makedirs(f"../data/reddit/{sr_key}", exist_ok=True)
    in_subreddit = data["SUBREDDIT"] == sr_key
    for percent in PERCENTILE_BINS:
        subdata = data[in_subreddit & (data["PERCENTILE BIN"] == percent)]
        filtered_rows.extend(subdata.head(NUM_ROWS_PER_SUB_PER_PERCENT).values.tolist())
output = pd.DataFrame(filtered_rows, columns=data.columns)

# Pass the path rather than a text-mode handle: pandas then opens the file
# itself with newline="" and avoids doubled line endings on Windows.
output.to_csv(CSV_PATH, index=False)

In [17]:
# Shared queue of (subreddit, id, url) download jobs consumed by the worker threads.
job_pool = queue.Queue()
def get_image(url, filename, timeout=30):
    """Download one image and store it as a downscaled, compressed JPEG.

    The response body is streamed to ``filename``; the file is then opened
    with PIL, shrunk in place to fit within ``MAX_RESOLUTION``, converted to
    RGB and saved next to it with the last three characters of the name
    replaced by ``jpeg``. The raw download is always removed afterwards.

    Args:
        url: Address of the image to fetch.
        filename: Path for the temporary raw download; expected to end in a
            three-character extension.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the calling thread forever.

    Returns:
        None. Nothing is written when the server answers with an error status.

    Raises:
        requests.RequestException: on network failure or timeout.
        OSError: if the file cannot be written or PIL cannot read the image.
    """
    # Using the response as a context manager releases the connection even
    # if streaming or image processing fails.
    with requests.get(url, stream=True, headers=req_header, timeout=timeout) as req:
        if not req.ok:
            return
        try:
            with open(filename, 'wb') as raw_file:
                for chunk in req.iter_content(1024):
                    if chunk:
                        raw_file.write(chunk)
            with Image.open(filename) as im:
                im.thumbnail(MAX_RESOLUTION)
                rgb = im.convert("RGB")
                rgb.save(filename[:-3] + "jpeg", "JPEG", quality=50, optimize=True)
        finally:
            # Never leave a partial or unreadable raw download behind.
            if os.path.exists(filename):
                os.remove(filename)

def worker():
    """Thread target: process download jobs from ``job_pool`` until it runs dry.

    Each job is a ``(subreddit, id, url)`` tuple. The image is fetched with
    ``get_image`` into ``../data/reddit/<subreddit>/``. A failing job is
    reported and skipped so that one bad URL does not stop the thread. The
    worker returns once no job has arrived for 60 seconds.
    """
    while True:
        try:
            job = job_pool.get(timeout=60)
        except queue.Empty:
            # Nothing queued for a full minute: assume the work is finished.
            break
        try:
            subr, post_id, url = job
            get_image(url, f"../data/reddit/{subr.lower()}/{post_id}.{url[-3:]}")
        except Exception as e:
            # Best effort: report the failure instead of hiding it, then move on.
            print(f"Failed to process job {job!r}: {e!r}")
        finally:
            # Always acknowledge the job, otherwise job_pool.join() never returns.
            job_pool.task_done()

# Create MAX_WORKERS download threads, then start them all. Each one runs
# worker() and waits on job_pool for jobs to arrive.
all_threads = []
for _ in range(MAX_WORKERS):
    all_threads.append(threading.Thread(target=worker))
for t in all_threads:
    t.start()

In [None]:
# Queue only the rows selected by the percentile filtering (`output`), not
# every crawled row in `data`, so that the downloaded images match the
# table written to CSV_PATH.
for srname, post_id, url in zip(output["SUBREDDIT"], output["ID"], output["URL"]):
    print(f"{srname} {post_id} {url}")
    job_pool.put((srname, post_id, url))

# Workers exit after 60 seconds without a job. If this cell is run after
# they have all stopped, start a fresh set; otherwise join() would block
# forever because nothing is left to drain the queue.
if not any(t.is_alive() for t in all_threads):
    all_threads = [threading.Thread(target=worker) for _ in range(MAX_WORKERS)]
    for t in all_threads:
        t.start()

# Block until every queued job has been acknowledged by a worker.
job_pool.join()