# Reddit Crawler
Using praw to comb through reddit posts, then PIL to process images

In [3]:
import datetime
import os
import praw

from PIL import Image

Some configurations for the crawler and image downloads

In [3]:
SUBREDDITS_LIST_PATH = "./subreddit_list.txt"
MAX_IMG = 500
ACCEPTABLE_EXTENSIONS = ["jpg", "png"]
MAX_RESOLUTION = (1024, 1024)

# Note: you will need your own praw.ini config file to use this command
reddit = praw.Reddit("cs4243")
req_header = { "User-Agent": "CS4243 crawler bot", "From": "insert email here" }
with open(SUBREDDITS_LIST_PATH, "r") as f:
    sr_list = [ x.strip() for x in f.readlines() ]

Get image metadata first before deciding on images to download. Due to the long tailed distribution, and for a more representative distribution of scores, we download 500 images per subreddit so that the calculated percentiles are representative, and that there are enough images in the popular class. 

In [None]:
# CAUTION: THIS CODE SEGMENT CAN TAKE MORE THAN TEN MINUTES TO RUN!
now = datetime.datetime.now()
unixnow = int(datetime.datetime.timestamp(now))
PRELIMINARY_PATH = f"./crawl_{now.month:02}{now.day:02}.csv"

with open(PRELIMINARY_PATH, 'w') as datafile:
    datafile.write("ID,SCORE,SUBREDDIT,URL,UNIX TIME,UPVOTE RATIO\n")
    for sr in sr_list:
        count = 0
        for submission in reddit.subreddit(sr).new(limit=None):
            # posts are at least one week old, for score stability
            if (unixnow - submission.created_utc) > 604800:  
                srname = submission.subreddit.display_name.lower()
                if submission.url[-3:] not in ACCEPTABLE_EXTENSIONS:
                    continue
                datafile.write(f"{submission.id},{submission.score},{srname},{submission.url}," + \
                                    f"{submission.created_utc},{submission.upvote_ratio}\n")
                count += 1
                if count % 10 == 0:
                    datafile.flush()
                    print(f"Sourcing images in {sr}: {count}/{MAX_IMG}")
                if count == MAX_IMG:
                    datafile.flush()
                    break

# Data processing to download selected images based on class

Parameters for selecting images to download based on metadata

In [2]:
import numpy as np
import pandas as pd
import queue
import requests
import threading

PERCENTILE_BINS = [0.5, 0.9, 1.0]
NUM_ROWS_PER_SUB_PER_PERCENT = 50
CSV_PATH = "./data/reddit/processed_data.csv"
MAX_WORKERS = 4

with open(PRELIMINARY_PATH, "r") as f:
    data = pd.read_csv(f)

NameError: name 'PRELIMINARY_PATH' is not defined

Sort reddit post scores into classes based on percentile bins. After which, select an equal number of images per subreddit per bin such that the proportion of classes are roughly equal. This solves the problem of learning on long-tailed distributions. 

In [16]:
data['PERCENTILE'] = data['SCORE'].rank(pct=True)
data['PERCENTILE BIN'] = np.digitize(
    data['PERCENTILE'], PERCENTILE_BINS, right=True)
data['PERCENTILE BIN'] = data['PERCENTILE BIN'].map(
    {index: bin for index, bin in enumerate(PERCENTILE_BINS)})

filtered_rows = []
for sr in sr_list:
    os.makedirs(f"./data/reddit/{sr.lower()}", exist_ok=True)
    for percent in PERCENTILE_BINS:
        subdata = data[(data["SUBREDDIT"] == sr.lower()) & (data["PERCENTILE BIN"] == percent)]
        filtered_rows.extend(subdata.head(NUM_ROWS_PER_SUB_PER_PERCENT).values.tolist())
output = pd.DataFrame(filtered_rows, columns=data.columns)
with open(CSV_PATH, "w") as f:
    output.to_csv(f, index=False)

Some threading tools to help make downloading images faster

In [17]:
job_pool = queue.Queue()
def get_image(url, filename) :
    req = requests.get(url, stream=True, headers=req_header)
    if not req.ok:
        return
    with open(filename, 'wb') as f:
        for chunk in req.iter_content(1024):
            if chunk:
                f.write(chunk)
    with Image.open(filename) as im:
        im.thumbnail(MAX_RESOLUTION)
        im = im.convert("RGB")
        im.save(filename[:-3]+"jpeg", "JPEG", quality=50, optimize=True)
    os.remove(filename)

def worker():
    while True:
        try:
            subr, id, url = job_pool.get(timeout=60)
            get_image(url, f"./data/reddit/{subr.lower()}/{id}.{url[-3:]}")
        except queue.Empty:
            break
        except Exception as e:
            pass
        job_pool.task_done()

all_threads = [threading.Thread(target=worker) for _ in range(MAX_WORKERS)]
for t in all_threads:
    t.start()

Proceed with the downloading of images

In [None]:
# CAUTION: THIS CODE SEGMENT CAN TAKE MORE THAN TEN MINUTES TO RUN!
for srname, id, url in zip(data["SUBREDDIT"], data["ID"], data["URL"]):
    print(f"{srname} {id} {url}")
    job_pool.put((srname, id, url))
job_pool.join()

# Collate Reddit dataset

After downloading the images, collate all image metadata again to verify dataset. The csv file for image metadata and json files will be used in the model notebook. 

In [5]:
import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split

PERCENTILE_BINS = [0.5, 0.9, 1.0]

def split_dataset(
        dataset,
        train_size,
        val_size,
        stratify_by=None,
        random_seed=0):
    if stratify_by:
        stratify = dataset[stratify_by]
    else:
        stratify = None
    train, val_test = train_test_split(
        dataset,
        train_size=train_size,
        random_state=random_seed,
        shuffle=True,
        stratify=stratify)

    if stratify_by:
        stratify = val_test[stratify_by]
    else:
        stratify = None
    val, test = train_test_split(
        val_test,
        train_size=val_size / (1.0 - train_size),
        random_state=random_seed,
        shuffle=True,
        stratify=stratify)

    return train, val, test

def collate_reddit_data(
        data_path,
        reddit_levels_path,
        output_path,
        labels_path,
        min_posts=1,
        train_size=0.8,
        val_size=0.1):
    data_path = Path(data_path)
    print(f'Reading data from: {data_path}')

    csv_paths = list(data_path.glob('**/*.csv'))
    csv_paths.sort()

    image_paths = data_path.glob('**/*.jpeg')
    image_id_to_path = {path.stem: path for path in image_paths}

    # Load individual subreddit data
    skipped_subreddits = []
    data_list = []
    for csv_path in tqdm(csv_paths):
        data = pd.read_csv(csv_path, skiprows=2, on_bad_lines='skip')
        data = data[data['ID'].isin(image_id_to_path)]
        if len(data) < min_posts:
            skipped_subreddits.append(data['SUBREDDIT'][0])
            continue

        data['PATH'] = data['ID'].map(image_id_to_path)

        # Get percentile and percentile bin for each post in subreddit
        data['PERCENTILE'] = data['SCORE'].rank(pct=True)
        data['PERCENTILE BIN'] = np.digitize(
            data['PERCENTILE'], PERCENTILE_BINS, right=True)
        data['PERCENTILE BIN'] = data['PERCENTILE BIN'].map(
            {index: bin for index, bin in enumerate(PERCENTILE_BINS)})

        data_list.append(data)
    data = pd.concat(data_list, ignore_index=True)
    print(f'Skipped subreddits: {skipped_subreddits}')

    # Merge reddit levels
    reddit_levels = pd.read_csv(reddit_levels_path)
    data = pd.merge(data, reddit_levels, how='left', on='SUBREDDIT')

    # Create and save labels
    labels = {
        'percentile_bin': PERCENTILE_BINS,
    }
    for level in reddit_levels:
        labels[level.lower()] = list(data[level].dropna().unique())
    with open(labels_path, 'w') as file:
        json.dump(labels, file, indent=4)
    print(f'Saved labels to {labels_path}')

    # Split dataset
    train, val, test = split_dataset(
        data,
        train_size,
        val_size,
        stratify_by='PERCENTILE BIN',
        random_seed=0)
    train['SPLIT'] = 'train'
    val['SPLIT'] = 'val'
    test['SPLIT'] = 'test'
    data = pd.concat([train, val, test], ignore_index=True)

    data.to_csv(output_path, index=False)
    print(f'Saved data to {output_path}')


ModuleNotFoundError: No module named 'tqdm'

In [None]:
CONFIG = {
    'data_path': 'data/reddit',
    'labels_path': 'data/reddit_labels.json',
    'reddit_levels_path': 'dataset/reddit_levels.csv',
    'output_path': 'data/reddit_data.csv',
    'min_posts': 500,
    'train_size': 0.8,
    'val_size': 0.1
}
collate_reddit_data(**CONFIG)

In [19]:
data = pd.read_csv(Path('../data/short_reddit_data.csv'), on_bad_lines='skip')
labels_path = "../data/reddit_labels.json"
with open(labels_path, 'r') as f:
    labels = json.load(f)

sr_list = labels["subreddit"]
mr_list = labels["multireddit"]
percentile = labels["percentile_bin"]
ans = f"multireddit,{','.join([str(x) for x in percentile])},"
total_size = len(data)
for mr in mr_list:
    line = f"\n{mr},"
    count = 0
    for pc in percentile:
        subdata = data[(data["MULTIREDDIT"] == mr.lower()) & (data["PERCENTILE BIN"] == pc)]
        line += f"{len(subdata)},"
        count += len(subdata)
    ans += line + f"{count/total_size:.03f}"
line = "\n,"
for pc in percentile:
    subdata = data[data["PERCENTILE BIN"] == pc]
    line += f"{len(subdata)/total_size:.03f},"
ans += line

print(ans)

multireddit,0.5,0.9,1.0,
imraces,400,400,388,0.058
imcharacters,600,600,588,0.087
imarchitecture,200,200,189,0.029
imtechnology,500,500,478,0.072
imlandscapes,600,600,587,0.087
immonsters,550,550,530,0.079
sfwpornnature,800,800,776,0.116
sfwpornsynthetic,1300,1300,1259,0.188
sfwpornaesthetic,1050,1050,1029,0.153
sfwpornorganic,400,400,396,0.058
sfwpornscholastic,250,250,246,0.036
,0.336,0.336,0.327,
