In [2]:
import os
import subprocess
from bz2 import BZ2File
import lzma

import json
from tqdm import tqdm_notebook as tqdm

from subprocess import Popen, PIPE

import pandas as pd

from collections import defaultdict, Counter

# downloading data

In [28]:
def download_file(url):
    """Download ``url`` into the current directory with ``wget``.

    Any existing local file of the same name is deleted first.

    Args:
        url: Address of the file; the text after the last ``/`` becomes the
            local file name.

    Returns:
        The local file name the download was written to.
    """
    print(f'downloading file from {url} ...')
    local_filename = url.split('/')[-1]
    try:
        os.unlink(local_filename)
    except FileNotFoundError:
        pass
    # Hand wget an argument list instead of splitting a formatted string, so a
    # URL or file name containing whitespace is not broken into several arguments.
    subprocess.call(['wget', url, '-O', local_filename])
    return local_filename

def open_zipped_file(fn):
    """Open a compressed file for reading, choosing the codec from its extension.

    ``.bz2`` names are opened with ``BZ2File`` and ``.xz`` names with
    ``lzma.open``; any other extension raises ``Exception``.
    """
    openers = (('.bz2', BZ2File), ('.xz', lzma.open))
    for extension, opener in openers:
        if fn.endswith(extension):
            return opener(fn)
    raise Exception(f'unknown extension for file {fn}')

# Submission fields retained for each post; every other key is dropped.
keep_columns = {
    'id', 'created_utc', 'subreddit', 'title',
    'selftext', 'score', 'created', 'author',
    'distinguished',
}


def get_cleaned_json(jsn):
    """Return a new dict holding only the entries of ``jsn`` whose key is in ``keep_columns``."""
    cleaned = {}
    for key, value in jsn.items():
        if key in keep_columns:
            cleaned[key] = value
    return cleaned



def get_self_posts(zipped_file):
    """Collect the self posts worth keeping from an open submissions dump.

    Each line of ``zipped_file`` is parsed as one JSON object. A post is kept
    when ``is_self`` is truthy, its subreddit is not ``AskReddit`` and the
    length of ``selftext`` lies between ``MIN_LINE_LENGTH`` and
    ``MAX_LINE_LENGTH`` (both inclusive). Kept posts are reduced with
    ``get_cleaned_json``. The total and rejected line counts are printed.

    Args:
        zipped_file: Iterable yielding one JSON-encoded line per submission.

    Returns:
        List of cleaned post dicts, in input order.
    """
    print('processing file...')
    out = []
    total = 0
    bad_lines = 0

    for line in tqdm(zipped_file):
        total += 1
        try:
            jsn = json.loads(line)

            if (jsn['is_self']
                    and len(jsn['selftext']) >= MIN_LINE_LENGTH
                    and jsn['subreddit'] != 'AskReddit'
                    and len(jsn['selftext']) <= MAX_LINE_LENGTH):
                out.append(get_cleaned_json(jsn))
        except (ValueError, KeyError, TypeError):
            # Only malformed or incomplete records count as bad lines: undecodable
            # or invalid JSON (ValueError), missing fields (KeyError), or values of
            # the wrong type such as a null selftext (TypeError). Programming errors,
            # e.g. the length constants not being defined yet, now propagate instead
            # of silently marking every line as bad.
            bad_lines += 1
            continue
    print(total, bad_lines)
    return out
    

def get_self_posts_from_url(url):
    """Download one compressed dump, filter it, and save the kept posts as JSON.

    The result is written to ``/data/reddit/<name>.json``, where ``<name>`` is
    the downloaded file name with its final extension removed.

    Args:
        url: Address of the compressed submissions file.
    """
    local_filename = url.split('/')[-1]
    out_filename = f"/data/reddit/{local_filename}"
    out_filename = ".".join(out_filename.split(".")[:-1]) + ".json"
    print(out_filename)
    download_file(url)

    with open_zipped_file(local_filename) as f:
        self_posts = get_self_posts(f)
    # Write through a context manager so the output is flushed and the handle
    # closed before the function returns.
    with open(out_filename, 'w') as out_file:
        json.dump(self_posts, out_file)
    # The downloaded archive is left on disk (removal is disabled):
    #os.unlink(local_filename)
    
    
#url = 'https://files.pushshift.io/reddit/submissions/RS_2018-01.xz'
#get_self_posts_from_url(url)

In [29]:
# Dump file names (with leading slash) to fetch; 2017 is listed explicitly
# because its files mix .bz2 and .xz compression.
filenames = [
    '/RS_2017-04.bz2', '/RS_2017-01.bz2', '/RS_2017-02.bz2', '/RS_2017-03.bz2',
    '/RS_2017-05.bz2', '/RS_2017-06.bz2', '/RS_2017-07.xz', '/RS_2017-08.bz2',
    '/RS_2017-09.bz2', '/RS_2017-10.bz2', '/RS_2017-11.xz', '/RS_2017-12.xz',
]

# June-December 2016, then January-May 2018.
filenames.extend(f'/RS_2016-{str(i).zfill(2)}.bz2' for i in range(6, 13))
filenames.extend(f'/RS_2018-0{i}.xz' for i in range(1, 6))


urls = ['https://files.pushshift.io/reddit/submissions' + name for name in filenames]


# URLs whose download or processing raised an error.
bad_urls = []
for url in urls:
    try:
        get_self_posts_from_url(url)
    except Exception:
        # Record the failure before re-raising. Previously these two statements
        # came after ``raise`` and could never run. Remove the ``raise`` to skip
        # bad files and continue with the remaining URLs instead of stopping.
        print(f"BAD FILE! {url}")
        bad_urls.append(url)
        raise

24

### cleantext methods

In [1]:
from html.parser import HTMLParser
import string
import re

# Matches http(s) URLs. Written as a raw string so the escaped parentheses reach
# the regex engine without relying on Python passing unknown escapes through.
URL_REGEX = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
# Matches three or more of the same ASCII letter in a row; group 2 is the letter.
# ``[A-Za-z]`` replaces ``[A-z]``, whose range also covered the six characters
# between ``Z`` and ``a`` (brackets, backslash, caret, underscore, backtick).
REPEATED_CHARACTER_REGEX = re.compile(r"(([A-Za-z])\2{2,})")

class MLStripper(HTMLParser):
    """HTML parser that keeps only the text data of what it is fed."""

    def __init__(self):
        super().__init__()
        self.reset()
        # Text fragments passed to handle_data, in arrival order.
        self.fed = []

    def handle_data(self, d):
        """Store one text fragment reported by the parser."""
        self.fed += [d]

    def get_data(self):
        """Return every stored fragment concatenated into one string."""
        pieces = self.fed
        return ''.join(pieces)


def remove_excess_whitespace_from_string(string):
    """Trim both ends of ``string`` and reduce every whitespace run to one space."""
    words = string.strip().split()
    return " ".join(words)


def remove_punctuation_from_string(s):
    """Return ``s`` with every character in ``string.punctuation`` removed.

    Args:
        s: Text to strip punctuation from.

    Returns:
        A copy of ``s`` without ASCII punctuation characters.
    """
    # Python 3's str.translate takes a single table argument; the old two-argument
    # form ``translate(None, chars)`` raises TypeError on str. The third argument
    # of str.maketrans lists the characters to delete.
    return s.translate(str.maketrans('', '', string.punctuation))


def remove_html_from_string(string):
    """Return the text content of ``string`` with HTML tags removed.

    Args:
        string: Text that may contain HTML markup.

    Returns:
        The text fragments collected by ``MLStripper``, concatenated.
    """
    stripper = MLStripper()
    stripper.feed(string)
    # close() forces the parser to process whatever it is still buffering.
    # Without it, trailing text can be held back (for example when it ends with
    # an unterminated "&", as in "AT&T") and would be missing from the result.
    stripper.close()
    return stripper.get_data()

def replace_with_double_character(matchobj):
    """Return the text captured by group 2 of ``matchobj``, written twice."""
    repeated = matchobj.group(2)
    return repeated + repeated


def remove_repeated_alpha_chars(string):
    """
    Shorten every run matched by REPEATED_CHARACTER_REGEX (the same character
    three or more times in a row) to exactly two copies of that character.
    Handy for informal user text, e.g. 'oooohhhhhhhh noooooooo' -> 'oohh noo'.
    """
    squeezed = re.sub(REPEATED_CHARACTER_REGEX, replace_with_double_character, string)
    return squeezed


def clean_string(string,
                 lowercase_characters=True,
                 remove_html=True,
                 remove_excess_whitespace=True,
                 replace_repeated_characters=True,
                 remove_urls=True
                 ):
    """Apply the enabled cleaning steps to ``string`` in a fixed order.

    Order: URL removal, HTML stripping, repeated-character squeezing,
    whitespace normalisation, lowercasing. A step is skipped when its flag
    is falsy.
    """
    steps = (
        (remove_urls, lambda text: URL_REGEX.sub('', text)),
        (remove_html, remove_html_from_string),
        (replace_repeated_characters, remove_repeated_alpha_chars),
        (remove_excess_whitespace, remove_excess_whitespace_from_string),
        (lowercase_characters, lambda text: text.lower()),
    )
    for enabled, step in steps:
        if enabled:
            string = step(string)
    return string

In [None]:
# Exploratory cell: print the first post in one dump that passes the self-post filter.
# NOTE(review): ``url`` is not assigned in this cell; it is whatever value the
# download loop left behind, so this only works after that loop has run.
local_filename = url.split('/')[-1]
print(local_filename)

# Inclusive bounds on the selftext length, in characters.
MIN_LINE_LENGTH = 256
MAX_LINE_LENGTH = 4096

with open_zipped_file(local_filename) as f:
    for line in f:
        try:
            jsn = json.loads(line)
        except Exception:
            # Skip lines that cannot be parsed as JSON.
            continue
        
        # Same condition as get_self_posts; here the key lookups sit outside the try.
        if jsn['is_self'] and len(jsn['selftext']) >= MIN_LINE_LENGTH and jsn['subreddit'] != 'AskReddit' \
                          and len(jsn['selftext']) <= MAX_LINE_LENGTH:     
            print(jsn['title'])
            print(jsn['selftext'])
            print(jsn['subreddit'])
            print(len(jsn['selftext']))
            print(jsn['author'])
            print(jsn.keys())
            # Only the first matching post is shown.
            break

### other filters

In [None]:
from collections import Counter
import glob
import pickle 


# Number of kept posts per (author, subreddit) pair across all processed dumps.
author_counts = Counter()

for fn in sorted(glob.glob('/data/reddit/*.json')):
    # Close each dump as soon as it is parsed instead of leaking the handle.
    with open(fn) as f:
        jsn = json.load(f)
    for d in jsn:
        author = d.get('author')
        subreddit = d.get('subreddit')
        author_counts[(author, subreddit)] += 1


# The context manager guarantees the pickle is flushed and closed.
with open('/data/reddit/author_counts.pkl', 'wb') as f:
    pickle.dump(author_counts, f)

In [4]:
import pickle

# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('/data/reddit/author_counts.pkl', 'rb') as f:
    author_counts = pickle.load(f)

### language detect

In [None]:
import multiprocessing
import langdetect

import time
import random
from multiprocessing import Pool
from tqdm import tqdm_notebook as tqdm


def myfunc(row):
    """Detect the language of one post.

    Args:
        row: Post dict providing ``'selftext'`` and ``'id'``.

    Returns:
        ``(id, lang)`` where ``lang`` is what ``langdetect.detect`` returns for
        the first 200 characters of the cleaned text, or ``'none'`` if
        detection raised.
    """
    selftext = clean_string(row['selftext'])
    try:
        lang = langdetect.detect(selftext[:200])
    except Exception:
        lang = 'none'
    return (row['id'], lang)


ids_langs = []

for fn in tqdm(sorted(glob.glob('/data/reddit/*.json'))):
    print(fn)
    with open(fn) as f:
        jsn = json.load(f)
    #jsn = jsn[:1000]

    # Progress is reported from this (parent) process by wrapping the result
    # iterator. The earlier ``pbar`` object was never created anywhere, so both
    # the workers and this loop failed with NameError.
    with Pool(30) as pool:
        # Results are fully consumed inside the ``with`` block, before the pool
        # is torn down on exit.
        ids_langs += list(tqdm(pool.imap_unordered(myfunc, jsn), total=len(jsn)))

In [6]:
#pickle.dump(ids_langs, open('ids_langs.pkl', 'wb'))

# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
with open('/data/reddit/ids_langs.pkl', 'rb') as f:
    ids_langs = pickle.load(f)

In [9]:
# IDs of posts whose detected language is anything other than 'en'.
bad_lang_ids = {post_id for post_id, detected in ids_langs if detected != 'en'}

In [10]:
# Fraction of language-checked posts that were not detected as 'en'.
len(bad_lang_ids) / len(ids_langs)

0.016939969456531208

In [14]:
import glob
import json
from collections import Counter, defaultdict
import pandas as pd

# Length bounds, in characters, applied to the cleaned selftext when computing
# the ``good_text`` flag below (lower bound inclusive, upper bound exclusive).
MIN_LINE_LENGTH = 256
MAX_LINE_LENGTH = 4096

def bot_name(author):
    """Heuristically flag account names that look like bots or moderators.

    The check is case-insensitive: a name matches when it ends with ``bot``,
    ``mod``, ``moderator`` or ``moderater``, or starts with ``auto``.

    Args:
        author: Account name, or ``None`` when the post carries no author.

    Returns:
        ``True`` if the name matches the heuristic, otherwise ``False``
        (including for ``None``).
    """
    if author is None:
        # The caller looks the author up with dict.get(), which yields None
        # when the key is absent; treat that as "not a bot" instead of crashing.
        return False
    author = author.lower()
    # str.endswith accepts a tuple of suffixes, replacing the explicit loop.
    return author.endswith(('bot', 'mod', 'moderator', 'moderater')) or author.startswith('auto')

subreddit_counter = Counter()
# Column name -> list of per-post values; turned into a DataFrame at the end.
raw_dict = defaultdict(list)

for fn in sorted(glob.glob('/data/reddit/downloads/*.json')):
    print(fn)
    # Close each dump as soon as it is parsed instead of leaking the handle.
    with open(fn) as f:
        jsn = json.load(f)
    for d in jsn:
        id_ = d['id']
        author = d.get('author')
        subreddit = d.get('subreddit')

        # Number of kept posts by this author in this subreddit.
        author_count = author_counts[(author, subreddit)]

        raw_dict['id'].append(id_)
        raw_dict['good_lang'].append(id_ not in bad_lang_ids)
        raw_dict['author_count'].append(author_count)
        raw_dict['bot_name'].append(bot_name(author))

        raw_dict['subreddit'].append(subreddit)

        is_distinguished = d['distinguished'] is not None
        raw_dict['is_distinguished'].append(is_distinguished)

        selftext = d['selftext']
        selftext = clean_string(selftext)

        # Length is re-checked on the cleaned text, which can be shorter than the raw text.
        good_text = len(selftext) >= MIN_LINE_LENGTH and len(selftext) < MAX_LINE_LENGTH
        raw_dict['good_text'].append(good_text)

good_data_df = pd.DataFrame(raw_dict)

/data/reddit/downloads/RS_2016-06.json
/data/reddit/downloads/RS_2016-07.json
/data/reddit/downloads/RS_2016-08.json
/data/reddit/downloads/RS_2016-09.json
/data/reddit/downloads/RS_2016-10.json
/data/reddit/downloads/RS_2016-11.json
/data/reddit/downloads/RS_2016-12.json
/data/reddit/downloads/RS_2017-01.json
/data/reddit/downloads/RS_2017-02.json
/data/reddit/downloads/RS_2017-03.json
/data/reddit/downloads/RS_2017-04.json
/data/reddit/downloads/RS_2017-05.json
/data/reddit/downloads/RS_2017-06.json
/data/reddit/downloads/RS_2017-07.json
/data/reddit/downloads/RS_2017-08.json
/data/reddit/downloads/RS_2017-09.json
/data/reddit/downloads/RS_2017-10.json
/data/reddit/downloads/RS_2017-11.json
/data/reddit/downloads/RS_2017-12.json
/data/reddit/downloads/RS_2018-01.json
/data/reddit/downloads/RS_2018-02.json
/data/reddit/downloads/RS_2018-03.json
/data/reddit/downloads/RS_2018-04.json
/data/reddit/downloads/RS_2018-05.json


In [15]:
# Persist the per-post quality flags as a tab-separated file without the index column.
good_data_df.to_csv('good_data_df.tsv', sep='\t', index=False)

In [20]:
import numpy as np

good_data_df.columns

# Print the mean of each boolean flag column, i.e. the share of rows where it is True.
for c in ['bot_name', 'good_lang', 'good_text',  'is_distinguished',]:
    print(c.ljust(30), np.mean(good_data_df[c]))

bot_name                       0.039800376285254145
good_lang                      0.9830600305434688
good_text                      0.9420818143254593
is_distinguished               0.014504196844542561


In [None]:
import pandas as pd
import numpy as np
from collections import Counter

good_data_df = pd.read_csv('good_data_df.tsv', sep='\t')


# Largest number of posts one author may have in one subreddit.
MAX_AUTHOR_COUNT = 10

# Keep only rows that pass every per-post check at once.
subdf = good_data_df[
    ~good_data_df.is_distinguished
    & good_data_df.good_text
    & (good_data_df.author_count <= MAX_AUTHOR_COUNT)
    & good_data_df.good_lang
    & ~good_data_df.bot_name
]

# Subreddits with at least 1000 surviving posts.
counter = Counter(subdf.subreddit)
enough_post_subreddits = {name for name, cnt in counter.items() if cnt >= 1000}

subdf = subdf[subdf.subreddit.isin(enough_post_subreddits)]

subdf.shape

(18661479, 7)

In [43]:
import hashlib

# Built-in hash() of a str is salted per interpreter process, so sorting by it
# gives a different subsample after every kernel restart. Derive the sort key
# from an MD5 digest instead (first 8 bytes, as a signed 64-bit integer).
# .assign() returns a new frame, avoiding a column write on a filtered slice.
subdf = subdf.assign(
    id_hash=[int.from_bytes(hashlib.md5(str(s).encode('utf-8')).digest()[:8], 'big', signed=True)
             for s in subdf.id]
)

In [46]:
# we do some subsampling here - this is mainly to help speed up the long deduplication process later

SUBSAMPLE_SIZE = 2000

def hashed_subsample(subdf_):
    """Return the first ``SUBSAMPLE_SIZE`` rows of ``subdf_`` after sorting by ``id_hash``."""
    ordered = subdf_.sort_values('id_hash')
    return ordered.iloc[:SUBSAMPLE_SIZE]

# Apply hashed_subsample to each subreddit's rows separately.
subsample_df = subdf.groupby('subreddit').apply(hashed_subsample)

In [47]:
import glob
import json
from collections import Counter, defaultdict
from tqdm import tqdm_notebook as tqdm

# IDs selected by the per-subreddit subsample.
sample_ids = set(subsample_df['id'])

# Full post dicts for the sampled IDs, gathered from the per-dump JSON files.
sample_jsns = []

for fn in tqdm(sorted(glob.glob('/data/reddit/downloads/*.json'))):
    print(fn)
    # Close each dump as soon as it is parsed instead of leaking the handle.
    with open(fn) as f:
        jsn = json.load(f)
    sample_jsns.extend(d for d in jsn if d['id'] in sample_ids)

/data/reddit/downloads/RS_2016-06.json
/data/reddit/downloads/RS_2016-07.json
/data/reddit/downloads/RS_2016-08.json
/data/reddit/downloads/RS_2016-09.json
/data/reddit/downloads/RS_2016-10.json
/data/reddit/downloads/RS_2016-11.json
/data/reddit/downloads/RS_2016-12.json
/data/reddit/downloads/RS_2017-01.json
/data/reddit/downloads/RS_2017-02.json
/data/reddit/downloads/RS_2017-03.json
/data/reddit/downloads/RS_2017-04.json
/data/reddit/downloads/RS_2017-05.json
/data/reddit/downloads/RS_2017-06.json
/data/reddit/downloads/RS_2017-07.json
/data/reddit/downloads/RS_2017-08.json
/data/reddit/downloads/RS_2017-09.json
/data/reddit/downloads/RS_2017-10.json
/data/reddit/downloads/RS_2017-11.json
/data/reddit/downloads/RS_2017-12.json
/data/reddit/downloads/RS_2018-01.json
/data/reddit/downloads/RS_2018-02.json
/data/reddit/downloads/RS_2018-03.json
/data/reddit/downloads/RS_2018-04.json
/data/reddit/downloads/RS_2018-05.json



In [2]:
sample_df = pd.DataFrame(sample_jsns)

print(sample_df.shape)

#sample_df.to_csv('/data/reddit/sample_data.zip', index=False, compression='gzip') # pandas struggled to open this

import joblib

# The context manager guarantees the dump is flushed and the handle closed.
with open('/data/reddit/good_posts.pkl', 'wb') as f:
    joblib.dump(sample_df, f)

In [1]:
import joblib

# NOTE: this is a pickle-based format; only load files this notebook wrote.
with open('/data/reddit/good_posts.pkl', 'rb') as f:
    sample_df = joblib.load(f)

In [3]:
import pandas as pd

# Per-subreddit category table. NOTE(review): a later cell writes its result back
# to this same path, so re-running the notebook starts from the modified file.
subreddit_info = pd.read_csv('subreddit_info.csv')

def filter_subreddits(checked_df):
    """Return the rows of ``checked_df`` that survive the exclusion rules.

    A row is dropped when its ``subreddit`` is null, when any of ``level1``,
    ``level2`` or ``level3`` is listed in ``bad_categories``, when
    ``reason_for_exclusion`` is set, or when ``level2`` is null.

    Args:
        checked_df: DataFrame with columns ``subreddit``, ``level1``..``level3``
            and ``reason_for_exclusion``.

    Returns:
        The filtered DataFrame.
    """
    checked_df = checked_df[~checked_df.subreddit.isnull()]

    # (A set of cleaned subreddit names used to be built here but was never read.)
    for i in range(1, 4):
        checked_df = checked_df[~checked_df[f'level{i}'].isin(bad_categories)]
    checked_df = checked_df[checked_df['reason_for_exclusion'].isnull()]

    checked_df = checked_df[~checked_df.level2.isnull()]
    return checked_df

# Category labels that mark a subreddit for exclusion; compared against the
# category_1..category_3 columns of subreddit_info in the loop below.
bad_categories = {
     'too_regimented',
     'too_broad',
     'broad',
     'game_recruitment',
     'requests',
     'bot',
     'non_english',
     'buy/sell',
     'r4r',
     'exchange',
     'city/province', 
     'commercial',  
     'festival',
     'unspecific_posts',
    }

# Stamp a reason on every not-yet-excluded subreddit whose category at any of
# the three levels is one of the bad categories.
for i in (1, 2, 3):
    for s in bad_categories:
        message = f"excluding {s} in categories"
        bad_rows = subreddit_info.reason_for_exclusion.isnull() & (subreddit_info[f'category_{i}'] == s)
        subreddit_info.loc[bad_rows, 'reason_for_exclusion'] = message


# category specific filtering

In [4]:
# Attach category_1 to each post. Only subreddits with no exclusion reason are
# offered to the merge, so posts from any other subreddit drop out of sample_df.
to_merge  = subreddit_info[subreddit_info.reason_for_exclusion.isnull()][['subreddit', 'category_1']]
sample_df = sample_df.merge(to_merge, on='subreddit')

# category_1 value -> substrings that disqualify a post in that category.
# bad_row() tests them with ``in`` against the lowercased title + selftext, so
# entries match anywhere inside a word (e.g. 'matchmak').
bad_word_dict ={
    'video_game' : {
             'bug', 
             'connection', 
             'patch', 
             'resolution', 
             'screen',
             'glitch', 
             'launcher',
             'framerate',
             'frames',
             'fps',
             'update',
             'crash',
             'lobby',
             'matchmak',
             'latency',
             'black screen',
             'issue',
             'steam',
             'desync',
             'gtx',
             'cheat',
             'file',
             'download',
             'upload',
#              'pc',
#              'xbox',
#              'ps2', 
#              'ps3', 
#              'ps4', 
#              'ios',
#              'android',
#              'psp',
#              'windows'
    },
    'music' : {
        'tour', 'ticket', 'concert', 'show', 'venue'
    }
}

def get_text(rd):
    """Return the title and selftext of ``rd`` joined by a space, lowercased."""
    combined = rd['title'] + ' ' + rd['selftext']
    return combined.lower()

def bad_row(rd):
    """Tell whether a post contains a disqualifying substring for its category.

    Returns ``False`` when the post's ``category_1`` has no entry in
    ``bad_word_dict``; otherwise ``True`` if any listed substring occurs in
    the text returned by ``get_text``.
    """
    bad_words = bad_word_dict.get(rd['category_1'])
    if bad_words is None:
        return False

    text = get_text(rd)
    for s in bad_words:
        if s in text:
            return True
    return False

# Evaluate bad_row on every post (row-wise) and keep only the posts it did not flag.
bad_row_bools = sample_df.apply(bad_row, axis=1)
sample_df = sample_df[~bad_row_bools]

sample_df.shape

#(3844498, 11)

(3844498, 10)

In [5]:
from collections import Counter

# Posts per subreddit after the category-specific filtering.
subreddit_counter = Counter(list(sample_df.subreddit))
good_subreddits = {name for name, n_posts in subreddit_counter.items() if n_posts >= 1000}

# Not-yet-excluded subreddits that fell below the 1000-post threshold.
bad_info_rows = subreddit_info.reason_for_exclusion.isnull() & \
                ~subreddit_info.subreddit.isin(good_subreddits)

subreddit_info.loc[bad_info_rows, 'reason_for_exclusion'] = "not enough posts"

# Merging against the one-column frame keeps only posts from good subreddits.
sample_df = sample_df.merge(pd.DataFrame({'subreddit' : list(good_subreddits)}), on='subreddit')
sample_df.shape



(3696440, 10)

In [6]:
def _largest_subreddit(df_):
    """Return the single row of ``df_`` with the highest ``subreddit_count``.

    Ties are broken by ``subreddit`` in descending order. (Renamed from
    ``apply`` so it no longer shares a name with the row-level function below.)
    """
    df_ = df_.sort_values(['subreddit_count', 'subreddit'], ascending=False)
    return df_.head(1)

subreddit_info['subreddit_count'] = [subreddit_counter[s] for s in subreddit_info.subreddit]
# Non-string category_3 values (e.g. missing) become empty strings.
subreddit_info['category_3'] = [s if type(s) == str else '' for s in subreddit_info.category_3]
subdf = subreddit_info[subreddit_info.reason_for_exclusion.isnull()]
subdf = subdf[subdf.subreddit.isin(good_subreddits)]
# Select the columns with a list: indexing a groupby with a bare tuple of
# names is deprecated and rejected by newer pandas.
groupby = subdf.groupby(['category_2',], sort=False)[['subreddit_count',
                                                      'subreddit',
                                                      'category_1',
                                                      'category_3']]


# One row per category_2: the subreddit with the most posts.
in_dataset = groupby.apply(_largest_subreddit)
in_dataset = in_dataset.reset_index(drop=False)

most_post_subreddits = set(in_dataset.subreddit)


# 'records' is the documented orient name; 'rows' only worked through an
# abbreviation fallback that older pandas provided.
topic_to_subreddit = {row['category_2'] : row['subreddit'] for row in in_dataset.to_dict(orient='records')}

def apply(row):
    """Return the exclusion reason for one ``subreddit_info`` row.

    An existing string reason is kept. Rows whose subreddit is in
    ``most_post_subreddits`` keep their current value. Every other row gets a
    message naming the larger subreddit that shares its ``category_2``.
    """
    if type(row['reason_for_exclusion']) == str or row['subreddit'] in most_post_subreddits :
        return row['reason_for_exclusion']
    else:
        topic = row['category_2']
        bigger_subreddit = topic_to_subreddit[topic]

        return f"fewer posts than r/{bigger_subreddit} which shares topic"


sample_df = sample_df.merge(pd.DataFrame({'subreddit' : list(most_post_subreddits )}), on='subreddit')
subreddit_info['reason_for_exclusion'] = subreddit_info.apply(apply, axis=1)


In [7]:
# Sanity check: count of subreddits with no exclusion reason, next to the count of selected subreddits.
sum(subreddit_info.reason_for_exclusion.isnull()), len(most_post_subreddits )

(1156, 1156)

In [8]:
def join_text(rd):
    """Build a one-line key from a post.

    Takes the first 100 characters of the title and the first 128 of the
    selftext, joins them with ``' ||| '``, then deletes every newline,
    carriage return and tab.
    """
    out = f"{rd['title'][:100]} ||| {rd['selftext'][:128]}"
    for control_char in ('\n', '\r', '\t'):
        out = out.replace(control_char, '')
    return out

# Row-wise: store the join_text key of every post in a new 'text' column.
sample_df['text'] = sample_df.apply(join_text, axis=1)

In [9]:

# to_write = sample_df[['id', 'text']]
# to_write.columns = ['identifier', 'text']

# to_write.to_csv("reddit_dedupe_input_v2.tsv", sep='\t', index=False)

# len(set(sample_df.subreddit))

In [10]:
# NOTE(review): presumably the output of an external dedupe run over the 'text'
# keys, with a 'dups' column; confirm against that tool.
duplicates_df = pd.read_csv('/home/mike/dev/duplicates/reddit_dedupe_output_v2.tsv', sep='\t').drop_duplicates()

sample_df = sample_df.merge(duplicates_df, on='text', how='left')

# How many times each exact selftext occurs in the sample.
text_counter = Counter(sample_df.selftext)
sample_df['raw_dups'] = [text_counter[s] for s in sample_df.selftext]

# Keep posts with no 'dups' value and whose selftext is unique, and only the
# four output columns.
sample_df = sample_df.loc[
    sample_df.dups.isnull() & (sample_df.raw_dups == 1),
    ['id', 'subreddit', 'title', 'selftext',],
]

In [11]:
from collections import Counter

# Posts per subreddit after de-duplication.
subreddit_counter = Counter(list(sample_df.subreddit))
good_subreddits = {name for name, n_posts in subreddit_counter.items() if n_posts >= 1000}

# Not-yet-excluded subreddits that fell below the 1000-post threshold.
bad_info_rows = subreddit_info.reason_for_exclusion.isnull() & \
                ~subreddit_info.subreddit.isin(good_subreddits)

subreddit_info.loc[bad_info_rows, 'reason_for_exclusion'] = "not enough posts"

# Merging against the one-column frame keeps only posts from good subreddits.
sample_df = sample_df.merge(pd.DataFrame({'subreddit' : list(good_subreddits)}), on='subreddit').copy()



(1939932, 4)

In [12]:
import hashlib

# Built-in hash() of a str is salted per interpreter process, so sorting by it
# gives a different subsample after every kernel restart. Derive the sort key
# from an MD5 digest instead (first 8 bytes, as a signed 64-bit integer).
sample_df['id_hash'] = [
    int.from_bytes(hashlib.md5(str(s).encode('utf-8')).digest()[:8], 'big', signed=True)
    for s in sample_df['id']
]

In [13]:
SUBSAMPLE_SIZE = 1000

def hashed_subsample(subdf_):
    """Return the first ``SUBSAMPLE_SIZE`` rows of ``subdf_`` after sorting by ``id_hash``."""
    by_hash = subdf_.sort_values('id_hash')
    return by_hash.iloc[:SUBSAMPLE_SIZE]

# Apply hashed_subsample to each subreddit's posts separately.
sample_df = sample_df.groupby('subreddit').apply(hashed_subsample)

# Drop the helper id_hash column again by selecting the four output columns.
sample_df = sample_df[['id', 'subreddit',  'title', 'selftext', ]]
sample_df.shape

(1153000, 4)

## downsampling video_game subreddits

In [61]:
import hashlib

# Video-game subreddits that are still included.
video_game_subreddits = subreddit_info[(subreddit_info.reason_for_exclusion.isnull()) & \
                (subreddit_info.category_1 == 'video_game')].subreddit

# Order the names by an MD5 digest rather than built-in hash(): str hashes are
# salted per interpreter process, so the earlier ordering (and therefore which
# 100 subreddits were kept) changed on every kernel restart.
to_remove = sorted(video_game_subreddits,
                   key=lambda name: hashlib.md5(str(name).encode('utf-8')).hexdigest())[100:]
subreddit_info.loc[subreddit_info.subreddit.isin(to_remove), 'reason_for_exclusion'] = "randomly removed video game subreddit"

In [69]:
# A subreddit is in the final data exactly when it has no exclusion reason.
subreddit_info['in_data'] = subreddit_info.reason_for_exclusion.isnull()
subreddit_info = subreddit_info.sort_values(['category_1','category_2','category_3', 'subreddit'])
# Keep only the published columns (this drops subreddit_count).
subreddit_info = subreddit_info[['subreddit','category_1','category_2','category_3', 
                                 'in_data', 'reason_for_exclusion']]

# NOTE(review): this overwrites the same 'subreddit_info.csv' that an earlier cell
# reads as input, so a second run starts from this output.
subreddit_info.to_csv('subreddit_info.csv', index=False)

In [65]:
# Subreddits that still have no exclusion reason.
final_subreddits = subreddit_info[subreddit_info.reason_for_exclusion.isnull()].subreddit

# Restrict the posts to those subreddits.
sample_df = sample_df[sample_df.subreddit.isin(final_subreddits)]

In [None]:
from sklearn.model_selection import train_test_split


# Stratify on subreddit so each one keeps the same 80/20 proportion, and fix the
# seed so the same rows land in train and test on every run.
train_df, test_df = train_test_split(sample_df, test_size=0.2,
                                     stratify = list(sample_df.subreddit),
                                     random_state=0)
# Re-join with all training rows first, followed by all test rows.
sample_df = pd.concat([train_df, test_df])

In [68]:
def replace_bad_characters(string):
    """Replace each newline and carriage return with ``<lb>`` and each tab with ``<tab>``."""
    for raw, token in (('\n', '<lb>'), ('\r', '<lb>'), ('\t', '<tab>')):
        string = string.replace(raw, token)
    return string

# Work on a copy so sample_df keeps its raw text.
to_write = sample_df.copy()
# Encode line breaks and tabs in both text columns before writing tab-separated output.
to_write['title'] = [replace_bad_characters(s) for s in to_write.title]
to_write['selftext'] = [replace_bad_characters(s) for s in to_write.selftext]

to_write.to_csv('rspct.tsv' , sep='\t', index=False,)