In [15]:

# Shared S3 handle; the helper functions in this notebook read this global.
s3 = boto3.resource('s3')
bucket_name = 'nr-wsip'

# Key prefixes of the two exported tables inside the bucket.
index_prefix, data_prefix = "index_output/", "data_output/"
# Posts whose (last_updated - created_utc) span is not longer than this many
# hours are discarded.
min_alive_hours = 4

index_df = s3_to_df(bucket_name, index_prefix, parse_index_row)
data_df = s3_to_df(bucket_name, data_prefix, parse_data_row)

# Attach the data columns to every index row by id, derive how long each post
# was tracked, and keep only the posts tracked for long enough.
posts = (
    index_df
    .join(data_df.set_index('id'), on='id')
    .assign(alive_time=lambda df: df['last_updated'] - df['created_utc'])
    .loc[lambda df: df['alive_time'] > pd.Timedelta(hours=min_alive_hours)]
)

# Distinct subreddit names present in the filtered posts.
subs = posts['subreddit'].unique()

In [24]:
# One filtered frame per subreddit, keyed by subreddit name.
sub_posts = {
    name: posts.loc[posts["subreddit"] == name]
    for name in subs
}

In [12]:
import pandas as pd
import boto3
import json

def to_dates(keys):
    """Collect the distinct second path segments of a list of keys.

    Each key is split on '/'; the segment right after the first slash is
    collected. Keys that have no such segment (no slash at all) or whose
    segment is empty (e.g. 'prefix/') are ignored.

    Args:
        keys: iterable of '/'-separated key strings.

    Returns:
        A set of the non-empty second segments.
    """
    dates = set()
    for key in keys:
        parts = key.split('/')
        # Guard against keys without any '/' so a stray object name cannot
        # raise IndexError (to_latest_output_key applies a similar length check).
        if len(parts) < 2 or parts[1] == "":
            continue
        dates.add(parts[1])
    return dates

def to_latest_output_key(latest_output_time, output_keys):
    """Return the first data key that belongs to the given output time.

    A key qualifies when it has at least three '/'-separated segments, its
    second segment equals ``latest_output_time`` and its third segment is
    neither '_SUCCESS' nor 'manifest'.

    Args:
        latest_output_time: value the second key segment must equal.
        output_keys: iterable of key strings, scanned in order.

    Returns:
        The first qualifying key, or None when there is none.
    """
    marker_files = ('_SUCCESS', 'manifest')
    for candidate in output_keys:
        segments = candidate.split('/')
        if len(segments) < 3:
            continue
        timestamp, filename = segments[1], segments[2]
        if timestamp == latest_output_time and filename not in marker_files:
            return candidate
    return None

def get_latest_output_key(bucket_name, prefix):
    """Find the data object key of the most recent output under a prefix.

    Lists every object key under ``prefix`` in the bucket (through the
    notebook-level ``s3`` resource), takes the greatest second path segment
    as the latest output time, and returns the first non-marker key for it.

    Args:
        bucket_name: name of the S3 bucket to list.
        prefix: key prefix to list under.

    Returns:
        The matching key, or None when the latest output only holds
        '_SUCCESS' / 'manifest' entries.

    Raises:
        ValueError: when no key under the prefix carries an output-time
            segment (for example, the prefix is empty).
    """
    bucket = s3.Bucket(bucket_name)
    output_keys = [obj.key for obj in bucket.objects.filter(Prefix=prefix)]
    output_times = to_dates(output_keys)
    if not output_times:
        # max() on an empty set would raise a ValueError with no context;
        # raise the same exception type with a message that names the location.
        raise ValueError(
            "no outputs found under s3://{0}/{1}".format(bucket_name, prefix))
    # NOTE(review): max() compares the segments as strings, so "latest" relies
    # on the segment format sorting lexicographically - confirm.
    latest_time = max(output_times)
    return to_latest_output_key(latest_time, output_keys)

def s3_to_df(bucket_name, prefix, parser):
    """Load the latest output object under a prefix into a DataFrame.

    The object is read in full, decoded as UTF-8 and split into lines on
    '\\n'; every non-blank line is passed to ``parser`` and the parsed
    results become the rows of the returned frame.

    Args:
        bucket_name: name of the S3 bucket.
        prefix: key prefix whose latest output should be loaded.
        parser: callable turning one text line into a dict of column values.

    Returns:
        A pandas DataFrame built from the parsed rows.
    """
    key = get_latest_output_key(bucket_name, prefix)
    body = s3.Object(bucket_name, key).get()['Body'].read().decode('utf-8')
    # Skip blank lines instead of unconditionally dropping the last element:
    # the final record is kept when the object has no trailing newline, and
    # stray empty lines never reach the parser. Splitting stays on "\n" only
    # (not splitlines()) so other line-separator characters inside a record
    # are not treated as row boundaries.
    rows = [line for line in body.split("\n") if line.strip()]
    return pd.DataFrame([parser(row) for row in rows])

def to_ts(unix_str):
    """Convert a value holding whole epoch seconds into a pandas Timestamp.

    Args:
        unix_str: anything ``int()`` accepts, interpreted as seconds.

    Returns:
        The corresponding ``pd.Timestamp``.
    """
    seconds = int(unix_str)
    return pd.Timestamp(seconds, unit='s')

def get_string_prop(prop, row):
    """Return the value stored under the 's' key of ``row[prop]``.

    Args:
        prop: name of the attribute to read.
        row: mapping of attribute name to a mapping holding an 's' entry.

    Returns:
        ``row[prop]['s']``.
    """
    wrapped = row[prop]
    return wrapped['s']

def parse_index_row(row):
    """Parse one JSON line of the index export into a flat dict.

    Every attribute in the decoded line is wrapped in a one-key mapping:
    'n' for values that are converted numerically here, 's' for values taken
    as-is. (Looks like a DynamoDB-style export - confirm.)

    Args:
        row: a JSON document as text.

    Returns:
        A dict with 'created_utc' and 'last_updated' as timestamps, 'day' and
        'id' as stored, and 'score' as an int.
    """
    record = json.loads(row)
    return {
        'created_utc': to_ts(record['created_utc']['n']),
        'last_updated': to_ts(record['last_updated']['n']),
        'day': record['day']['s'],
        'id': record['id']['s'],
        'score': int(record['score']['n']),
    }

def parse_data_row(row):
    """Parse one JSON line of the data export into a flat dict.

    Args:
        row: a JSON document as text.

    Returns:
        A dict holding the 's' value of the 'id', 'author', 'title',
        'permalink', 'selftext' and 'subreddit' attributes.
    """
    record = json.loads(row)
    parsed = {}
    for field in ('id', 'author', 'title', 'permalink', 'selftext', 'subreddit'):
        parsed[field] = get_string_prop(field, record)
    return parsed

In [86]:
def posts_to_data(posts):
    """Bundle the time-of-day and word summaries for a set of posts.

    Args:
        posts: frame of posts handed to both summary functions.

    Returns:
        A dict with the time-of-day summary under 'time_of_day' and the word
        summary under 'words'.
    """
    time_of_day = get_time_of_day_data(posts)
    words = get_word_probs(posts)
    return {"time_of_day": time_of_day, "words": words}

# Summarise every subreddit, then persist the summaries as one JSON document.
sub_data = {
    name: posts_to_data(frame)
    for name, frame in sub_posts.items()
}
with open('sub_data.json', 'w') as outfile:
    json.dump(sub_data, outfile)

In [77]:
def get_time_of_day_data(posts):
    """Summarise post scores per half-hour slot of the day.

    The creation time of every post is reduced to its minute of the day and
    assigned to one of 48 half-hour slots (slot 0 covers 00:00-00:29, slot 47
    covers 23:30-23:59). For each slot the median score and the number of
    scored posts are reported.

    Args:
        posts: DataFrame with a 'created_utc' column of timestamps (objects
            exposing ``hour`` and ``minute``) and a 'score' column.

    Returns:
        A dict of three parallel lists of length 48: 'bin' (slot number),
        'med' (median score, 0 for a slot without posts) and 'count'
        (number of non-null scores in the slot).
    """
    bin_minutes = 30
    n_bins = (24 * 60) // bin_minutes
    edges = [bin_minutes * n for n in range(n_bins + 1)]
    group_names = list(range(n_bins))

    created_minutes = [(t.hour * 60) + t.minute for t in posts['created_utc']]
    # right=False makes every slot [start, end): minute 0 lands in slot 0 and
    # minute 30 in slot 1. With the default right-closed bins a post created
    # at exactly 00:00 matched no slot and was dropped, and every :00/:30
    # boundary post was counted one slot too early.
    time_bins = pd.cut(created_minutes, edges, right=False, labels=group_names)
    posts = posts.assign(time_bin=time_bins)

    bin_results = {'bin': [], 'med': [], 'count': []}
    for t in group_names:
        bin_scores = posts.loc[posts["time_bin"] == t, 'score']
        # An empty slot has no median (NaN); report 0 instead.
        bin_med_score = bin_scores.median() if len(bin_scores) else 0
        bin_results['bin'].append(t)
        bin_results['med'].append(bin_med_score)
        # int() yields a plain, JSON-serialisable Python integer.
        bin_results['count'].append(int(bin_scores.count()))

    return bin_results

In [84]:
# Scratch check: the index and the values of a Series can be pulled out
# separately, and the values convert to a plain Python list.
bla = [1, 2, 3]
za = ['a', 'b', 'c']

i = pd.Series(data=bla, index=za).index
v = pd.Series(data=bla, index=za).values

v.tolist()

[1, 2, 3]

In [85]:
import math
from collections import defaultdict
import re

def to_message(row):
    """Turn a post row into a (text, label) pair.

    Args:
        row: mapping with 'title', 'selftext' and 'good' entries.

    Returns:
        A tuple of the title and selftext joined by a single space, and the
        row's 'good' value.
    """
    label = row['good']
    text = "{0} {1}".format(row['title'], row['selftext'])
    return text, label

def classify_posts(lower, upper, score):
    """Label a score as good, bad or undecided.

    Args:
        lower: scores strictly below this are labelled bad.
        upper: scores at or above this are labelled good.
        score: the score to label.

    Returns:
        True when ``score >= upper``, False when ``score < lower``, and None
        for anything else. The upper check is made first.
    """
    if score >= upper:
        return True
    if score < lower:
        return False
    return None
    
def probs_to_distance(p_good, p_bad):
    """Return the difference ``p_good - p_bad`` divided by sqrt(2).

    The result is positive when ``p_good`` is the larger value and negative
    when ``p_bad`` is.
    """
    difference = p_good - p_bad
    return difference / math.sqrt(2)

def get_word_probs(posts):
    """Rank words by how strongly they separate good posts from bad ones.

    Posts scoring at or above the upper threshold (the 0.66 score quantile,
    but at least 4) are labelled good, posts scoring below 3 are labelled
    bad, and the rest are left out. Every word seen in the labelled posts
    gets a distance derived from its good/bad probabilities; the ten largest
    and ten smallest distances are reported.

    Args:
        posts: DataFrame with 'score', 'title' and 'selftext' columns.

    Returns:
        A dict with 'best' and 'worst' entries, each holding parallel 'words'
        and 'probs' lists. 'best' is ordered from the largest distance down,
        'worst' from the smallest distance up. All lists are empty when no
        word was found.
    """
    lower_score = 3
    upper_score = max([posts['score'].quantile(0.66), 4])

    labels = posts['score'].apply(
        lambda score: classify_posts(lower_score, upper_score, score))
    # Posts that are neither good nor bad carry a null label and are dropped.
    labelled = posts.assign(good=labels).dropna(subset=['good'])

    messages = [to_message(row) for _, row in labelled.iterrows()]
    distances = [
        (word, probs_to_distance(p_good, p_bad))
        for word, p_good, p_bad in messages_to_word_probabilities(messages)
    ]

    if not distances:
        return {
            "best": {"words": [], "probs": []},
            "worst": {"words": [], "probs": []},
        }

    words, dists = zip(*distances)
    ranked = pd.Series(dists, index=words).sort_values()
    worst = ranked.head(10)
    best = ranked.tail(10).sort_values(ascending=False)
    return {
        "best": {"words": best.index.tolist(), "probs": best.values.tolist()},
        "worst": {"words": worst.index.tolist(), "probs": worst.values.tolist()},
    }

    
# Words ignored by tokenize(). Kept as a frozenset so it is built once and
# membership tests are constant-time.
_STOP_WORDS = frozenset({
    'our', 'their', 'your', 'one', 'about', "i'm", 'up', 'out', 'am', 'any',
    'like', 'when', 'now', 'her', "it's", 'we', 'us', 'they', 'he', 'she',
    'his', 'u', 'com', 'http', 'https', 'www', 'or', 'who', 'would', 'had',
    'at', 'got', 'l', 'my', 'me', 'does', 'get', 'were', 'what', 'too', 'as',
    's', 'an', 'than', 'do', 'so', 'no', 'it', 'how', 'be', 'has', 'a', 'can',
    'will', 'have', 'if', 'why', 'but', 'the', 'and', 'on', 'all', 'is',
    'emptystring', 'are', 'said', 'in', 'of', 'just', 'that', 'i', 'with',
    'was', 'r', 't', 'to', 'for', 'by', 'you', 'there', 'not', 'from', 'this',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
})


def tokenize(message):
    """Split a message into its set of distinct, non-stop-word tokens.

    The message is lower-cased, then every run of letters a-z, digits and
    apostrophes is taken as a word; words listed in the stop-word set are
    discarded.

    Args:
        message: the text to tokenize.

    Returns:
        A set of the remaining words (each word appears once).
    """
    # The earlier inline stop list was missing a comma between 'any' and
    # 'at', which silently produced the bogus stop word 'anyat'; both words
    # are listed separately in _STOP_WORDS.
    all_words = re.findall("[a-z0-9']+", message.lower())
    return {word for word in all_words if word not in _STOP_WORDS}
    
def count_words(messages):
    """Count, per word, how many good and non-good messages contain it.

    Args:
        messages: iterable of (message, is_good) pairs.

    Returns:
        A defaultdict mapping each word to a two-item list: index 0 is the
        number of good messages containing the word, index 1 the number of
        non-good messages containing it. A word is counted at most once per
        message because tokenize() returns a set.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, is_good in messages:
        column = 0 if is_good else 1
        for token in tokenize(text):
            tallies[token][column] += 1
    return tallies
    
def messages_to_word_probabilities(messages):
    """Compute smoothed per-word probabilities from labelled messages.

    Args:
        messages: sequence of (message, is_good) pairs.

    Returns:
        The list of (word, p(word | good), p(word | not good)) triplets
        produced by word_probabilities() with a smoothing constant of 0.5.
    """
    good_total = sum(1 for _, is_good in messages if is_good)
    non_good_total = len(messages) - good_total

    smoothing = 0.5
    tallies = count_words(messages)
    return word_probabilities(tallies, good_total, non_good_total, smoothing)

def word_probabilities(counts, total_goodposts, total_non_goodposts, k=0.5):
    """Convert word counts into smoothed conditional probabilities.

    Args:
        counts: mapping of word to a (good_count, non_good_count) pair.
        total_goodposts: number of good posts.
        total_non_goodposts: number of non-good posts.
        k: smoothing constant added to every count (and twice to each total).

    Returns:
        A list of (word, p(word | good), p(word | not good)) triplets, in the
        iteration order of ``counts``.
    """
    good_denominator = total_goodposts + 2 * k
    non_good_denominator = total_non_goodposts + 2 * k
    triplets = []
    for word, (good, non_good) in counts.items():
        p_good = (good + k) / good_denominator
        p_non_good = (non_good + k) / non_good_denominator
        triplets.append((word, p_good, p_non_good))
    return triplets
