In [None]:
import pandas as pd
import boto3
import json

# S3 resource handle and the name of the bucket the exports are read from.
s3 = boto3.resource('s3')

bucket_name = 'nr-wsip'

# Keys of every object stored under each of the two export prefixes.
index_output_keys = [
    summary.key
    for summary in s3.Bucket(bucket_name).objects.filter(Prefix="index_output/")
]
data_output_keys = [
    summary.key
    for summary in s3.Bucket(bucket_name).objects.filter(Prefix="data_output/")
]

def to_dates(keys):
    """Return the set of distinct second path segments found in ``keys``.

    Each key is split on '/'. Keys whose second segment is empty (for
    example a bare ``"prefix/"`` entry) are ignored.
    """
    second_segments = (key.split('/')[1] for key in keys)
    return {segment for segment in second_segments if segment != ""}

# Distinct second-level path segments found under each prefix.
index_output_times = to_dates(index_output_keys)
data_output_times = to_dates(data_output_keys)

# max() compares the segment strings lexicographically and keeps the greatest.
latest_index_output_time = max(index_output_times)
latest_data_output_time = max(data_output_times)

def to_latest_output_key(latest_output_time, output_keys):
    """Return the first key under ``latest_output_time`` that is not a marker file.

    A key qualifies when it has at least three '/'-separated segments, its
    second segment equals ``latest_output_time`` and its third segment is
    neither ``_SUCCESS`` nor ``manifest``. Returns ``None`` when no key
    qualifies.
    """
    marker_names = ('_SUCCESS', 'manifest')
    for key in output_keys:
        segments = key.split('/')
        if len(segments) < 3:
            continue
        if segments[1] == latest_output_time and segments[2] not in marker_names:
            return key
    return None
        
# For each export, pick the first non-marker object under its latest segment.
latest_index_output_key = to_latest_output_key(
    latest_output_time=latest_index_output_time,
    output_keys=index_output_keys,
)
latest_data_output_key = to_latest_output_key(
    latest_output_time=latest_data_output_time,
    output_keys=data_output_keys,
)


def to_ts(unix_str):
    """Convert a Unix time in whole seconds (string or int) to a ``pd.Timestamp``."""
    seconds = int(unix_str)
    return pd.Timestamp(seconds, unit='s')

def parse_index_row(row):
    """Decode one JSON line of the index export into a flat record.

    Every field is read from a nested single-key mapping: values under ``'n'``
    are converted (timestamps through ``to_ts``, the score through ``int``)
    and values under ``'s'`` are copied unchanged.
    NOTE(review): this looks like a typed-attribute export format; confirm
    against whatever produces the files.

    Returns a dict with keys ``created_utc``, ``last_updated``, ``day``,
    ``id`` and ``score``.
    """
    item = json.loads(row)
    return {
        'created_utc': to_ts(item['created_utc']['n']),
        'last_updated': to_ts(item['last_updated']['n']),
        'day': item['day']['s'],
        'id': item['id']['s'],
        'score': int(item['score']['n']),
    }

def get_string_prop(prop, row):
    """Return the value stored under ``'s'`` inside ``row[prop]``."""
    attribute = row[prop]
    return attribute['s']

def parse_data_row(row):
    """Decode one JSON line of the data export into a flat dict of strings.

    The result holds the ``id``, ``author``, ``title``, ``permalink``,
    ``selftext`` and ``subreddit`` fields, each extracted with
    ``get_string_prop``.
    """
    item = json.loads(row)
    record = {}
    for prop in ('id', 'author', 'title', 'permalink', 'selftext', 'subreddit'):
        record[prop] = get_string_prop(prop, item)
    return record
    

def get_s3_obj(bucket_name, key):
    """Fetch the object at ``key`` in ``bucket_name`` and return its body decoded as UTF-8 text."""
    response = s3.Object(bucket_name, key).get()
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')
    
def index_obj_to_df(index_obj):
    """Build a DataFrame from the text of a newline-delimited JSON index export.

    Args:
        index_obj: full text of the export, one JSON record per line.

    Returns:
        A DataFrame with one row per non-blank line, each row being the dict
        produced by ``parse_index_row``. Empty or all-blank text yields an
        empty DataFrame.
    """
    # Keep every non-blank line rather than unconditionally dropping the last
    # split element: that slice silently lost the final record whenever the
    # text did not end with a newline, and blank lines crashed the parser.
    rows = [row for row in index_obj.split("\n") if row.strip()]
    return pd.DataFrame([parse_index_row(row) for row in rows])

def data_obj_to_df(data_obj):
    """Build a DataFrame from the text of a newline-delimited JSON data export.

    Args:
        data_obj: full text of the export, one JSON record per line.

    Returns:
        A DataFrame with one row per non-blank line, each row being the dict
        produced by ``parse_data_row``. Empty or all-blank text yields an
        empty DataFrame.
    """
    # Keep every non-blank line rather than unconditionally dropping the last
    # split element: that slice silently lost the final record whenever the
    # text did not end with a newline, and blank lines crashed the parser.
    rows = [row for row in data_obj.split("\n") if row.strip()]
    return pd.DataFrame([parse_data_row(row) for row in rows])

    
# Download and parse the latest index and data exports.
index = index_obj_to_df(get_s3_obj(bucket_name, latest_index_output_key))
data = data_obj_to_df(get_s3_obj(bucket_name, latest_data_output_key))

# Attach the data columns to each index row by id, then derive how long each
# post was tracked (last update minus creation).
posts = (
    index
    .join(data.set_index('id'), on='id')
    .assign(alive_time=lambda joined: joined['last_updated'] - joined['created_utc'])
)

In [None]:
subs = posts['subreddit'].unique()

# One summary record per subreddit that has more than 500 posts which were
# alive for more than 4 hours.
result = []
for sub in subs:
    sub_posts = posts[posts['subreddit'] == sub]
    sub_posts = sub_posts[sub_posts['alive_time'] > pd.Timedelta(hours=4)]
    # Skip small subreddits before computing any statistics for them.
    if len(sub_posts) <= 500:
        continue
    result.append({
        "sub": sub,
        "lower": sub_posts['score'].quantile(0.33),
        "upper": sub_posts['score'].quantile(0.66),
        "count": len(sub_posts),
    })

# Naming the columns explicitly keeps the frame well-formed when `result` is
# empty; without them sort_values(by="lower") raises KeyError in that case.
popular_subs_summary = pd.DataFrame(
    result, columns=["sub", "lower", "upper", "count"]
).sort_values(by="lower", ascending=False)

In [None]:
# Subreddit names taken from the summary frame, in the summary's row order.
popular_subs = popular_subs_summary['sub'].values

In [None]:
# NOTE(review): get_diffs_for_sub is defined in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError unless that cell has
# already been executed. Consider moving the definitions above this cell.
# Print the result of get_diffs_for_sub for every popular subreddit, using a
# minimum alive time of 4 hours.
for sub in popular_subs:
    diffs = get_diffs_for_sub(sub, 4)
    print(sub, diffs)

In [None]:
from collections import Counter, defaultdict
import re

def get_diffs_for_sub(sub, min_alive_hours):
    """Return the words most associated with high-scoring posts in a subreddit.

    Posts of ``sub`` that were alive for at least ``min_alive_hours`` are
    labelled by score: "good" when the score is at or above the 66th
    percentile, "not good" when it is below the 33rd percentile, and dropped
    when it falls in between. Both thresholds are floored at 2. The title and
    selftext of each labelled post are then fed to
    ``messages_to_word_probabilities``.

    Args:
        sub: value matched against the ``subreddit`` column of ``posts``.
        min_alive_hours: minimum ``alive_time`` of a post, in hours.

    Returns:
        A ``pd.Series`` indexed by word holding the smoothed p(word | good)
        of the ten highest-ranked words, in descending order. The Series is
        empty when no words are available.
    """
    sub_posts = posts[posts['subreddit'] == sub]
    sub_posts = sub_posts[sub_posts['alive_time'] >= pd.Timedelta(hours=min_alive_hours)]

    lower = max(sub_posts['score'].quantile(0.33), 2)
    # The upper threshold used to repeat the 0.33 quantile, which made it equal
    # to `lower` and meant the middle band of posts was never excluded.
    upper = max(sub_posts['score'].quantile(0.66), 2)

    def classify(score):
        """Return True for good, False for not good, None for the middle band."""
        if score >= upper:
            return True
        if score < lower:
            return False
        return None

    sub_posts = sub_posts.assign(good=sub_posts['score'].apply(classify))
    # Posts in the middle band carry None and are discarded here.
    sub_posts = sub_posts.dropna(subset=['good'])

    # One (text, is_good) pair per post; the text is title and selftext joined.
    messages = [
        ("{0} {1}".format(title, selftext), is_good)
        for title, selftext, is_good in zip(
            sub_posts['title'], sub_posts['selftext'], sub_posts['good']
        )
    ]

    word_probs = messages_to_word_probabilities(messages)
    if not word_probs:
        # Nothing to rank; unpacking zip(*[]) below would raise ValueError.
        return pd.Series(dtype=float)

    words, p_good, _p_bad = zip(*word_probs)
    ranked = pd.Series(p_good, index=words).sort_values(ascending=False)
    return ranked.head(10)


# Words ignored by tokenize(): common English function words, URL fragments,
# single letters and digits, and the literal token 'emptystring'.
# Stored as a frozenset so it is built once and membership tests are O(1).
_STOP_WORDS = frozenset([
    'our', 'their', 'your', 'one', 'about', "i'm", 'up', 'out', 'am', 'any',
    'like', 'when', 'now', 'her', "it's", 'we', 'us', 'they', 'he', 'she',
    'his', 'u', 'com', 'http', 'https', 'www', 'or', 'who', 'would', 'had',
    'at', 'got', 'l', 'my', 'me', 'does', 'get', 'were', 'what', 'too', 'as',
    's', 'an', 'than', 'do', 'so', 'no', 'it', 'how', 'be', 'has', 'a', 'can',
    'will', 'have', 'if', 'why', 'but', 'the', 'and', 'on', 'all', 'is',
    'emptystring', 'are', 'said', 'in', 'of', 'just', 'that', 'i', 'with',
    'was', 'r', 't', 'to', 'for', 'by', 'you', 'there', 'not', 'from', 'this',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
])


def tokenize(message):
    """Return the set of distinct, non-stop-word tokens in ``message``.

    The message is lower-cased and split into runs of ``a-z``, ``0-9`` and
    apostrophes; tokens listed in ``_STOP_WORDS`` are discarded.

    The stop list previously contained ``'any' 'at'`` with a missing comma,
    which Python concatenated into the single bogus entry ``'anyat'``.
    """
    words = re.findall("[a-z0-9']+", message.lower())
    return {word for word in words if word not in _STOP_WORDS}


def count_words(messages):
    """Count, per word, how many good and non-good messages contain it.

    ``messages`` is an iterable of ``(message, is_good)`` pairs. The result is
    a ``defaultdict`` mapping each token produced by ``tokenize`` to a
    two-element list ``[good_count, non_good_count]``.
    """
    good_slot, non_good_slot = 0, 1
    counts = defaultdict(lambda: [0, 0])
    for text, is_good in messages:
        slot = good_slot if is_good else non_good_slot
        for word in tokenize(text):
            counts[word][slot] += 1
    return counts

def word_probabilities(counts, total_goodposts, total_non_goodposts, k=0.5):
    """Convert per-word counts into smoothed conditional probabilities.

    ``counts`` maps each word to a ``(good, non_good)`` pair. The result is a
    list of ``(word, p(word | good), p(word | ~good))`` triplets, where ``k``
    is added to every count and ``2 * k`` to every total.
    """
    triplets = []
    for word, (good, non_good) in counts.items():
        p_word_given_good = (good + k) / (total_goodposts + 2 * k)
        p_word_given_non_good = (non_good + k) / (total_non_goodposts + 2 * k)
        triplets.append((word, p_word_given_good, p_word_given_non_good))
    return triplets

def messages_to_word_probabilities(messages):
    """Compute smoothed word probabilities from labelled messages.

    ``messages`` is a sequence of ``(message, is_good)`` pairs. Returns the
    list of ``(word, p(word | good), p(word | ~good))`` triplets produced by
    ``word_probabilities`` with a smoothing constant of 0.5.
    """
    smoothing = 0.5
    good_total = sum(1 for _, is_good in messages if is_good)
    non_good_total = len(messages) - good_total

    # Count word occurrences per class, then turn the counts into probabilities.
    per_word_counts = count_words(messages)
    return word_probabilities(per_word_counts, good_total, non_good_total, smoothing)
#     words_and_diffs = [(word, (p_good - p_bad)) for word, p_good, p_bad in word_probs]
    
    # try just the p_good
#     words_and_diffs = [(word, p_good) for word, p_good, p_bad in word_probs]

#     if len(words_and_diffs) == 0:
#         return pd.Series()
#     words, diffs = zip(*words_and_diffs)
#     diffs = pd.Series(diffs, index=words).sort_values(ascending=False)
#     return diffs.sort_values(ascending=False)

In [None]:
# Posts whose subreddit is 'soccer'; the exploratory cells below work on these.
sh_items = posts.loc[posts['subreddit'] == 'soccer']
# Minimum alive time, in hours, applied before charting scores.
min_alive_time = 8

%matplotlib inline
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# Scores of the selected posts that were alive for more than min_alive_time hours.
scores = sh_items.loc[sh_items['alive_time'] > pd.Timedelta(hours=min_alive_time), 'score']
# Chart only scores below 51, drawn on the axes created above.
scores[scores < 51].hist(bins=50, ax=ax)


In [None]:
# Creation timestamps of the posts selected into sh_items.
created_times = sh_items['created_utc']

In [None]:
# Inspect the minute component of the first creation timestamp.
created_times.iloc[0].minute

In [None]:


created_times = sh_items['created_utc']

# Minute of the day (0..1439) at which each post was created. This used to be
# built from `created_minutes` itself, which is undefined on a fresh kernel.
created_minutes = created_times.dt.hour * 60 + created_times.dt.minute

In [None]:
# Histogram of created_minutes in 48 bins, with the x-axis fixed to 0..1441.
ax = created_minutes.hist(bins=48)
ax.set_xlim(0, 1441)

In [None]:
def get_subreddit_time_data(subreddit_name, min_alive_hours=4):
    """Summarise the scores of one subreddit by half-hour slot of creation time.

    Args:
        subreddit_name: value matched against the ``subreddit`` column of
            ``posts``.
        min_alive_hours: only posts whose ``alive_time`` is strictly greater
            than this many hours are used. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        A DataFrame indexed by ``bin`` (0..47), where bin ``n`` covers the
        minutes ``[30 * n, 30 * n + 30)`` of the day according to
        ``created_utc``. It has the columns ``med`` (median score, NaN for an
        empty bin) and ``count`` (number of scores in the bin).
    """

    def to_minute_of_day(timestamp):
        return (timestamp.hour * 60) + timestamp.minute

    sub_posts = posts[posts["subreddit"] == subreddit_name]
    sub_posts = sub_posts[sub_posts['alive_time'] > pd.Timedelta(hours=min_alive_hours)]
    created_minutes = [to_minute_of_day(t) for t in sub_posts['created_utc']]

    bin_edges = [30 * n for n in range(49)]
    group_names = range(48)
    # right=False makes the bins left-closed. With the default right-closed
    # bins, posts created at minute 0 fell outside every bin and each boundary
    # minute (30, 60, ...) was counted in the previous half hour.
    time_bins = pd.cut(created_minutes, bin_edges, labels=group_names, right=False)
    sub_posts = sub_posts.assign(time_bin=time_bins)

    bin_results = {'bin': [], 'med': [], 'count': []}
    for t in group_names:
        bin_scores = sub_posts.loc[sub_posts["time_bin"] == t, 'score']
        bin_results['bin'].append(t)
        bin_results['med'].append(bin_scores.median())
        bin_results['count'].append(bin_scores.count())

    return pd.DataFrame(bin_results).set_index('bin')

In [None]:
# Plot the frame returned by get_subreddit_time_data for the 'aww' subreddit.
get_subreddit_time_data('aww').plot()

In [None]:
# NOTE(review): `categories` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — presumably a leftover from
# an earlier session; confirm its source or remove the cell.
categories.value_counts().sort_index().plot()