In [160]:
import pandas as pd

import boto3
from boto3.dynamodb.conditions import Key, Attr

# Shared boto3 DynamoDB resource; credentials and region come from the
# ambient boto3 configuration (nothing is configured explicitly here).
dynamodb = boto3.resource('dynamodb')
# Handle to the posts index table, which this notebook queries by its 'day' key.
indexTable = dynamodb.Table('nr-wsip-posts_index')

In [161]:
def query_index_by_day(day):
    """Return every index-table item whose 'day' key equals ``day``.

    A single ``Table.query`` call may return only part of the result set; in
    that case the response carries a ``LastEvaluatedKey`` that must be passed
    back as ``ExclusiveStartKey`` to fetch the next page. This helper follows
    those continuation keys until the result is complete, so busy days are
    not silently truncated.

    Parameters
    ----------
    day : str
        Day to fetch, formatted like '2017-05-20'.

    Returns
    -------
    list of dict
        All matching items, in the order DynamoDB returned them.
    """
    query_kwargs = {'KeyConditionExpression': Key('day').eq(day)}
    day_items = []
    while True:
        response = indexTable.query(**query_kwargs)
        day_items.extend(response['Items'])
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            return day_items
        # More pages remain: resume right after the last key we saw.
        query_kwargs['ExclusiveStartKey'] = last_key


items_5_20 = query_index_by_day('2017-05-20')
items_5_21 = query_index_by_day('2017-05-21')

items = items_5_20 + items_5_21

In [None]:
def parse_decimals(item):
    """Convert the numeric fields of an index item to plain ``int``.

    The fields 'created_utc', 'last_updated' and 'score' are replaced in
    place by ``int(value)``. The same (mutated) mapping is returned so the
    function can be used inside a comprehension. A missing field raises
    ``KeyError``.
    """
    for field in ('created_utc', 'last_updated', 'score'):
        item[field] = int(item[field])
    return item
    
# Normalise the numeric fields of every raw index item, then build the frame.
# NOTE: this rebinds `items` from a list of dicts to a DataFrame, so the cell
# is not re-runnable without re-running the query cell first.
items = pd.DataFrame([parse_decimals(i) for i in items])

date_fields = ['created_utc', 'last_updated']

# Both fields hold epoch seconds; convert each column in one vectorized call
# instead of building a Timestamp per element through a Python lambda.
for d in date_fields:
    items[d] = pd.to_datetime(items[d], unit='s')

# How long each post was tracked: last update minus creation time.
items = items.assign(alive_time=items['last_updated'] - items['created_utc'])

# Handle to the per-post data table, looked up by post id in get_data_item().
dataTable = dynamodb.Table('nr-wsip-posts_data')

def get_data_item(id):
    """Fetch the data-table record for one post id.

    Parameters
    ----------
    id : str
        Post id used as the 'id' key of ``dataTable``.

    Returns
    -------
    dict
        The stored item, or an empty dict when the lookup fails (for example
        the call raises, or the response carries no 'Item' entry). A notice
        is printed for every failed lookup.
    """
    try:
        response = dataTable.get_item(Key={'id': id})
        return response['Item']
    except Exception:
        # Deliberately best-effort so one bad id does not abort the whole
        # fetch. `except Exception` (rather than a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit through, so the loop can be stopped.
        print("bad response for id {0}".format(id))
        return {}

# One lookup per post id. Iterate the column directly: Series.iteritems()
# is deprecated and was removed in pandas 2.0.
data = [get_data_item(post_id) for post_id in items['id']]
# Failed lookups contribute empty dicts, which become all-NaN rows here.
data = pd.DataFrame(data).set_index('id')

# Left-join the per-post data onto the index rows by post id.
items = items.join(data, on='id')

In [163]:
# Select 'score' before aggregating: this avoids aggregating every column
# only to throw the rest away, and groupby().median() over the whole frame
# raises on non-numeric columns in pandas >= 2.0.
score_by_sub = items.groupby("subreddit")['score']

# Number of posts (with a score) and median score per subreddit,
# both ordered by subreddit name, descending.
counts = score_by_sub.count().sort_index(ascending=False)
med_scores = score_by_sub.median().sort_index(ascending=False)

array(['SubredditDrama', nan, 'nottheonion', 'tifu', 'WTF', 'OutOfTheLoop',
       'news', 'BlackPeopleTwitter', 'gifs', 'europe', 'neoliberal',
       'movies', 'todayilearned', 'MMA', 'nfl', 'worldnews', 'technology',
       'AdviceAnimals', 'aww', 'hockey', 'pics', 'rupaulsdragrace', 'nba',
       'soccer', 'baseball', 'OldSchoolCool', 'MarchAgainstTrump',
       'videos', 'CringeAnarchy', 'gaming', 'politics', 'anime', 'funny',
       'leagueoflegends', 'PrequelMemes', 'Overwatch', 'Showerthoughts',
       'SquaredCircle', 'Jokes', 'GlobalOffensive', 'hiphopheads', 'DotA2',
       'conspiracy', 'relationships', 'DestinyTheGame', 'AskReddit',
       'dankmemes', 'The_Donald', 'mildlyinteresting', 'pcmasterrace',
       'me_irl'], dtype=object)

In [274]:
def get_diffs_for_sub(sub, non_good_max, good_min):
    """Find the words most associated with good / not-good posts in a subreddit.

    Posts of ``sub`` (taken from the global ``items`` frame) are labelled by
    score: ``score >= good_min`` is good, ``score <= non_good_max`` is not
    good, and anything in between is discarded. Each remaining post becomes a
    ``(text, is_good)`` pair, where the text is the title and selftext joined
    by a space, and the pairs are passed to ``word_diffs``.

    Returns
    -------
    dict
        ``{"best": ..., "worst": ...}`` holding the first ten entries of the
        ``word_diffs`` result and its last ten entries re-sorted ascending.
    """
    def label_for(score):
        # True = good, False = not good, None = in-between (dropped below).
        if score >= good_min:
            return True
        return False if score <= non_good_max else None

    in_sub = items[items['subreddit'] == sub]
    labelled = in_sub.assign(
        good=in_sub['score'].apply(label_for)
    ).dropna(subset=['good'])

    messages = [
        ("{0} {1}".format(post['title'], post['selftext']), post['good'])
        for _, post in labelled.iterrows()
    ]

    ranked = word_diffs(messages)
    return {"best": ranked.head(10), "worst": ranked.tail(10).sort_values()}
    

In [287]:
# Subreddits with more than 100 posts in the sample.
counts.loc[counts > 100].index

Index(['videos', 'politics', 'pics', 'nba', 'leagueoflegends', 'gaming',
       'funny', 'aww', 'The_Donald', 'Showerthoughts', 'Overwatch',
       'AskReddit'],
      dtype='object', name='subreddit')

In [296]:
# Word differences for every subreddit with more than 100 posts.
# Thresholds: score <= 3 counts as not good, score >= 5 counts as good.
busy_subs = counts[counts > 100].index
results = {sub: get_diffs_for_sub(sub, 3, 5) for sub in busy_subs}

rule = "================"
for sub_name, sub_result in results.items():
    print(rule)
    for label in ("best", "worst"):
        print("===== {0} - {1}".format(sub_name, label))
        print(sub_result[label])
    print(rule)

===== videos - best
big          0.150815
guy          0.139946
his          0.139946
1            0.139946
guess        0.088315
stores       0.088315
24           0.088315
come         0.088315
spongebob    0.088315
stream       0.088315
dtype: float64
===== videos - worst
my         -0.017663
energy     -0.006793
drink      -0.006793
first      -0.006793
few        -0.006793
streamer   -0.006793
snow       -0.006793
live       -0.006793
we         -0.006793
school     -0.006793
dtype: float64
===== politics - best
trump          0.225753
impeachment    0.085284
david          0.059643
thesis         0.059643
sheriff        0.059643
house          0.059643
arms           0.059643
plagiarized    0.046823
clarke         0.046823
russia         0.041806
dtype: float64
===== politics - worst
weiner    -0.122074
anthony   -0.122074
sexting   -0.069677
under     -0.069677
saudi     -0.062988
gop       -0.052954
iowa      -0.047938
profile   -0.047938
year      -0.047938
iran      -0.047938

In [294]:
from collections import Counter, defaultdict
import re

# Words ignored by tokenize(). Built once at definition time (not on every
# call) and stored as a frozenset so each membership test is O(1).
# NOTE(review): 'emptystring' looks like a placeholder for empty text
# fields in the stored posts -- confirm.
_STOP_WORDS = frozenset([
    'or', 'as', 's', 'an', 'than', 'do', 'so', 'no', 'it', 'how', 'be',
    'has', 'a', 'can', 'will', 'have', 'if', 'why', 'but', 'he', 'the',
    'and', 'on', 'all', 'is', 'emptystring', 'are', 'said', 'in', 'of',
    'just', 'that', 'i', 'with', 'was', 'r', 't', 'to', 'for', 'by', 'you',
    'there', 'not', 'from', 'this',
])

# A word is a run of lowercase letters, digits and apostrophes.
_WORD_RE = re.compile("[a-z0-9']+")


def tokenize(message):
    """Return the set of distinct non-stop words in ``message``.

    The message is lowercased, split into runs of letters, digits and
    apostrophes, and filtered against the stop-word list. Because the result
    is a set, each word counts at most once per message.

    Parameters
    ----------
    message : str

    Returns
    -------
    set of str
    """
    return {
        word
        for word in _WORD_RE.findall(message.lower())
        if word not in _STOP_WORDS
    }


def count_words(messages):
    """Count, per word, how many good and not-good messages contain it.

    ``messages`` is an iterable of ``(message, is_good)`` pairs. The result
    maps each word produced by ``tokenize`` to a two-element list
    ``[good_count, non_good_count]``. It is a ``defaultdict``, so looking up
    an unseen word yields (and stores) ``[0, 0]``.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, is_good in messages:
        for token in tokenize(text):
            # Slot 0 counts good messages, slot 1 counts the rest.
            tallies[token][int(not is_good)] += 1
    return tallies

def word_probabilities(counts, total_goodposts, total_non_goodposts, k=0.5):
    """Turn word counts into smoothed per-class probabilities.

    For every ``word -> (good, non_good)`` entry in ``counts`` this produces
    a triplet ``(word, p(word | good), p(word | ~good))``, where each
    probability is ``(count + k) / (class_total + 2 * k)``. ``k`` is the
    smoothing pseudo-count. Triplets follow the iteration order of ``counts``.
    """
    good_denom = total_goodposts + 2 * k
    non_good_denom = total_non_goodposts + 2 * k

    triplets = []
    for word, tally in counts.items():
        good, non_good = tally
        triplets.append((word,
                         (good + k) / good_denom,
                         (non_good + k) / non_good_denom))
    return triplets

def word_diffs(messages):
    """Rank words by how much more often they appear in good posts.

    Parameters
    ----------
    messages : list of (str, bool)
        ``(message, is_good)`` pairs. Must support ``len()``.

    Returns
    -------
    pandas.Series
        Indexed by word, holding ``p(word | good) - p(word | ~good)`` with
        smoothing ``k = 0.5``, sorted from most positive to most negative.
        The order among words with equal values is not meaningful. When no
        words are found (no messages, or only stop words) an empty float
        Series is returned instead of raising.
    """
    num_goodposts = sum(1 for _, is_good in messages if is_good)
    num_non_goodposts = len(messages) - num_goodposts

    # run training data through our "pipeline"
    k = 0.5
    word_probs = word_probabilities(count_words(messages),
                                    num_goodposts,
                                    num_non_goodposts,
                                    k)

    if not word_probs:
        # Nothing to rank; unpacking zip(*[]) would raise ValueError here.
        return pd.Series([], index=pd.Index([], dtype=object), dtype='float64')

    words = [word for word, _, _ in word_probs]
    diffs = [p_good - p_bad for _, p_good, p_bad in word_probs]
    # A single descending sort is enough; the result was previously sorted twice.
    return pd.Series(diffs, index=words).sort_values(ascending=False)