In [1]:
import pandas as pd

import boto3
from boto3.dynamodb.conditions import Key, Attr

# Shared DynamoDB handles used by the cells below. No region or credentials
# are passed here, so boto3 has to resolve them from its default configuration
# -- NOTE(review): confirm the execution environment provides them.
dynamodb = boto3.resource('dynamodb')
indexTable = dynamodb.Table('nr-wsip-posts_index')

In [2]:
def query_index_by_day(day):
    """Return every index record whose 'day' key equals `day`.

    A single Table.query call only returns one page of results; when more
    remain, the response carries 'LastEvaluatedKey'. This helper keeps
    re-querying with 'ExclusiveStartKey' until that marker is gone, so busy
    days are not silently truncated.
    """
    query_kwargs = {'KeyConditionExpression': Key('day').eq(day)}
    day_items = []
    while True:
        response = indexTable.query(**query_kwargs)
        day_items.extend(response['Items'])
        last_key = response.get('LastEvaluatedKey')
        if last_key is None:
            return day_items
        # Resume the query right after the last record of the previous page.
        query_kwargs['ExclusiveStartKey'] = last_key


items_5_20 = query_index_by_day('2017-05-20')
items_5_21 = query_index_by_day('2017-05-21')

items = items_5_20 + items_5_21

In [3]:
def parse_decimals(item):
    """Convert the 'created_utc', 'last_updated' and 'score' entries of
    `item` to int.

    The mapping is modified in place and the same object is returned.
    """
    for field_name in ('created_utc', 'last_updated', 'score'):
        as_int = int(item[field_name])
        item[field_name] = as_int
    return item
    
# Normalise the numeric fields of every index record, then load them into a
# frame in one step (avoids rebinding `items` to an intermediate list).
items = pd.DataFrame([parse_decimals(i) for i in items])

date_fields = ['created_utc', 'last_updated']

# The date fields are interpreted as epoch seconds. pd.to_datetime converts the
# whole column at once instead of building one Timestamp per row in Python.
for d in date_fields:
    items[d] = pd.to_datetime(items[d], unit='s')

# alive_time: span between a post's creation and its last recorded update.
items = items.assign(alive_time=(items['last_updated'] - items['created_utc']))

dataTable = dynamodb.Table('nr-wsip-posts_data')

def get_data_item(id):
    """Look up the record with key `id` in the module-level `dataTable`.

    Best-effort by design: returns the stored item, or an empty dict when the
    request fails or no item exists for that key, so one bad lookup does not
    abort a bulk fetch.
    """
    try:
        response = dataTable.get_item(Key={'id': id})
    except Exception:
        # Still swallow request failures, but (unlike a bare `except:`) let
        # KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
        return {}
    # get_item leaves 'Item' out of the response when nothing matches the key.
    return response.get('Item', {})

# Fetch the detail record for every indexed post (failed lookups come back as
# {} and become all-NaN rows). The Series is iterated directly:
# Series.iteritems() no longer exists in pandas 2.x, and the loop variable no
# longer shadows the builtin `id`.
data = pd.DataFrame(
    [get_data_item(post_id) for post_id in items['id']]
).set_index('id')

items = items.join(data, on='id')

In [4]:
# Preview only the first rows instead of rendering the whole joined frame.
items.head(10)

Unnamed: 0,created_utc,day,id,last_updated,score,alive_time,author,permalink,selftext,subreddit,title
0,2017-05-20 00:05:42,2017-05-20,6c7dhz,2017-05-21 04:16:05,74,1 days 04:10:23,Mr_Cryptic,/r/SubredditDrama/comments/6c7dhz/is_bigotry_t...,emptyString,SubredditDrama,Is bigotry towards bigotry bigoted or begotten?
1,2017-05-20 00:06:20,2017-05-20,6c7dl9,2017-05-21 00:46:24,10,1 days 00:40:04,,,,,
2,2017-05-20 00:12:03,2017-05-20,6c7end,2017-05-21 04:16:09,30,1 days 04:04:06,jaykirsch,/r/nottheonion/comments/6c7end/theres_a_storm_...,emptyString,nottheonion,There's a storm chaser traffic jam in Tornado ...
3,2017-05-20 00:31:40,2017-05-20,6c7i6n,2017-05-21 00:46:29,20,1 days 00:14:49,,,,,
4,2017-05-20 01:05:49,2017-05-20,6c7nxh,2017-05-21 04:16:09,36,1 days 03:10:20,LuvBamboo,/r/nottheonion/comments/6c7nxh/study_booze_cof...,emptyString,nottheonion,"Study: Booze, coffee and lack of breaks is imp..."
5,2017-05-20 01:09:15,2017-05-20,6c7ojq,2017-05-21 04:16:05,24,1 days 03:06:50,i_post_gibberish,/r/SubredditDrama/comments/6c7ojq/is_capitalis...,emptyString,SubredditDrama,Is capitalism radiant and the prognosis for so...
6,2017-05-20 01:09:37,2017-05-20,6c7olt,2017-05-21 01:56:13,26,1 days 00:46:36,Emranotkool,/r/tifu/comments/6c7olt/tifu_a_strangers_wedding/,This TIFU happened earlier today and the Weddi...,tifu,TIFU a Strangers Wedding
7,2017-05-20 01:18:24,2017-05-20,6c7q0b,2017-05-21 02:36:11,28,1 days 01:17:47,Slappin45,/r/tifu/comments/6c7q0b/tifu_by_slitting_my_ow...,"I was at work kicking ass, had to cut up some ...",tifu,TIFU by slitting my own wrist on accident
8,2017-05-20 01:27:48,2017-05-20,6c7riz,2017-05-21 01:41:07,73,1 days 00:13:19,thesimplemachine,/r/WTF/comments/6c7riz/accused_shooter_alleged...,emptyString,WTF,Accused Shooter Allegedly Poops On His Arrest ...
9,2017-05-20 01:40:47,2017-05-20,6c7tqp,2017-05-21 02:11:08,20010,1 days 00:30:21,BedrockPerson,/r/WTF/comments/6c7tqp/expressing_a_tooth_absc...,emptyString,WTF,Expressing a tooth abscess.


In [5]:
# Per-subreddit number of scored posts and median score, both ordered by
# subreddit name descending. 'score' is selected *before* aggregating: this
# skips aggregating every other column, and median() over the text columns
# raises on recent pandas versions.
score_by_sub = items.groupby("subreddit")['score']
counts = score_by_sub.count().sort_index(ascending=False)
med_scores = score_by_sub.median().sort_index(ascending=False)

In [6]:
# Names of the subreddits whose count exceeds 100.
counts.loc[counts > 100].index

Index(['worldnews', 'videos', 'politics', 'pics', 'nba', 'mildlyinteresting',
       'me_irl', 'leagueoflegends', 'gaming', 'funny', 'aww', 'The_Donald',
       'SquaredCircle', 'Showerthoughts', 'Overwatch', 'DestinyTheGame',
       'AskReddit'],
      dtype='object', name='subreddit')

In [8]:
# Per-subreddit counts, largest first.
counts.sort_values(ascending=False)

subreddit
The_Donald            979
AskReddit             872
Showerthoughts        276
aww                   270
funny                 255
pics                  252
Overwatch             216
leagueoflegends       206
me_irl                178
videos                160
politics              159
nba                   153
gaming                148
mildlyinteresting     128
SquaredCircle         117
DestinyTheGame        108
worldnews             102
rupaulsdragrace       100
pcmasterrace           98
Jokes                  96
conspiracy             89
DotA2                  88
todayilearned          85
dankmemes              85
relationships          82
movies                 81
soccer                 80
gifs                   72
GlobalOffensive        70
BlackPeopleTwitter     69
news                   69
hiphopheads            67
PrequelMemes           65
anime                  58
MarchAgainstTrump      46
hockey                 46
OldSchoolCool          45
baseball               43
Cr

In [16]:
# Best/worst word report for every subreddit whose count exceeds 200.
# NOTE: get_diffs_for_sub is defined in the cell labelled In [15], which
# appears further down in this file -- that cell must be run first.
results = dict(
    (sub, get_diffs_for_sub(sub, 3, 5))
    for sub in counts[counts > 200].index
)

for sub_name, report in results.items():
    print("================")
    print("===== {0} - best".format(sub_name))
    print(report['best'])
    print("===== {0} - worst".format(sub_name))
    print(report['worst'])
    print("================")

===== pics - best
today     0.059471
when      0.054312
sunset    0.042980
we        0.042980
your      0.042980
pic       0.036807
some      0.034227
wanted    0.034227
pull      0.034227
mirror    0.034227
dtype: float64
===== pics - worst
like      -0.036254
thought   -0.030081
one       -0.030081
friend    -0.026488
out       -0.026488
people    -0.020315
here      -0.020315
really    -0.020315
sign      -0.020315
night     -0.020315
dtype: float64
===== leagueoflegends - best
community    0.315564
their        0.278915
your         0.263208
still        0.257972
they         0.242266
skt          0.237030
even         0.231794
into         0.226559
much         0.226559
http         0.226559
dtype: float64
===== leagueoflegends - worst
play      -0.040933
anyone    -0.035697
riot      -0.030462
can't     -0.025226
then      -0.025226
its       -0.019990
client    -0.019990
support   -0.019990
5         -0.014755
every     -0.014755
dtype: float64
===== funny - best
sign           

In [15]:
from collections import Counter, defaultdict
import re

def get_diffs_for_sub(sub, non_good_max, good_min):
    """Rank the words used in one subreddit's posts by how much more often
    they show up in "good" posts than in "non-good" ones.

    Posts come from the module-level `items` frame, filtered to `sub`. A post
    is labelled good when its score is >= good_min, non-good when its score
    is <= non_good_max, and is left out entirely when it is neither.

    Returns a dict with two Series taken from the word_diffs result:
    "best" (its first 10 entries) and "worst" (its last 10 entries,
    re-sorted ascending).
    """
    def label_for(score):
        # True -> good, False -> non-good, None -> unlabelled (dropped below).
        if score >= good_min:
            return True
        if score <= non_good_max:
            return False
        return None

    in_sub = items[items['subreddit'] == sub]
    labelled = in_sub.assign(good=in_sub['score'].apply(label_for))
    labelled = labelled.dropna(subset=['good'])

    # One (text, is_good) pair per post; the text is title and selftext joined.
    messages = [
        ("{0} {1}".format(title, selftext), is_good)
        for title, selftext, is_good in zip(
            labelled['title'], labelled['selftext'], labelled['good']
        )
    ]

    ranked = word_diffs(messages)
    return {"best": ranked.head(10), "worst": ranked.tail(10).sort_values()}


def tokenize(message):
    """Return the set of distinct tokens in `message`, minus stop words.

    The text is lowercased and split into runs of letters, digits and
    apostrophes; any token found in the stop-word set is discarded.
    """
    stop_words = {
        'or', 'at', 'got', 'l', 'my', 'me', 'does', 'get', 'were', 'what',
        'too', 'as', 's', 'an', 'than', 'do', 'so', 'no', 'it', 'how', 'be',
        'has', 'a', 'can', 'will', 'have', 'if', 'why', 'but', 'he', 'the',
        'and', 'on', 'all', 'is', 'emptystring', 'are', 'said', 'in', 'of',
        'just', 'that', 'i', 'with', 'was', 'r', 't', 'to', 'for', 'by',
        'you', 'there', 'not', 'from', 'this',
    }
    tokens = re.findall("[a-z0-9']+", message.lower())
    # The set comprehension both filters stop words and removes duplicates.
    return {token for token in tokens if token not in stop_words}


def count_words(messages):
    """Tally, for each word, how many good and non-good messages contain it.

    `messages` is an iterable of (message, is_good) pairs. The result maps
    each token produced by tokenize() to a two-element list
    [good_count, non_good_count].
    """
    good_slot, non_good_slot = 0, 1
    tallies = defaultdict(lambda: [0, 0])
    for text, is_good in messages:
        slot = good_slot if is_good else non_good_slot
        for word in tokenize(text):
            tallies[word][slot] += 1
    return tallies

def word_probabilities(counts, total_goodposts, total_non_goodposts, k=0.5):
    """Convert per-word counts into smoothed per-class frequencies.

    `counts` maps word -> (good_count, non_good_count). Each entry becomes a
    triplet (word, p(word | good), p(word | ~good)), where every estimate is
    (count + k) / (class_total + 2 * k).
    """
    def smoothed(count, class_total):
        return (count + k) / (class_total + 2 * k)

    triplets = []
    for word, (good, non_good) in counts.items():
        triplets.append((
            word,
            smoothed(good, total_goodposts),
            smoothed(non_good, total_non_goodposts),
        ))
    return triplets

def word_diffs(messages):
    """Score every word by p(word | good) - p(word | non-good).

    `messages` is a list of (message, is_good) pairs. Words are counted with
    count_words() and turned into smoothed frequencies by
    word_probabilities() using k = 0.5.

    Returns a float Series indexed by word, sorted from the most
    good-leaning word to the most non-good-leaning one. The order among
    equal scores is not specified. When no words are counted at all (for
    example `messages` is empty) an empty Series is returned instead of
    raising.
    """
    num_goodposts = sum(1 for _, is_good in messages if is_good)
    num_non_goodposts = len(messages) - num_goodposts

    k = 0.5  # smoothing constant handed to word_probabilities
    word_counts = count_words(messages)
    word_probs = word_probabilities(word_counts,
                                    num_goodposts,
                                    num_non_goodposts,
                                    k)

    # Building from a dict with an explicit dtype keeps the "nothing counted"
    # case a well-formed empty float Series; the previous zip(*...) unpacking
    # raised ValueError there.
    diffs = pd.Series(
        {word: p_good - p_bad for word, p_good, p_bad in word_probs},
        dtype='float64',
    )
    return diffs.sort_values(ascending=False)