In [44]:
def read_dat(filename):
    """Lazily yield one decoded JSON value per line of ``filename``.

    Parsing is best-effort: blank lines and lines that are not valid JSON
    are skipped silently instead of aborting the whole read.

    Args:
        filename: Path of a text file holding one JSON document per line.

    Yields:
        The Python object decoded from each parseable line, in file order.
    """
    with open(filename, 'r', newline='') as f:
        for line in f:
            # Iterated lines keep their terminator, so a bare truthiness test
            # never detects an empty line; compare the stripped text instead.
            if not line.strip():
                continue
            # Decode inside the try but yield outside it.  A bare ``except``
            # around the ``yield`` swallows the GeneratorExit raised when the
            # consumer stops early, which ends in "RuntimeError: generator
            # ignored GeneratorExit" (and it also hides KeyboardInterrupt).
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                continue
            yield record

In [45]:
# Location of the line-delimited JSON dump, relative to the notebook directory.
dat_file = "../data/life_cos1ne.dat"
# Drain the lazy reader so every parsed record is held in memory as a list.
data = [record for record in read_dat(dat_file)]

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the TF-IDF model first: drop English stop words and keep at most
# 100 features in the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
# One document per loaded record: the text stored under its "body" key.
corpus = [record["body"] for record in data]
# Learn the vocabulary and build the document-term matrix in a single pass.
X = vectorizer.fit_transform(corpus)

In [47]:
# scikit-learn 1.0 deprecated ``get_feature_names`` (removed in 1.2) in favour
# of ``get_feature_names_out``.  Prefer the new accessor and fall back to the
# old one so the cell still runs on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# A set gives O(1) membership tests on the retained vocabulary terms.
important_words = set(feature_names)
print(important_words)

{'non', 'believe', 'isn', 'like', 'life', 'people', 'exist', 'cincinnati', 'soul', 'thing', 'means', 'evidence', 've', 'actually', 'gt', 'true', 'year', 'universe', 'based', 'yes', 'make', 'possible', 'right', 'okay', 'aren', 'oh', 'way', 'evil', 'don', 'going', 'know', 'religious', 'reason', 'moral', 'sin', 'new', 'think', 'best', 'does', 'game', 'understand', 'll', 'christian', 'mean', 'person', 'need', 'matter', 'things', 'big', 'body', 'heaven', 'question', 'fact', 'good', 'religion', 'better', 'human', 'yeah', 'years', 'church', 'vote', 'really', 'world', 'place', 'just', 'different', 'act', 'bible', 'saying', 'said', 'en', 'come', 'sex', 'doesn', 'teams', 'won', 'feel', 'free', 'http', 'wouldn', 'belief', 'want', 'team', 'marriage', 'hell', 'say', 'east', 'time', 'point', 'god', 'www', 'did', 'use', 'conference', 'jesus', 'wrong', 'state', 'didn', 'catholic', 'man'}


In [60]:
import time
import datetime
# Keep one uniformly random comment per calendar month, keyed by "YY-MM".
counts = dict()         # month key -> eligible comments seen so far
filtered_data = dict()  # month key -> comment currently selected for that month
for datum in data:
    # Skip bodies that begin with an HTML-escaped ">" (presumably comments that
    # open by quoting someone else -- confirm against the data).
    if datum["body"].startswith("&gt"):
        continue
    # Convert the epoch-seconds value straight to UTC.  Round-tripping through
    # time.gmtime() -> time.mktime() -> fromtimestamp() reinterprets the UTC
    # fields as local standard time and then converts back with DST applied,
    # which can push a comment near a month boundary into the wrong bucket.
    dt = datetime.datetime.fromtimestamp(int(datum["timeLong"]), tz=datetime.timezone.utc)
    toStr = dt.strftime("%y-%m")
    if toStr not in filtered_data:
        filtered_data[toStr] = datum
        counts[toStr] = 1
    else:
        # Reservoir sampling with a reservoir of size one: the n-th eligible
        # comment of a month replaces the current pick with probability 1/n
        # (randint's upper bound is exclusive), so every comment is equally
        # likely to end up selected.
        counts[toStr] += 1
        if np.random.randint(0, counts[toStr]) == 0:
            filtered_data[toStr] = datum

In [61]:
def get_ratios(d):
    """Return each subreddit's comment count as a percentage of the busiest one.

    Args:
        d: Iterable of mappings, each with a "subreddit" key.

    Returns:
        dict mapping subreddit name to ``100 * count / max_count`` (a float),
        in first-seen order.  The most frequent subreddit maps to 100.0.
        An empty input yields an empty dict.
    """
    counts = dict()
    for datum in d:
        sub = datum["subreddit"]
        counts[sub] = counts.get(sub, 0) + 1
    # Nothing to normalise against; also avoids max() on an empty sequence.
    if not counts:
        return counts
    most_counts = max(counts.values())
    # Build a fresh mapping rather than rewriting the dict while iterating it.
    return {sub: 100*count/most_counts for sub, count in counts.items()}

In [62]:
# Relative activity per subreddit, keeping only those whose comment count
# exceeds 1% of the most active subreddit's count.
sub_ratios = get_ratios(data)
sub_ratios = dict(
    (name, pct) for name, pct in sub_ratios.items() if pct > 1
)
print(sub_ratios)

{'DebateReligion': 100.0, 'CFB': 13.38097239736584, 'WTF': 1.120919153706039, 'nfl': 9.205548549810844, 'masseffect': 4.735883424408015, 'Catholicism': 13.801317080005605, 'funny': 1.7094017094017093, 'philosophy': 1.9475970295642426, 'politics': 5.646630236794171, 'Christianity': 16.32338517584419, 'socialism': 2.269861286254729, 'todayilearned': 2.648171500630517, 'MapPorn': 2.844332352529074, 'AskReddit': 1.3310914950259212, 'bengals': 1.5132408575031526, 'soccer': 2.73224043715847, 'polandball': 1.0228387277567605, 'worldbuilding': 1.092896174863388, 'FantasyRealignment': 3.656998738965952, 'AmericanAthletic': 1.008827238335435, 'kingme': 1.6253327728737565, 'SandersForPresident': 6.0109289617486334, 'MLS': 1.7094017094017093, 'FCCincinnati': 2.4099761804679836, 'USLPRO': 2.3959646910466583}


In [63]:
# One "<key>: <comment body>" line per entry of filtered_data, in sorted key order.
body = "\n".join(
    month + ": " + filtered_data[month]["body"]
    for month in sorted(filtered_data)
)

In [64]:
# Show the assembled text: one sampled comment per line, prefixed by its key.
print(body)

12-01: I agree with you the last bit (religion promoting illegal activity) is a ridiculous claim made by the Bishop to appeal to conservatives.
12-02: You are using it correctly.
12-03: Why don't you try out new classes?
12-04: Haha this is true and not ragging on you at all it's just I've found when *one* mentions downvotes it attracts downvotes and when *someone else* mentions downvotes it attracts upvotes.
12-05: No he can't it's a logical impossibility.
12-06: God does not rule over us because he created us.
12-07: Yes pretty much.
12-08: This is handled by the Catechism.
12-09: Allow me to explain further.
12-10: Well for the Catholic Church will consider this a bad thing as it forces Catholic institutions to commit a sin by at least implicitly condoning birth control.
12-11: He means according to this scenario, not according to how it is.
12-12: In my view they've always been unacceptable, but I have the benefit of Centuries of understanding and humanization that they did not hav