## Dependencies:

B0. Condolence Topic Modeling

In [None]:
# --- Input locations -------------------------------------------------------
# Relative paths are resolved against the notebook's working directory.

# Files under ../working-dir/parsed/classified_comments/
COMMENTS_FILE = "../working-dir/parsed/classified_comments/sample_total_comments_2018.tsv"
POST_METADATA_FILE = "../working-dir/parsed/classified_comments/post_metadata_total_2018.tsv"
FLATTENED_TREES_FILE = "../working-dir/parsed/classified_comments/flattened_comment_trees_all_2018.tsv"

# Topic-model output for the condolence-seeking comments.
GRIEF_TOPICS_PATH = "../working-dir/mallet/seeking_topics.txt"

# LIWC 2015 dictionary.
# NOTE(review): absolute, machine-specific path; adjust for the local environment.
LIWC_DICT_FILE = "/home/REDACTED/LIWC2015_English.dic"

In [None]:
import datetime
from collections import defaultdict

import ujson
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()
# Notebook magics: interactive matplotlib backend, then the autoreload and
# ipycache extensions.
%matplotlib notebook
%load_ext autoreload
%load_ext ipycache

# Autoreload mode 2: reload all modules before executing code, so edits to
# local modules are picked up without restarting the kernel.
%autoreload 2

# Seed NumPy's global random number generator so random draws are repeatable.
np.random.seed(0xB1AB)

In [None]:
from util import read_labeled_output

# Load the labelled comment sample. Downstream cells use `c` as the condolence
# comments and `cs` as the condolence-seeking ("grief") comments; the meaning
# of `year` and `both` is not visible in this notebook.
year, c, cs, both = read_labeled_output(COMMENTS_FILE)

In [None]:
# Topic table for the condolence comments, indexed by comment id.
# The file has no header row that is used here: the columns are named
# "index", "id" and then "0" .. "19" (one column per topic).
# NOTE(review): presumably MALLET doc-topics output; confirm the file layout.
_topic_columns = [str(topic_number) for topic_number in range(20)]
c_topics = pd.read_csv(
    "./mallet/condolences_topics.txt",
    sep="\t",
    names=["index", "id"] + _topic_columns,
    index_col="id",
)

In [None]:
def read_trees(fname):
    """Read a tab-separated tree file into a ``{parent: [children...]}`` dict.

    Each line is stripped of surrounding whitespace and split on tabs; the
    first field is the parent key and the remaining fields are its children.
    If a parent appears on several lines, the last line wins.
    """
    with open(fname, "r") as handle:
        rows = (raw_line.strip().split("\t") for raw_line in handle)
        return {fields[0]: fields[1:] for fields in rows}
# Load the flattened comment trees: first field of each line -> list of the remaining tab-separated fields.
tree = read_trees(FLATTENED_TREES_FILE)

In [None]:
# Load the replies to condolence comments, one JSON object per line (despite
# the .tsv extension), and index the resulting table by comment id.
# The file is opened in a `with` block so the handle is closed once all lines
# have been parsed.
# NOTE(review): this path is relative to "./parsed", whereas the config cell
# points at "../working-dir/parsed"; confirm which location is intended.
with open("./parsed/classified_comments/replies_to_condolence_total.tsv", "r") as replies_file:
    replies = pd.DataFrame([ujson.loads(line.strip()) for line in replies_file])
replies = replies.set_index("id")

In [None]:
# Substrings whose presence marks a reply as a positive response.
phrases = [
    "thanks",
    "thank you",
    "i appreciate",
    "crying just reading this",
    "made my day",
]


def _contains_positive_phrase(text):
    """Return True when any entry of `phrases` occurs as a substring of `text`."""
    return any(phrase in text for phrase in phrases)


# Matching is case-insensitive: bodies are lower-cased before the substring test.
_lowered_bodies = replies.body.str.lower()
replies.loc[:, "positive_response"] = _lowered_bodies.apply(_contains_positive_phrase)

In [None]:
# Id sets for the three comment tables.
grief_ids = set(cs.index)
condolence_ids = set(c.index)
# NOTE: this rebinds `both` (previously returned by read_labeled_output) to the
# set of ids present in both the grief and the condolence tables.
both = grief_ids.intersection(condolence_ids)

reply_ids = set(replies.index)

# Column -> {comment id -> value} lookup tables, one per comment table.
# `condolence_dict` and `grief_dict` are used below and in later cells but were
# not defined in the visible notebook; they are built here the same way as
# `reply_dict` so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): assumes `c` has a "parent_id" column like `replies` does.
condolence_dict = c.to_dict()
grief_dict = cs.to_dict()
reply_dict = replies.to_dict()

# condolence id -> id of the grief comment it replies to
condolence_to_grief_map = dict()
# reply id -> id of the condolence comment it replies to
reply_to_condolence_map = dict()

# A parent_id starting with "t3_" is skipped (on Reddit that prefix denotes a
# post rather than a comment); otherwise the 3-character prefix is stripped to
# obtain the parent comment id.
for i, c_id in enumerate(condolence_ids):
    if i % 100_000 == 0:
        print(i)  # progress indicator
    if condolence_dict["parent_id"][c_id][:3] == "t3_":
        continue
    parent_comment = condolence_dict["parent_id"][c_id][3:]
    if parent_comment in grief_ids:
        condolence_to_grief_map[c_id] = parent_comment

for i, r_id in enumerate(reply_ids):
    if i % 100_000 == 0:
        print(i)  # progress indicator
    if reply_dict["parent_id"][r_id][:3] == "t3_":
        continue
    parent_comment = reply_dict["parent_id"][r_id][3:]
    if parent_comment in condolence_ids:
        reply_to_condolence_map[r_id] = parent_comment

In [None]:
def is_reply_from_op(row):
    """Return True when a reply was written by the author of the grief comment.

    `row` is one row of `replies` (its `name` is the reply id). The reply is
    followed to its condolence via `reply_to_condolence_map`, then to the grief
    comment via `condolence_to_grief_map`; the grief comment's author is
    compared with `row.author`. Returns False when either link is missing.
    """
    reply_id = row.name
    if reply_id in reply_to_condolence_map:
        condolence_id = reply_to_condolence_map[reply_id]
        if condolence_id in condolence_to_grief_map:
            grief_id = condolence_to_grief_map[condolence_id]
            return grief_dict["author"][grief_id] == row.author
    return False


# Row-wise flag: did the original (grief) poster write this reply?
replies.loc[:, "from_op"] = replies.apply(is_reply_from_op, axis=1)

In [None]:
# Ids of condolences that received a positive reply from the original poster:
# take the parent_id of each such reply and drop its 3-character prefix.
_thanked_by_op = replies.from_op & replies.positive_response
good_condolences = {parent_fullname[3:] for parent_fullname in replies[_thanked_by_op].parent_id}

In [None]:
# Disabled approach: there aren't a lot of good condolences, so we add them all in, and randomly sample other ones until we hit 1 million.
# random_id_sample = good_condolences.union(random.sample(set(condolence_to_grief_map.keys()) - good_condolences, k=(1_000_000 - len(good_condolences))))

# Since there aren't a lot of condolence comments that are in reply to a condolence-seeking comment, we use all of them.
# So "random_id_sample" isn't actually random: it is a view over every key of condolence_to_grief_map.
random_id_sample = condolence_to_grief_map.keys()

In [None]:
import liwc
import re
# Load the LIWC dictionary. `parse` is called with a single token and iterated
# for that token's category names (see get_liwc); `category_names` is not used
# in the visible code.
parse, category_names = liwc.load_token_parser(LIWC_DICT_FILE)
def tokenize(text):
    """Lazily yield each maximal run of word characters found in `text`.

    This is a deliberately simple regex tokenizer; a smarter one may be
    preferable.
    """
    yield from (found.group(0) for found in re.finditer(r'\w+', text, re.UNICODE))

        
def get_liwc(body):
    """Count LIWC pronoun-category hits in a comment body.

    The body is lower-cased and tokenized with `tokenize`; every category that
    `parse` reports for a token is tallied into one of three groups.

    Returns:
        A tuple ``(first_person, second_person, third_person)`` of hit counts.
    """
    first_person_labels = ("i (I)", "we (We)")
    second_person_labels = ("you (You)",)
    third_person_labels = ("shehe (SheHe)", "they (They)")

    first_hits = second_hits = third_hits = 0
    for token in tokenize(body.lower()):
        for label in parse(token):
            if label in first_person_labels:
                first_hits += 1
            elif label in second_person_labels:
                second_hits += 1
            elif label in third_person_labels:
                third_hits += 1
    return first_hits, second_hits, third_hits


from nltk.tokenize import word_tokenize

def get_ling_accommodation(parent_id, child_id):
    """Return the share of the child's distinct tokens that also occur in the parent.

    Both comments are lower-cased and tokenized with ``word_tokenize``; the
    score is ``len(parent_tokens & child_tokens) / len(child_tokens)`` over the
    sets of distinct tokens.

    Args:
        parent_id: key into ``grief_dict["body"]`` (the parent comment).
        child_id: key into ``condolence_dict["body"]`` (the replying comment).

    Returns:
        A float between 0 and 1; 0.0 when the child comment yields no tokens.
    """
    parent_comment = grief_dict["body"][parent_id].lower()
    child_comment = condolence_dict["body"][child_id].lower()
    parent_words = set(word_tokenize(parent_comment))
    child_words = set(word_tokenize(child_comment))
    if not child_words:
        # A reply with no tokens would otherwise raise ZeroDivisionError.
        return 0.0
    return len(parent_words.intersection(child_words)) / len(child_words)

In [None]:
# Features assembled in this cell (one row per condolence id in `random_id_sample`):
#   - comment length, per-thread link id, author gender flags, post metadata
#   - temporal features
#   - topic columns joined from `c_topics`
#   - age of the comment relative to its parent grief comment
#   - linguistic alignment with the parent grief comment
#   - LIWC pronoun counts
#   - the outcome label `good_condolence`
# Speech acts were listed as a planned feature but are not computed here.
#
# NOTE(review): `get_gender_scores` and `post_data` are not defined in the
# visible part of this notebook; confirm they exist before this cell runs.

dataset = c[["subreddit", "link_id", "body", "score", "created_utc", "author"]].loc[random_id_sample]
dataset = dataset.drop_duplicates()

print("Calculating Length")
# Length in whitespace-delimited words
dataset.loc[:, "length"] = dataset.body.apply(lambda x: len(x.split()))

print("Filter Link IDs")
# Keep the link id only for threads with more than 50 sampled comments;
# every other thread is pooled under the placeholder "dummy".
counts = dataset.link_id.value_counts(ascending=False)
links = set((counts[counts > 50]).index)
dataset.loc[:, "filtered_link_id"] = dataset.loc[:, "link_id"].apply(lambda x: x if x in links else "dummy")

print("Gender Scores")
gender_scores = get_gender_scores(dataset.author)
# Scores below 0.1 are flagged male, above 0.9 female; anything in between gets neither flag.
dataset.loc[:, "is_male"] = gender_scores < 0.1
dataset.loc[:, "is_female"] = gender_scores > 0.9

print("Post Features")
# Drop the 3-character prefix from link_id to obtain the key used to join `post_data`.
dataset.loc[:, "link_id_stripped"] = dataset.loc[:, "link_id"].apply(lambda x: x[3:])
dataset = dataset.join(post_data, on="link_id_stripped", rsuffix="_post")

print("Temporal")
# Convert timestamps to US/Eastern. The columns are replaced with
# `dataset[col] = ...` rather than `dataset.loc[:, col] = ...`: on recent
# pandas the `.loc` form writes into the existing column in place and would
# keep the original timezone, leaving hour/weekday in the original zone.
dataset["created_utc"] = dataset.created_utc.dt.tz_convert("US/Eastern")
dataset["created_utc_post"] = dataset.created_utc_post.dt.tz_convert("US/Eastern")
dataset.loc[:, "hour"] = dataset.created_utc.dt.hour
dataset.loc[:, "month"] = dataset.created_utc.dt.month
dataset.loc[:, "day_of_month"] = dataset.created_utc.dt.day
dataset.loc[:, "weekday"] = dataset.created_utc.dt.weekday

print("Topics")
# `c_topics` must already be loaded; its columns are joined on the comment id index.
dataset = dataset.join(c_topics)

# print("Comment Depth")
# dataset.loc[:, "depth"] = np.array([id_to_depth[condolence_to_grief_map[c_id]] + 1 for c_id in dataset.index])

# Comment age in hours, measured from the parent grief comment's timestamp.
# The parent timestamp is looked up per row; the previous version indexed with
# a single leftover `c_id`, so every row was compared to the same grief comment.
# Alternative (age relative to the post instead):
# dataset.loc[:, "comment_age"] = ((dataset.loc[:, "created_utc"] - dataset.loc[:, "created_utc_post"]) / np.timedelta64(1, 'h'))
parent_created_utc = pd.Series(
    [grief_dict["created_utc"][condolence_to_grief_map[c_id]] for c_id in dataset.index],
    index=dataset.index,
)
dataset.loc[:, "comment_age"] = ((dataset.loc[:, "created_utc"] - parent_created_utc) / np.timedelta64(1, 'h'))

print("Linguistic Alignment")
dataset.loc[:, "ling_align"] = np.array([get_ling_accommodation(condolence_to_grief_map[c_id], c_id) for c_id in dataset.index])

print("LIWC Features")
# get_liwc returns a (first, second, third) tuple per body; zip(*...) transposes
# those tuples into one sequence per column.
dataset.loc[:, "liwc_first_person"], dataset.loc[:, "liwc_second_person"], dataset.loc[:, "liwc_third_person"] = zip(*dataset.body.apply(get_liwc))


print("Response Value")
# Outcome label: True when the comment id is in `good_condolences`.
dataset.loc[:, "good_condolence"] = dataset.index.isin(good_condolences)

In [None]:
dataset.to_csv("../data/regression/condolence_input.csv")