In [1]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
# NLP helpers ----
from nltk.corpus import stopwords

lemmatizer = nltk.stem.WordNetLemmatizer()
# English stop words plus the extra token "amp"
# (presumably residue of the HTML entity "&amp;" -- TODO confirm).
STOP = stopwords.words('english') + ["amp"]

# Paths ----
MALLET_PATH = "~/mallet/bin/mallet"
USER_POS_PATH = "../data/user_positive.csv"
USER_SQL_PATH = "../data/user_sqlite.csv"
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"
output_directory_path = "../data/output/pos-output"

# Topic-model settings ----
# Candidate numbers of topics (k); one model is handled per value.
num_top = [5, 10, 15, 20]

## Data Preprocessing

In [2]:
# Positive narrative dataset grouped by user ----
positive_nar = pd.read_csv(NAR_POS_PATH)
positive_users = pd.read_csv(USER_POS_PATH)

# These columns are parsed from their string form back into Python objects
# (one container of per-post values per user row).
for list_column in ('selftext', 'created_utc', 'title'):
    positive_users[list_column] = positive_users[list_column].apply(ast.literal_eval)

print("Number of users with a narrative positive post:", len(positive_users))

# Keep only users with at least 2 and fewer than 50 entries in 'created_utc'.
posts_per_user = positive_users['created_utc'].apply(len)
has_enough_posts = (posts_per_user >= 2) & (posts_per_user < 50)
positive_users = positive_users[has_enough_posts].reset_index(drop=True)
print("Number of users with at least 2 narrative positive posts:", len(positive_users))

Number of users with a narrative positive post: 5161
Number of users with at least 2 narrative positive posts: 901


In [3]:
# Generating training data and groupings ----
# training_data: one processed document per post, in user order, skipping
#                posts that are empty after lmw.process_string.
# groupings:     for each user (row of positive_users), the positions of that
#                user's documents inside training_data.
# Both are built in a single pass so that every index stored in groupings
# refers to the *filtered* training_data. Counting posts separately from the
# filtering makes the indices drift (or run past the end) as soon as one post
# is dropped.
training_data = []
groupings = []
kept_flags = []     # per user: True for each post that was kept
for posts in positive_users['selftext']:
    doc_indices = []
    user_flags = []
    for post in posts:
        processed = lmw.process_string(post)
        is_kept = bool(processed.strip())
        user_flags.append(is_kept)
        if is_kept:
            doc_indices.append(len(training_data))
            training_data.append(processed)
    groupings.append(doc_indices)
    kept_flags.append(user_flags)

# If any post was dropped, drop it from the parallel per-post columns as well,
# so every per-user list keeps one entry per document. This is a no-op when
# nothing was dropped, and re-running the cell gives the same result.
if not all(all(user_flags) for user_flags in kept_flags):
    for column in ('selftext', 'created_utc', 'title'):
        positive_users[column] = [
            [value for value, is_kept in zip(values, user_flags) if is_kept]
            for values, user_flags in zip(positive_users[column], kept_flags)
        ]

## Topic Modeling
- Training data:
    + All posts in the database where the users post at least twice and fewer than 50 times.
    + Each document represents a post. 
    

### Training 
- Do not run this section if you only want to run the analyses below.

In [4]:
# Training ----
# Run lmw_training once for every candidate number of topics in num_top,
# passing the shared output directory and the processed documents.
for n_topics in num_top:
    lmw_training(n_topics, output_directory_path, training_data)

Importing data...
Complete
Training topic model...


Mallet LDA: 5 topics, 3 topic bits, 111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -8.64484
<20> LL/token: -8.37843
<30> LL/token: -8.25695
<40> LL/token: -8.19936

0	1	get want know feel need going time life work years recovery help anyone like purging really since even better things 
1	1	eat food NUM eating day like feel ate binge today calories meal much even foods hungry days get still meals 
2	1	like feel body want even look never know weight always fat hate people see lose much fucking think get still 
3	1	NUM weight eating recovery know months back gain still much really lost last healthy past gained days ago restricting time 
4	1	really told said didn know people one would eating also think got time made disorder family thought thing kind like 

<50> LL/token: -8.16095
<60> LL/token: -8.13755
<70> LL/token: -8.11575
<80> LL/token: -8.10185
<90> LL/token: -8.098

0	1	get want know life time need feel recovery really years going help work like pur

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -8.96868
<20> LL/token: -8.68426
<30> LL/token: -8.5288
<40> LL/token: -8.45314

0	0.5	want know recovery treatment get time better really years much life tell health recover NUM told see going like said 
1	0.5	eat food eating like day feel today meal ate much hungry binge days foods meals even full going dinner stomach 
2	0.5	recovery life time years things would still back really one people anyone disorder many past long restriction able mental advice 
3	0.5	weight body like gain even back really look much know lose lost underweight fat didn still eating healthy would little 
4	0.5	NUM food would night one day made didn home first bed time good thought felt ate foods little went water 
5	0.5	NUM weight eating calories eat much months feel since started back want day lose know still restricting bmi gained lost 
6	0.5	like feel people body look eating never think even w

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -9.099
<20> LL/token: -8.78217
<30> LL/token: -8.62755
<40> LL/token: -8.55075

0	0.33333	weight body feel know recovery still eating like gain want underweight back months healthy really gaining years lost lot bmi 
1	0.33333	binge food purging binging purge stop NUM years bulimia times restricting back cycle want b/p like restrict restriction control would 
2	0.33333	eat like feel eating ate night days hunger hungry morning food really stomach feeling meal didn today lot water little 
3	0.33333	like feel people want body look hate never see life always even fat think way one know better much skinny 
4	0.33333	feel really time eating eat much think like day lot know even days thinking haven get week going getting stomach 
5	0.33333	NUM since help time really recovery need therapist treatment weeks last also ago one finally started went week parents want 
6	0.33333	said 

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -9.15295
<20> LL/token: -8.8356
<30> LL/token: -8.69748
<40> LL/token: -8.61648

0	0.25	NUM going day today ate night food know went one foods binge cream want last fucking cake ice tomorrow days 
1	0.25	people like feel want one hate never wish much fucking ever around always bad point shit everyone feels fat could 
2	0.25	NUM weight want eat feel day know eating much stop days even like lose purging since gain week restricting binge 
3	0.25	know like feel want anyone really help since even years one anything life well ever past else struggling need lot 
4	0.25	NUM weight eating calories started time months underweight would lost ago since never much years gaining around loss start lot 
5	0.25	like eat really eating feel also day get much idk food bad trying lot something still time even always things 
6	0.25	know think like eating really didn want something small wou

Complete


In [5]:
# Examining all topics (top_view.txt) ----
# For every k in num_top, write a header line followed by one line per topic
# (topic id, a tab, and the first ten topic words), then a blank line.
top_view_path = output_directory_path + "/top_view.txt"
with open(top_view_path, 'w') as out_file:
    for k in num_top:
        topic_keys = lmw.load_topic_keys(f"{output_directory_path}/mallet.topic_keys.{k}")
        out_file.write(f"Topics for k={k}\n")
        out_file.writelines(
            f"{topic_id}\t{' '.join(words[:10])}\n"
            for topic_id, words in enumerate(topic_keys)
        )
        out_file.write('\n')

### Topic Analysis

Following Akoury 2020, we examine common local topic transitions between entries written by the same user across time. We compute the transition probability from topic A to topic B by counting how many times A and B are the most probable topics for two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table in the paper: Topics with the highest relative importance, which illustrate the diversity of the dataset.

In [6]:
num_topics = 10             # Change this to view different output files

# Viewing topics ----
topic_keys = lmw.load_topic_keys(f"{output_directory_path}/mallet.topic_keys.{num_topics}")
# Label every topic with its first ten words joined by single spaces.
topic_label = [' '.join(words[:10]) for words in topic_keys]
topic_label

['NUM treatment get work need year time therapist last back',
 'eat food eating day meal hungry like meals calories binge',
 'life recovery people body self nNUM things would every love',
 'look body like weight fat hate see skinny people always',
 'NUM eat ate chocolate foods sugar cream food cake one',
 'NUM weight gain back eating months lose lost still body',
 'like know feel want really even eating get think never',
 'said told didn mom family made got would one went',
 'doctor blood take heart symptoms hospital taking doctors medication pain',
 'like NUM feel day want get today time days eat']

In [7]:
topic_distributions = lmw.load_topic_distributions(output_directory_path + "/mallet.topic_distributions." + str(num_topics))

# The indices stored in groupings are positions in topic_distributions.
# If the loaded file has a different number of documents (e.g. it comes from
# a model trained on other data), the lookups below would either fail with an
# opaque IndexError or silently attach the wrong distribution to a post.
expected_docs = sum(len(group) for group in groupings)
if expected_docs != len(topic_distributions):
    raise ValueError(
        "groupings reference " + str(expected_docs) + " documents but "
        + str(len(topic_distributions)) + " topic distributions were loaded; "
        "the topic model and the current data are out of sync."
    )

# Get the most probable topic and topic distribution for each document ----
# most_prob[d]: index of the first maximum in document d's distribution
# topics[d]:    document d's full distribution
most_prob = [dist.index(max(dist)) for dist in topic_distributions]
topics = list(topic_distributions)

# Get the most probable topic and topic distribution for each document (grouped by user) ----
most_prob_grouped = [[most_prob[i] for i in group] for group in groupings]
prob_grouped = [[topics[i] for i in group] for group in groupings]

positive_users['Topic Distribution'] = prob_grouped

In [8]:
# One row per post: explode the parallel per-post list columns together.
# The frame is kept in `exploded` and written out in a separate step, because
# DataFrame.to_csv returns None when given a path; chaining it onto the
# assignment would bind `exploded` to None.
exploded = (
    positive_users
    .explode(['selftext', 'created_utc', 'title', 'Topic Distribution'])
    .reset_index(drop=True)
)
exploded.to_csv("../data/positive_topic_dist.csv")

In [9]:
# Local topic transition between entries within the same user ----
# For each user, walk over consecutive pairs of entries (A, B) and count how
# often topic B directly follows topic A. The counts are later normalized by
# the number of times topic A is the most probable topic of the first entry
# of such a pair.

# Get the transition counts for each user ----
# transition_count: List of dictionaries {topic A: {topic B: count of A -> B}}
# topic_count: List of dictionaries {topic A: count of pairs starting with A}
transition_count, topic_count = [], []
for user_topics in most_prob_grouped:
    transition, topic = {}, {}
    # zip pairs every entry with its successor; the last entry has none.
    for source, target in zip(user_topics, user_topics[1:]):
        followers = transition.setdefault(source, {})
        followers[target] = followers.get(target, 0) + 1
        topic[source] = topic.get(source, 0) + 1
    transition_count.append(transition)
    topic_count.append(topic)

In [10]:
# Normalize the transition probability ----
# For every user, divide each A -> B count by that user's count for topic A.
# Result: List of dictionaries {topic A: {topic B: count(A -> B) / count(A)}}
transition_prob_norm = [
    {
        source: {target: count / totals[source] for target, count in followers.items()}
        for source, followers in counts.items()
    }
    for counts, totals in zip(transition_count, topic_count)
]

In [11]:
# Get the most probable transition for each user ----
# max_top: List of dictionaries {topic A: [topics B tied for the highest
# transition probability from A]}, ties kept in dictionary order.
max_top = []
for user_probs in transition_prob_norm:
    best_next = {}
    for source, next_probs in user_probs.items():
        highest = max(next_probs.values(), default=0)
        best_next[source] = [target for target, prob in next_probs.items() if prob == highest]
    max_top.append(best_next)


# Reformat max_top so that it is a list of lists of (topic A, topic B) tuples ----
all_users = [
    [(source, target) for source, targets in best_next.items() for target in targets]
    for best_next in max_top
]
# Flatten to a single list of tuples across all users.
all_users_expanded = [pair for user_pairs in all_users for pair in user_pairs]

In [12]:
# Get the most frequent transition across all users ----
# max_top_freq: {topic A: {topic B: number of (A, B) tuples in all_users_expanded}}
max_top_freq = {}
for source, target in all_users_expanded:
    target_counts = max_top_freq.setdefault(source, {})
    target_counts[target] = target_counts.get(target, 0) + 1

# Get the value with the highest count for each topic ----
# Topic     Most Frequent Transition
for source, target_counts in max_top_freq.items():
    print(source, max(target_counts, key=target_counts.get))


4 6
6 6
3 6
5 6
7 6
1 6
9 6
2 6
8 6
0 6


In [13]:
# Antoniak (2019)'s time series analysis ----

# Documents are identified by their position in training_data (as a string).
# The positions are taken directly from the ranking instead of being looked
# up with training_data.index(text): that lookup returns the first match, so
# posts with identical processed text would all be reported under one index,
# and it rescans the list for every document.
top_n = 10
num_docs = min(len(training_data), len(topic_distributions))

# For each topic, get the top_n documents with the highest topic probability.
# Ordering: probability descending, ties broken by document text descending
# (believed to match the order lmw.get_top_docs yields -- TODO confirm).
ranking_doc = {}
for i in range(num_topics):
    ranked = sorted(
        range(num_docs),
        key=lambda idx: (topic_distributions[idx][i], training_data[idx]),
        reverse=True,
    )
    ranking_doc[topic_label[i]] = [str(idx) for idx in ranked[:top_n]]

# Get the position of each document in the ranking
# ranking_pos: {document index: [(topic label, position in that topic's ranking)]}
ranking_pos = {}
for label, docs in ranking_doc.items():
    for position, doc in enumerate(docs):
        ranking_pos.setdefault(doc, []).append((label, position))

# Order each document's entries by position, and the documents by index.
ranking_pos = {k: sorted(v, key=lambda x: x[1]) for k, v in ranking_pos.items()}
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda item: int(item[0])))

# For each topic, get its probability of appearing in each document,
# as (document index, probability) pairs ordered by document index.
ranking_topic_prob = {
    topic_label[i]: [(str(idx), topic_distributions[idx][i]) for idx in range(num_docs)]
    for i in range(num_topics)
}