In [12]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords

# WordNet lemmatizer instance shared by the notebook
lemmatizer = nltk.stem.WordNetLemmatizer()

# NLTK English stop-word list extended with the token "amp"
# (presumably residue of the HTML entity "&amp;" in scraped text -- TODO confirm)
STOP = stopwords.words('english') + ["amp"]

# Location of the MALLET executable and of the input CSV files
MALLET_PATH = "~/mallet/bin/mallet"
USER_POS_PATH = "../data/user_positive.csv"
USER_SQL_PATH = "../data/user_sqlite.csv"
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"

## Data Preprocessing

In [13]:
# Positive narrative dataset grouped by user ----
# Loads the classified narrative posts and the per-user table, parses the
# list-valued columns, and keeps only users whose number of entries lies in
# [MIN_POSTS_PER_USER, MAX_POSTS_PER_USER).
MIN_POSTS_PER_USER = 2     # inclusive lower bound on entries per user
MAX_POSTS_PER_USER = 50    # exclusive upper bound on entries per user

positive_nar = pd.read_csv(NAR_POS_PATH)
positive_users = pd.read_csv(USER_POS_PATH)

# List-valued columns are stored in the CSV as Python literals; parse them back into lists
for list_col in ('selftext', 'created_utc'):
    positive_users[list_col] = positive_users[list_col].apply(ast.literal_eval)

# All of a user's posts joined into a single string
positive_users['merged'] = positive_users['selftext'].apply(' '.join)
print("Number of users with a narrative positive post:", len(positive_users))

# The length of 'created_utc' is used as the per-user entry count
# (presumably one timestamp per post -- TODO confirm against the CSV export)
entries_per_user = positive_users['created_utc'].apply(len)
within_bounds = (entries_per_user >= MIN_POSTS_PER_USER) & (entries_per_user < MAX_POSTS_PER_USER)
positive_users = positive_users[within_bounds].reset_index(drop=True)
print(
    "Number of users with at least {} and fewer than {} narrative positive posts:".format(
        MIN_POSTS_PER_USER, MAX_POSTS_PER_USER
    ),
    len(positive_users),
)

Number of users with a narrative positive post: 5161
Number of users with at least 2 narrative positive posts: 901


## Topic Modeling
- Training data:
    + All narrative-positive posts from users who have at least 2 and fewer than 50 such posts (the filter applied in Data Preprocessing).
    + Each document represents a chunk derived from a post (each post has 5 chunks).
    

### Data preparation

In [14]:
# Generating training data and per-user groupings ----
# training_data : flat list of preprocessed documents, one per post that is
#                 non-empty after lmw.process_string.
# groupings     : one list per user (row of positive_users) holding the positions
#                 of that user's documents inside training_data, in stored order.
#
# Both are built in a single pass so that every index in `groupings` refers to an
# existing entry of `training_data`. Counting posts separately from the filtered
# document list would shift all later indices whenever a post is dropped.
training_data = []
groupings = []
for user_posts in positive_users['selftext']:
    user_doc_ids = []
    for raw_text in user_posts:
        processed = lmw.process_string(raw_text)
        if not processed.strip():
            # Empty after preprocessing: excluded from training, so it must
            # not consume a document index either.
            continue
        user_doc_ids.append(len(training_data))
        training_data.append(processed)
    groupings.append(user_doc_ids)

In [15]:
# Train one topic model per candidate number of topics; lmw_training receives
# the topic count, the output directory and the training documents
output_directory_path = "../data/output/pos-output/"
num_top = [5, 10, 15, 20]
for n_topics_candidate in num_top:
    lmw_training(n_topics_candidate, output_directory_path, training_data)

Importing data...
Complete
Training topic model...


Mallet LDA: 5 topics, 3 topic bits, 111 topic mask
Data loaded.
max tokens: 960
total tokens: 287155
<10> LL/token: -8.62125
<20> LL/token: -8.38521
<30> LL/token: -8.24701
<40> LL/token: -8.1835

0	1	really like NUM today time would ate one good went got back didn every night food felt last going said 
1	1	like feel people know want even one body look never think would always see really life way say something someone 
2	1	recovery know NUM want years get really time help year need life never last going anyone eating since enough back 
3	1	NUM weight eating like feel body much still calories eat gain back even day lose healthy since months fat normal 
4	1	eat food feel like day eating get even want going know binge days hungry mom anything full work need make 

<50> LL/token: -8.14915
<60> LL/token: -8.12055
<70> LL/token: -8.09975
<80> LL/token: -8.08462
<90> LL/token: -8.06685

0	1	NUM would went ate today really felt one didn got good night going back food started time first also co

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 960
total tokens: 287155
<10> LL/token: -8.93867
<20> LL/token: -8.67175
<30> LL/token: -8.52342
<40> LL/token: -8.42919

0	0.5	like feel NUM want going life back body every never even know get time still stomach think way would tired 
1	0.5	really like feel eating recovery anyone things get mental time also disorder feeling know take trying still health thoughts need 
2	0.5	NUM day eat today ate yesterday two one morning really got dinner night lunch sugar went last hours food every 
3	0.5	want know get even would think need feel also eating help better tell make really going scared try started anyone 
4	0.5	people recovery life years never enough therapist know always one talk work back see long things anorexia better new person 
5	0.5	like one didn mom made family would food make felt could good fucking people way say much friends actually literally 
6	0.5	eating eat food NUM calories day much days meal me

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 960
total tokens: 287155
<10> LL/token: -9.05248
<20> LL/token: -8.71661
<30> LL/token: -8.56213
<40> LL/token: -8.47423

0	0.33333	NUM time going get life years every want keep back everything made day work done never got thing first make 
1	0.33333	NUM weight since years lost year even lose gained pounds started fat last time look body back months scale weigh 
2	0.33333	help really know NUM told need tell said doctor therapist eating also well heart hospital take even someone self please 
3	0.33333	even get time got work long back still big last able dad school said mom home lot try much though 
4	0.33333	weight eating gain recovery body still healthy lose really gaining lot underweight months bmi want back normal calories much past 
5	0.33333	today didn thought felt could got going said would really last mom away night good right small told one took 
6	0.33333	recovery years disorder one life eating health

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 960
total tokens: 287155
<10> LL/token: -9.11663
<20> LL/token: -8.79236
<30> LL/token: -8.64335
<40> LL/token: -8.5621

0	0.25	want eating know disorder life years better get think would feel started really anyone help never bulimia struggling anything health 
1	0.25	like feel really know want even think also bad makes lot doesn much get still well said cause stuff though 
2	0.25	day time work get home really take meal want plan need next anxiety days scared eat also working away going 
3	0.25	like feel body people even weight fat look always see NUM way still think dont skinny time years never cant 
4	0.25	NUM weight pounds know want get bmi gain lose even scared since put lost loss back scale much week probably 
5	0.25	weight NUM eating recovery still gain healthy lose want months calories lot eat much past underweight gaining recover body gained 
6	0.25	recovery foods things food fear really still restri

Complete


### Topic Analysis

Following Akoury et al. (2020), we examine common local topic transitions between entries written by the same user over time. The transition probability from topic A to topic B is computed by counting how many times A and B are the most probable topics of two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table: Topics with the highest relative importance for a sample of STORIUM worlds, which illustrate the diversity of the dataset.

In [17]:
# Write the first ten keywords of every topic, for each trained k, to one overview file
with open(output_directory_path + "top_view.txt", 'w') as overview:
    for k in num_top:
        keys_for_k = lmw.load_topic_keys(f"{output_directory_path}mallet.topic_keys.{k}")
        overview.write(f"Topics for k={k}\n")
        # One tab-separated line per topic: topic index, then its keywords
        overview.writelines(
            f"{topic_idx}\t{' '.join(keywords[:10])}\n"
            for topic_idx, keywords in enumerate(keys_for_k)
        )
        # Blank line between the sections of different k
        overview.write('\n')

In [19]:
# Number of topics of the model to inspect; change this to view different output files
num_topics = 10

# Viewing topics ----
# Each topic is labelled by its first ten keywords joined with spaces
topic_keys = lmw.load_topic_keys(output_directory_path + "mallet.topic_keys." + str(num_topics))
topic_label = [' '.join(keywords[:10]) for keywords in topic_keys]
topic_label

['doctor take heart NUM blood taking said laxatives hospital drink',
 'recovery body still restriction hair physical long extreme able anyone',
 'food NUM eat ate foods one sugar chocolate cream today',
 'know like want really feel eating get think even also',
 'life never would people enough love every xNUMb body world',
 'said told people like one didn mom look always never',
 'eat NUM eating day food calories days binge hungry meal',
 'like feel want even know going time fucking get day',
 'NUM work last treatment time get year back years got',
 'weight NUM body gain lose lost look back gained fat']

In [20]:
# For each topic, get the indices of its ten highest-probability documents ----
# ranking_doc maps a topic label to a list of document indices (as strings),
# ordered from most to least probable for that topic.
#
# Document indices are ranked directly instead of ranking the texts and looking
# each one up with training_data.index(...): list.index returns the first match,
# so two documents with identical text would both be reported under the same index.
ranking_doc = {}
topic_distributions = lmw.load_topic_distributions(output_directory_path + "mallet.topic_distributions." + str(num_topics))
n_docs = min(len(training_data), len(topic_distributions))
for i in range(num_topics):
    # Sort by topic probability, ties broken by document text, descending
    # (intended to match the ordering of lmw.get_top_docs -- NOTE(review): confirm)
    ranked_ids = sorted(
        range(n_docs),
        key=lambda doc_id: (topic_distributions[doc_id][i], training_data[doc_id]),
        reverse=True,
    )
    ranking_doc[topic_label[i]] = [str(doc_id) for doc_id in ranked_ids[:10]]

In [21]:
# Invert ranking_doc: for every document, list the (topic label, rank position)
# pairs of the topics in whose ranking it appears
ranking_pos = {}
for label, ranked_docs in ranking_doc.items():
    for doc_id in ranked_docs:
        # Position of the document's first occurrence within this topic's ranking
        ranking_pos.setdefault(doc_id, []).append((label, ranked_docs.index(doc_id)))

# Within each document, order the topics by rank position (best rank first)
ranking_pos = {
    doc_id: sorted(placements, key=lambda placement: placement[1])
    for doc_id, placements in ranking_pos.items()
}
# Order the documents themselves by their numeric index
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda entry: int(entry[0])))

In [22]:
# For each topic, get its probability of appearing in each document ----
# ranking_topic_prob maps a topic label to a list of (document index as string,
# probability) pairs ordered by document index.
#
# The probabilities are read straight from topic_distributions. Ranking all
# documents and then recovering each index with training_data.index(...) costs a
# linear scan per document and assigns duplicate texts the same (first) index.
n_docs = min(len(training_data), len(topic_distributions))
ranking_topic_prob = {}
for i in range(num_topics):
    ranking_topic_prob[topic_label[i]] = [
        (str(doc_id), topic_distributions[doc_id][i]) for doc_id in range(n_docs)
    ]

In [46]:
# Index of the highest-probability topic for each document ----
# (the first such index when several topics share the maximum)
most_prob = [distribution.index(max(distribution)) for distribution in topic_distributions]

# The same values arranged per user, following the document indices in groupings ----
most_prob_grouped = [
    [most_prob[doc_id] for doc_id in user_doc_ids]
    for user_doc_ids in groupings
]

2

In [58]:
# Local topic transitions between consecutive entries of the same user ----
# Steps: per-user transition counts -> division by corpus-wide topic counts ->
# each user's highest-scoring successor per topic -> most frequent such
# successor across users.

# Per-user counts: transition_prob[u][a][b] is the number of times topic b
# directly follows topic a in user u's sequence ----
transition_prob = []
for topic_seq in most_prob_grouped:
    successor_counts = {}
    for current, following in zip(topic_seq, topic_seq[1:]):
        row = successor_counts.setdefault(current, {})
        row[following] = row.get(following, 0) + 1
    transition_prob.append(successor_counts)

# Occurrences of each topic over all users and all entries ----
topic_count = {}
for topic_id in (t for topic_seq in most_prob_grouped for t in topic_seq):
    topic_count[topic_id] = topic_count.get(topic_id, 0) + 1

# Divide every per-user count by the corpus-wide count of its source topic ----
# The divisor is the same for all successors of a given topic, so it rescales a
# user's row without changing which successor has the largest value.
transition_prob_norm = [
    {
        source: {target: count / topic_count[source] for target, count in row.items()}
        for source, row in user_counts.items()
    }
    for user_counts in transition_prob
]

# For each user and source topic, the successor with the highest value ----
max_top = {}
for user_rows in transition_prob_norm:
    for source, row in user_rows.items():
        max_top.setdefault(source, []).append(max(row, key=row.get))

# For each source topic, the successor chosen by the most users ----
max_top_freq = {
    source: max(set(winners), key=winners.count)
    for source, winners in max_top.items()
}
max_top_freq

Most probable transition across users:


{2: 3, 3: 3, 9: 3, 6: 3, 7: 3, 1: 1, 4: 3, 8: 3, 0: 3, 5: 3}