In [1]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
lemmatizer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords
STOP = stopwords.words('english')
STOP.append("amp")

MALLET_PATH = "~/mallet/bin/mallet"
USER_POS_PATH = "../data/user_positive.csv"
USER_SQL_PATH = "../data/user_sqlite.csv"
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"
num_top = [5,10,15,20]
output_directory_path = "../data/output/pos-output"

## Data Preprocessing

In [2]:
# Positive narrative dataset grouped by user ----
positive_users = pd.read_csv(USER_POS_PATH)[['subreddit_id', 'author', 'selftext',
                                             'title', 'id', 'created_utc']]
# Convert string to list
positive_users[['selftext','title', 'id', 'created_utc']] = positive_users[['selftext', 'title', 'id', 'created_utc']].apply(lambda x: x.apply(ast.literal_eval))

print("Number of users with a narrative positive post:", len(positive_users))
positive_users = positive_users[positive_users['created_utc'].apply(lambda x: len(x) >= 2 and len(x) < 50)]
print("Number of users with at least 2 narrative positive posts:", len(positive_users))
positive_users

Number of users with a narrative positive post: 5161
Number of users with at least 2 narrative positive posts: 901


Unnamed: 0,subreddit_id,author,selftext,title,id,created_utc
1,t5_o3plh,-CreamyPie-,[Hello!\n\nIm 11 and Im a boy self recovering ...,"[Fear Food Friday!! So far its successful, Fea...","[gbj0rh, gagddu, g7xk05]","[1588343070, 1588190127, 1587836033]"
22,t5_s5o7i,194569324,[Just want to get this off my chest because I ...,"[Hardcore restricting, close to being discharg...","[g3255g, fvycov]","[1587130632, 1586178189]"
23,t5_2tmc8,197326743251b,[i can eat without rules \n\nim ok with not pu...,"[body image is the last thing to go, sensory o...","[pn4k5v, pjyyfb, oyf5pl]","[1631492734, 1631058338, 1628161076]"
40,t5_2tmc8,40sareinteresting,[I have been off and on bp for 20 years. It’s ...,"[Binging and gaining weight fast, How to truly...","[zyx8lq, ywoeic]","[1672395862, 1668588759]"
47,t5_rbmui,50gayrats,[Today my dad called go make sure I ate I said...,"[Eating disorder Health scare, My brother is s...","[10900qu, zy1gd0]","[1673426889, 1672307211]"
...,...,...,...,...,...,...
5111,t5_rbmui,yellmoe,"[My dad lives continents away from me, and usu...","[I cooked with my dad today!!, Freaked out by ...","[10l6umt, 10j52kx, 10a28kr]","[1674673942, 1674452520, 1673537087]"
5127,t5_2tmc8,you_enjoy_my_yoga,[I can’t take this anymore. I need to free mys...,"[Recovery as a New Years resolution, Addicted ...","[zz3hrt, yrvzux]","[1672414828, 1668125294]"
5131,t5_2ydy9,yourbaepuppy,[Ever since I wad young I've always been on th...,[I feel like my parents are the reason why I'm...,"[10obocf, 10nve4f]","[1675009767, 1674955710]"
5140,t5_2zzq8,yuyuanmi,"[The transition to college, plus some other li...",[Can exercise be a healthy coping mechanism? O...,"[eu1tuk, eodu51]","[1580011342, 1578964660]"


In [3]:
# Generating training data ----
training_data = [lmw.process_string(t) for post in positive_users['selftext'] for t in post]
training_data = [d for d in training_data if d.strip()]

# Generate groupings ----
groupings = []
counting = 0 
for item in positive_users['selftext']: 
    idx = []
    for post in item: 
        idx.append(counting)
        counting += 1
    groupings.append(idx)

## Topic Modeling
- Training data:
    + All posts in the database where the users post at least twice and fewer than 50 times.
    + Each document represent a post. 
    

### Training 
- Do not run if only doing analyses

In [4]:
# Training ----
for num in num_top: 
    lmw_training(num, output_directory_path, training_data)

Importing data...
Complete
Training topic model...


Mallet LDA: 5 topics, 3 topic bits, 111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -8.67672
<20> LL/token: -8.44713
<30> LL/token: -8.30195
<40> LL/token: -8.23101

0	1	body look know recovery would get even like want fat hate still way back weight looking thought feel see bad 
1	1	eat food like eating day feel ate binge today foods going one full get every meal good something much even 
2	1	weight eating feel like NUM eat know much want still even lot really think lose calories gain started recovery restricting 
3	1	NUM last time years back said since year didn got went going work ago know get lost weeks purging first 
4	1	like want feel know people really life get one think better help anyone time even could never much something ever 

<50> LL/token: -8.19422
<60> LL/token: -8.16798
<70> LL/token: -8.15254
<80> LL/token: -8.1359
<90> LL/token: -8.13022

0	1	like body look even feel fat see know still hate always would weight way looking recovery skinn

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -8.96798
<20> LL/token: -8.66299
<30> LL/token: -8.52594
<40> LL/token: -8.4497

0	0.5	time would years still family first thought didn said good long friends back made could get one got big point 
1	0.5	eat NUM eating day food ate like today calories even meal dinner made hungry much days going want meals really 
2	0.5	eat food eating like feel much get binge know stomach tired time doesn also fucking sometimes gonna binging something makes 
3	0.5	day NUM get felt last work one days take got help didn purging new went done today going night back 
4	0.5	like people feel want know one never even get always would think better see going someone thing life told ever 
5	0.5	like feel want body really even know look much hate purging bad anyone stop think since else hard purge feeling 
6	0.5	NUM really know recovery want going need also much get time anyone week year told hos

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -9.10546
<20> LL/token: -8.78985
<30> LL/token: -8.62894
<40> LL/token: -8.55677

0	0.33333	like feel look people body hate want fat even see never always one skinny think thin thing literally fucking way 
1	0.33333	life get work new time want job like things well hope let one people hair nand right find everything away 
2	0.33333	back also day small still meal little exercise home get would bit big felt thought maybe going like got kinda 
3	0.33333	know want get eating really told eat would going like said make help also tell say didn need mom doesn 
4	0.33333	weight body gain much recovery know lose really back even lost eating gaining gained healthy underweight lot also still loss 
5	0.33333	recovery anyone would take know anxiety purging well purge else one bulimia last get long post bad maybe day won 
6	0.33333	like feel eat eating want even enough sick still much 

Complete
Importing data...
Complete
Training topic model...


Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
max tokens: 972
total tokens: 290071
<10> LL/token: -9.17992
<20> LL/token: -8.85742
<30> LL/token: -8.70462
<40> LL/token: -8.60941

0	0.25	people like said one told think time doesn someone know didn tell see say could care saying understand knows telling 
1	0.25	NUM eat ate day dinner today food meal meals breakfast lunch morning went last one water big days yesterday two 
2	0.25	feel like eat eating food hungry even want know still hunger feeling full day stomach meal night extreme binge makes 
3	0.25	week like going day little one work well hair make know two super time keep workout full stay nothing take 
4	0.25	eating eat really food recovery things could disorder long like way one still restriction normal always calories anyone use please 
5	0.25	want know get make would also going right disorder friends everything talk trying even really people family hard wanted need 
6	0.25	weight body gain healthy still know

Complete


In [5]:
# Examining all topics (top_view.txt) ----
with open(output_directory_path + "/top_view.txt", 'w') as f: 
    for num in num_top: 
        topic_keys = lmw.load_topic_keys(output_directory_path + "/mallet.topic_keys." + str(num))
        f.write("Topics for k=" + str(num) + "\n")
        for i, t in enumerate(topic_keys):
            line = str(i) + '\t' + ' '.join(t[:10]) + "\n"
            f.write(line)
        f.write('\n')

### Topic Analysis

Following Akoury 2020, we examine common local topic transitions between entries written by the same user across time. We compute the transition probability from topic A to topic B by counting how many times A and B are the most probable topics for two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table in the paper: Topics with the highest relative importance, which illustrate the diversity of the dataset.

In [6]:
num_topics = 10             # Change this to view different output files 

# Viewing topics ---- 
topic_keys = lmw.load_topic_keys(output_directory_path + "/mallet.topic_keys." + str(num_topics))
topic_label = []
for i, t in enumerate(topic_keys):
    topic_label.append(' '.join(t[:10]))
topic_label

['would time nNUM going fucking life xNUMb every let shit',
 'eat food NUM ate day eating dinner foods today binge',
 'like feel want know eat even get eating really time',
 'purging purge NUM day night take sleep morning got felt',
 'said told never didn would people mom like one really',
 'look body weight like fat see skinny clothes hate face',
 'NUM get work treatment time therapist need help know home',
 'recovery eating anyone body still restriction hunger advice extreme also',
 'life people years things disorder one mental health recovery anorexia',
 'NUM weight gain calories lost eating back lose much gained']

In [7]:
topic_distributions = lmw.load_topic_distributions(output_directory_path + "/mallet.topic_distributions." + str(num_topics))

# Get the most probable topic and topic distribution for each document ---- 
most_prob = []
topics = []
for doc in range(len(topic_distributions)): 
    item = topic_distributions[doc]
    most_prob.append(item.index(max(item)))
    topics.append(item)

# Get the most probable topic and topic distribution for each document (grouped by user) ----
most_prob_grouped = []
prob_grouped = []
for group in groupings:
    most_prob_grouped.append([most_prob[i] for i in group])
    prob_grouped.append([topics[i] for i in group])

positive_users['Topic Distribution'] = prob_grouped

In [8]:
exploded = positive_users.explode(['selftext', 'created_utc', 'title', 'Topic Distribution', 'id']).to_csv("../data/positive_topic_dist.csv")

In [9]:
# Local topic transition between entries within the same user ----
# For each user, get the transition probability from topic A to topic B by
# counting the number of times topic B appears after another topic A
# normalize by the total number of times topic A is the most probable topic

# Get the transition probability for each user ----
# transition_prob: List of dictionaries {topic A: {topic B: countBA}}
# topic_count: List of dictionaries {topic A: countA}
transition_count, topic_count = [], []
for user in most_prob_grouped:
    transition, topic = {}, {}
    for i in range(len(user)-1):
        if user[i] not in transition:
            transition[user[i]] = {user[i+1]: 1}
        else: 
            transition[user[i]][user[i+1]] = transition[user[i]].get(user[i+1], 0) + 1
        topic[user[i]] = topic.get(user[i], 0) + 1
    transition_count.append(transition)
    topic_count.append(topic)

In [10]:
# Normalize the transition probability ----
transition_prob_norm = []
for user in range(len(transition_count)): 
    transition_norm = {}
    for topicA in transition_count[user]: 
        for topicB in transition_count[user][topicA]: 
            transition_norm[topicA] = transition_norm.get(topicA, {})
            transition_norm[topicA][topicB] = transition_count[user][topicA][topicB] / topic_count[user][topicA]
    transition_prob_norm.append(transition_norm)

In [11]:
# Get the most probable transition for each user----
max_top = []
for user in transition_prob_norm:
    maxing = {}
    for topic in user:
        max_val = 0 
        max_topic = []
        for t in user[topic]:
            if user[topic][t] > max_val: 
                max_val = user[topic][t]
                max_topic = [t]
            elif user[topic][t] == max_val: 
                max_topic.append(t)
        maxing[topic] = max_topic
    max_top.append(maxing)


# reformat max_top so that it is a list of lists of tuples ----
all_users = []
for user in max_top: 
    user_list = []
    for topic in user: 
        for t in user[topic]: 
            user_list.append((topic, t))
    all_users.append(user_list)
all_users_expanded = [user[i] for user in all_users for i in range(len(user))]

In [12]:
# Get the most frequent transition across all users ----
max_top_freq = {}
for tup in all_users_expanded:
    # get the most frequent transition for each user 
    if tup[0] not in max_top_freq: 
        max_top_freq[tup[0]] = {tup[1]: 1}
    else:
        max_top_freq[tup[0]][tup[1]] = max_top_freq[tup[0]].get(tup[1], 0) + 1

# Get the value with the highest count for each topic ----
# Topic     Most Frequent Transition
for topic in max_top_freq: 
    print(topic, max(max_top_freq[topic], key=max_top_freq[topic].get))


1 2
2 2
9 2
4 2
7 2
6 2
5 2
3 2
0 2
8 2


In [13]:
# Antoniak (2019)'s time series analysis ----

# For each topic, get the ranking of all the documents
ranking_doc = {}
for i in range(num_topics):
    ranking_doc[topic_label[i]] = []
    for p, d in lmw.get_top_docs(training_data, topic_distributions, topic_index=i, n=10):
        ranking_doc[topic_label[i]].append(str(training_data.index(d)))

# Get the position of each document in the ranking
ranking_pos = {}
for topic in ranking_doc: 
    for doc in ranking_doc[topic]: 
        if doc not in ranking_pos:
            ranking_pos[doc] = [(topic, ranking_doc[topic].index(doc))]
        else: 
            ranking_pos[doc].append((topic, ranking_doc[topic].index(doc)))

ranking_pos = {k: sorted(v, key=lambda x: x[1]) for k, v in ranking_pos.items()}
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda item: int(item[0])))

# For each topic, get its probability of appearing in each document 
ranking_topic_prob = {}
for i in range(num_topics):
    ranking_topic_prob[topic_label[i]] = []
    for p, d in lmw.get_top_docs(training_data, topic_distributions, topic_index=i, n=len(training_data)):
        ranking_topic_prob[topic_label[i]].append((str(training_data.index(d)), p))
# Sort ranking_topic_prob by the first index of the value tuple
ranking_topic_prob = {k: sorted(v, key=lambda x: int(x[0])) for k, v in ranking_topic_prob.items()}