In [1]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
from nltk import word_tokenize
lemmatizer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

MALLET_PATH = "~/mallet/bin/mallet"
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"
num_top = [10, 15, 20, 30]
output_directory_path = "../data/output/"

## Data Preprocessing

In [2]:
# Positive narrative dataset grouped by user ----
positive_df = pd.read_csv(NAR_POS_PATH)[['subreddit_id', 'author', 'selftext', 'title', 'id', 'created_utc']]
positive_df['selftext'] = positive_df['selftext'].replace(r'\n',' ', regex=True) 
positive_users = positive_df.groupby(["author", "subreddit_id"]).agg(list).reset_index()
positive_users = positive_users[positive_users['created_utc'].apply(lambda x: len(x) >= 2 and len(x) < 50)].reset_index(drop=True)

sum_post = []
for cluster in positive_users['selftext']: 
    for post in cluster: 
        sum_post.append(len(post.split()))
print("Number of users with at least 2 narrative positive posts:", len(positive_users))
print("Average number of words per post:", sum(sum_post)/len(sum_post))
print("Standard deviation of number of words per post:", np.std(sum_post))
print("Min/max number of words per post:", min(sum_post), max(sum_post))
positive_users.head(5)

Number of users with at least 2 narrative positive posts: 901
Average number of words per post: 220.56056338028168
Standard deviation of number of words per post: 141.52999182363467
Min/max number of words per post: 100 1913


Unnamed: 0,author,subreddit_id,selftext,title,id,created_utc
0,-CreamyPie-,t5_o3plh,[Hello! Im 11 and Im a boy self recovering fr...,"[Fear Food Friday!! So far its successful, Fea...","[gbj0rh, gagddu, g7xk05]","[1588343070, 1588190127, 1587836033]"
1,194569324,t5_s5o7i,[Just want to get this off my chest because I ...,"[Hardcore restricting, close to being discharg...","[g3255g, fvycov]","[1587130632, 1586178189]"
2,197326743251b,t5_2tmc8,[i can eat without rules im ok with not purg...,"[body image is the last thing to go, sensory o...","[pn4k5v, pjyyfb, oyf5pl]","[1631492734, 1631058338, 1628161076]"
3,40sareinteresting,t5_2tmc8,[I have been off and on bp for 20 years. It’s ...,"[Binging and gaining weight fast, How to truly...","[zyx8lq, ywoeic]","[1672395862, 1668588759]"
4,50gayrats,t5_rbmui,[Today my dad called go make sure I ate I said...,"[Eating disorder Health scare, My brother is s...","[10900qu, zy1gd0]","[1673426889, 1672307211]"


In [3]:
# Creating stopwords list ----
# Custom stopwords
with open("../data/input/custom_stop.txt", "r") as f:
    custom_stop = f.read().split()

# TF-IDF stopwords
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(positive_df['selftext'].tolist())
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
doc_frequency = (df_tfidf != 0).sum(axis=0)
doc_frequency = doc_frequency / len(positive_df)
df_tfidf = df_tfidf.loc[:, doc_frequency >= 0.5]
stop_tfidf = df_tfidf.columns.tolist()

# Final stoplist
stoplist = stopwords.words('english') + ["amp", "like"] + stop_tfidf

In [4]:
# Generating training data ----
training_data = [lmw.process_string(t) for post in positive_users['selftext'] for t in post]
training_data = [d for d in training_data if d.strip()]

# Generate groupings ----
groupings = []
counting = 0 
for item in positive_users['selftext']: 
    idx = []
    for post in item: 
        idx.append(counting)
        counting += 1
    groupings.append(idx)

## Training Topic Model
- Training data:
    + All posts in the database where the users post at least twice and fewer than 50 times.
    + Each document represent a post (post-level topic modeling).
    

In [5]:
# Training ----
'''
for num in num_top: 
    lmw_training(num, output_directory_path, training_data)
'''

'\nfor num in num_top: \n    lmw_training(num, output_directory_path, training_data)\n'

In [6]:
# Examining all topics (top_view.txt) ----
'''
with open(output_directory_path + "/top_view.txt", 'w') as f: 
    for num in num_top: 
        topic_keys = lmw.load_topic_keys(output_directory_path + "/mallet.topic_keys." + str(num))
        f.write("Topics for k=" + str(num) + "\n")
        for i, t in enumerate(topic_keys):
            line = str(i) + '\t' + ' '.join(t[:10]) + "\n"
            f.write(line)
        f.write('\n')
'''

'\nwith open(output_directory_path + "/top_view.txt", \'w\') as f: \n    for num in num_top: \n        topic_keys = lmw.load_topic_keys(output_directory_path + "/mallet.topic_keys." + str(num))\n        f.write("Topics for k=" + str(num) + "\n")\n        for i, t in enumerate(topic_keys):\n            line = str(i) + \'\t\' + \' \'.join(t[:10]) + "\n"\n            f.write(line)\n        f.write(\'\n\')\n'

## Analyzing Topics

### Local Topic Transition
Following Akoury 2020, we examine common local topic transitions between entries written by the same user across time. We compute the transition probability from topic A to topic B by counting how many times A and B are the most probable topics for two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table in the paper: Topics with the highest relative importance, which illustrate the diversity of the dataset.

In [7]:
num_topics = 20             # Change this to view different output files 

# Viewing topics ---- 
topic_keys = lmw.load_topic_keys(output_directory_path + "/mallet.topic_keys." + str(num_topics))
topic_label = []
for i, t in enumerate(topic_keys):
    topic_label.append(' '.join(t[:10]))
topic_label

['purging binge purge binging stop bulimia b/p day cycle bad',
 'xNUMb hair would teeth dentist make mouth abs use email',
 'mom family fat dad sister said parents told comments always',
 'like feel even want know much get hate fucking never',
 'work get want time going need life back job hard',
 'like feel really also know lot think still idk maybe',
 'food mad buy money cookies store candy buying kitchen fucking',
 'know really get want eating think help tell disorder also',
 'eat eating food feel like meal even day hungry want',
 'people post group recovery made diet disordered certain looking posts',
 'weight gain lose gained eating back underweight bmi want NUM',
 'body look weight see clothes skinny fat looked face fit',
 'foods ate NUM eat food sugar chocolate cream ice cheese',
 'life eating self disorder things every people anxiety way mental',
 'never heart pain sick enough blood sleep night take water',
 'today didn felt got day going last back went time',
 'treatment inpati

In [8]:
topic_distributions = lmw.load_topic_distributions(output_directory_path + "/mallet.topic_distributions." + str(num_topics))

# Get the most probable topic and topic distribution for each document ---- 
most_prob = [np.argmax(d) for d in topic_distributions]

# Get the most probable topic and topic distribution for each document (grouped by user) ----
most_prob_grouped, prob_grouped = [], []
for group in groupings:
    most_prob_grouped.append([most_prob[i] for i in group])
    prob_grouped.append([topic_distributions[i] for i in group])

positive_users['Topic Distribution'] = prob_grouped
positive_users['most_probable_topic'] = most_prob_grouped
positive_users.head(5)

Unnamed: 0,author,subreddit_id,selftext,title,id,created_utc,Topic Distribution,most_probable_topic
0,-CreamyPie-,t5_o3plh,[Hello! Im 11 and Im a boy self recovering fr...,"[Fear Food Friday!! So far its successful, Fea...","[gbj0rh, gagddu, g7xk05]","[1588343070, 1588190127, 1587836033]","[[0.003704930553153588, 0.0008734280746749083,...","[15, 15, 12]"
1,194569324,t5_s5o7i,[Just want to get this off my chest because I ...,"[Hardcore restricting, close to being discharg...","[g3255g, fvycov]","[1587130632, 1586178189]","[[0.002993392052432115, 0.000705684659831887, ...","[16, 8]"
2,197326743251b,t5_2tmc8,[i can eat without rules im ok with not purg...,"[body image is the last thing to go, sensory o...","[pn4k5v, pjyyfb, oyf5pl]","[1631492734, 1631058338, 1628161076]","[[0.039118068182169494, 0.0009652005288195401,...","[3, 3, 8]"
3,40sareinteresting,t5_2tmc8,[I have been off and on bp for 20 years. It’s ...,"[Binging and gaining weight fast, How to truly...","[zyx8lq, ywoeic]","[1672395862, 1668588759]","[[0.17205510981904829, 0.0007751575022792212, ...","[18, 18]"
4,50gayrats,t5_rbmui,[Today my dad called go make sure I ate I said...,"[Eating disorder Health scare, My brother is s...","[10900qu, zy1gd0]","[1673426889, 1672307211]","[[0.001442258027445157, 0.0003400087084017289,...","[15, 8]"


In [9]:
# Get the most probable topic and topic distribution for each document (ungrouped) ----
exploded = positive_users.explode(['selftext', 'created_utc', 'title', 'Topic Distribution', 'most_probable_topic', 'id']).to_csv("../data/positive_topic_dist.csv")

In [10]:
# Local topic transition between entries within the same user ----
# For each user, get the transition probability from topic A to topic B by
# counting the number of times topic B appears after another topic A
# normalize by the total number of times topic A is the most probable topic

# Get the transition probability for each user ----
# transition_prob: List of dictionaries {topic A: {topic B: countBA}}
# topic_count: List of dictionaries {topic A: countA}
transition_count, topic_count = [], []
for user in most_prob_grouped:
    transition, topic = {}, {}
    for i in range(len(user)-1):
        if user[i] not in transition:
            transition[user[i]] = {user[i+1]: 1}
        else: 
            transition[user[i]][user[i+1]] = transition[user[i]].get(user[i+1], 0) + 1
        topic[user[i]] = topic.get(user[i], 0) + 1
    transition_count.append(transition)
    topic_count.append(topic)

# Normalize the transition probability ----
transition_prob_norm = []
for user in range(len(transition_count)): 
    transition_norm = {}
    for topicA in transition_count[user]: 
        for topicB in transition_count[user][topicA]: 
            transition_norm[topicA] = transition_norm.get(topicA, {})
            transition_norm[topicA][topicB] = transition_count[user][topicA][topicB] / topic_count[user][topicA]
    transition_prob_norm.append(transition_norm)

In [11]:
# Get the most probable transition for each user----
max_top = []
for user in transition_prob_norm:
    maxing = {}
    for topic in user:
        max_val = 0 
        max_topic = []
        for t in user[topic]:
            if user[topic][t] > max_val: 
                max_val = user[topic][t]
                max_topic = [t]
            elif user[topic][t] == max_val: 
                max_topic.append(t)
        maxing[topic] = max_topic
    max_top.append(maxing)


# Reformat max_top into a list of lists of tuples ----
all_users = []
for user in max_top: 
    user_list = []
    for topic in user: 
        for t in user[topic]: 
            user_list.append((topic, t))
    all_users.append(user_list)
all_users_expanded = [user[i] for user in all_users for i in range(len(user))]

In [12]:
# Get the most frequent transition across all users ----
max_top_freq = {}
for tup in all_users_expanded:
    # get the most frequent transition for each user 
    if tup[0] not in max_top_freq: 
        max_top_freq[tup[0]] = {tup[1]: 1}
    else:
        max_top_freq[tup[0]][tup[1]] = max_top_freq[tup[0]].get(tup[1], 0) + 1

# Get the value with the highest count for each topic ----
# Topic     Most Frequent Transition
topic_label = {}
with open("../data/analysis/topic_label_20.txt", 'r') as f:
    labs = f.readlines()
    labs = [t.strip() for t in labs]
    for topic in labs: 
        idx, label, keywords = topic.split("_")[0],  topic.split("_")[1],  topic.split("_")[2]
        if str(idx) not in topic_label: 
            topic_label[str(idx)] = [label, keywords]

print('Most common local topic transitions across users:')
for topic in max_top_freq: 
    print(topic_label[str(topic)][0], "---->", topic_label[str(max(max_top_freq[topic], key=max_top_freq[topic].get))][0])


Most common local topic transitions across users:
timeShort (moderate) ----> feeling (moderate)
formalTreatment (good) ----> feeling (moderate)
feeling (moderate) ----> feeling (moderate)
timeLong (moderate) ----> feeling (moderate)
eat(good) ----> eat(good)
recovery (bad) ----> recovery (bad)
weight (good) ----> weight (good)
feeling (moderate) ----> feeling (moderate)
community (moderate) ----> feeling (moderate)
food (good) ----> feeling (moderate)
work (bad) ----> work (bad)
feeling (moderate) ----> feeling (moderate)
calories (bad) ----> feeling (moderate)
bodyParts (good) ----> weight (good)
family (good) ----> feeling (moderate)
eating ----> feeling (moderate)
appearance (good) ----> feeling (moderate)
groceryShopping (moderate) ----> eat(good)
life (bad) ----> life (bad)
discomfort (moderate) ----> timeShort (moderate)


### Topic-sentiment analysis

In [13]:
# Adding sentence polarity to each user (positive_users df) ----
submission_polarity = pd.read_csv("../../data_collection/submission_polarity.csv")
polarity_pos = []
for id in positive_users['id']: 
    subpolarity = []
    for subid in id: 
        subpolarity.append(submission_polarity[submission_polarity['id'] == subid]['sentence_polarity'].values[0])
    polarity_pos.append(subpolarity)
positive_users['post_polarity'] = polarity_pos

Unnamed: 0,author,subreddit_id,selftext,title,id,created_utc,Topic Distribution,most_probable_topic,post_polarity
0,-CreamyPie-,t5_o3plh,[Hello! Im 11 and Im a boy self recovering fr...,"[Fear Food Friday!! So far its successful, Fea...","[gbj0rh, gagddu, g7xk05]","[1588343070, 1588190127, 1587836033]","[[0.003704930553153588, 0.0008734280746749083,...","[15, 15, 12]","[0.0870228548086017, -0.2328760368619884, 0.34..."
1,194569324,t5_s5o7i,[Just want to get this off my chest because I ...,"[Hardcore restricting, close to being discharg...","[g3255g, fvycov]","[1587130632, 1586178189]","[[0.002993392052432115, 0.000705684659831887, ...","[16, 8]","[-2.956558476721538, -2.654536509095336]"
2,197326743251b,t5_2tmc8,[i can eat without rules im ok with not purg...,"[body image is the last thing to go, sensory o...","[pn4k5v, pjyyfb, oyf5pl]","[1631492734, 1631058338, 1628161076]","[[0.039118068182169494, 0.0009652005288195401,...","[3, 3, 8]","[0.3099656246805764, -0.132066773155663, 0.068..."
3,40sareinteresting,t5_2tmc8,[I have been off and on bp for 20 years. It’s ...,"[Binging and gaining weight fast, How to truly...","[zyx8lq, ywoeic]","[1672395862, 1668588759]","[[0.17205510981904829, 0.0007751575022792212, ...","[18, 18]","[0.6908962699988288, 0.5311602587062468]"
4,50gayrats,t5_rbmui,[Today my dad called go make sure I ate I said...,"[Eating disorder Health scare, My brother is s...","[10900qu, zy1gd0]","[1673426889, 1672307211]","[[0.001442258027445157, 0.0003400087084017289,...","[15, 8]","[-1.1435405463625492, -1.1485738463722492]"


In [14]:
# Average polarity per topic (polarity is assigned to the most probable topic in each document) ---- 
topic_polarity = {}
for i in range(len(positive_users)): 
    for j in range(len(most_prob_grouped[i])): 
        if str(most_prob_grouped[i][j]) not in topic_polarity: 
            topic_polarity[str(most_prob_grouped[i][j])] = [positive_users['post_polarity'][i][j]]
        else: 
            topic_polarity[str(most_prob_grouped[i][j])].append(positive_users['post_polarity'][i][j])

topic_polarity_avg = {}
for topic in topic_polarity:
    topic_polarity_avg[topic_label[topic][0]] = sum(topic_polarity[topic]) / len(topic_polarity[topic])

topic_polarity_avg

{'timeShort (moderate)': -1.0042662335213395,
 'food (good)': -0.9399128282269104,
 'formalTreatment (good)': -1.365483075300481,
 'eat(good)': -0.8294382080727963,
 'feeling (moderate)': -1.276713874389304,
 'timeLong (moderate)': -1.3123584171180183,
 'recovery (bad)': -0.3513255975950317,
 'weight (good)': -1.2394931914468286,
 'community (moderate)': -0.5671461514699745,
 'work (bad)': -1.608300935687175,
 'calories (bad)': -1.077941042536435,
 'appearance (good)': -1.278935692896486,
 'bodyParts (good)': -0.6186034156220543,
 'family (good)': -1.246374382914337,
 'eating': -0.8096455087097165,
 'groceryShopping (moderate)': -1.236744592516649,
 'life (bad)': -1.184937645141877,
 'discomfort (moderate)': -1.2340944579601971}

In [19]:
# Get the polarity score associated with local topic transitions ----
# transition_polarity: List of dictionaries {topic A: {topic B: [polarityB - polarityA]}}
transition_polarity = []
for user in range(len(positive_users)):
    transition_user = {}
    for i in range(len(positive_users['most_probable_topic'][user])-1):
        transition_user[(positive_users['most_probable_topic'][user][i], positive_users['most_probable_topic'][user][i+1])] = transition_user.get((positive_users['most_probable_topic'][user][i], positive_users['most_probable_topic'][user][i+1]), [])
        transition_user[(positive_users['most_probable_topic'][user][i], positive_users['most_probable_topic'][user][i+1])].append(positive_users['post_polarity'][user][i+1] - positive_users['post_polarity'][user][i])
    transition_polarity.append(transition_user)

# Get average polarity score for each transition pair across all users ----
transition_polarity_avg = {}
for user in transition_polarity:
    for topic in user: 
        if topic not in transition_polarity_avg: 
            transition_polarity_avg[topic] = user[topic]
        else: 
            transition_polarity_avg[topic] += user[topic]

for topic in transition_polarity_avg: 
    transition_polarity_avg[topic] = sum(transition_polarity_avg[topic]) / len(transition_polarity_avg[topic])

transition_polarity_avg_sorted = dict(sorted(transition_polarity_avg.items(), key=lambda x: x[1], reverse=True))
print("Normalized polarity score for local topic transitions across users:")
for topic in transition_polarity_avg_sorted: 
    print(topic_label[str(topic[0])][0], "---->", topic_label[str(topic[1])][0], ":", transition_polarity_avg[topic])

Normalized polarity score for local topic transitions across users:
eating ----> life (bad) : 1.1195959947656848
feeling (moderate) ----> formalTreatment (good) : 1.0277378768123764
eating ----> food (good) : 0.8721667146657658
weight (good) ----> life (bad) : 0.8621830401831583
eat(good) ----> groceryShopping (moderate) : 0.8027146836446373
community (moderate) ----> groceryShopping (moderate) : 0.7905859316753618
feeling (moderate) ----> formalTreatment (good) : 0.7883741490538481
feeling (moderate) ----> calories (bad) : 0.7534539971608992
feeling (moderate) ----> discomfort (moderate) : 0.7428046861969628
calories (bad) ----> bodyParts (good) : 0.7086652728620207
family (good) ----> life (bad) : 0.673734773690271
work (bad) ----> food (good) : 0.6693355550030348
timeLong (moderate) ----> community (moderate) : 0.6650051256184442
recovery (bad) ----> groceryShopping (moderate) : 0.6042457019483545
timeLong (moderate) ----> appearance (good) : 0.6032207321465486
weight (good) ----> f

### Topic-factor analysis

### Antoniak (2019)'s time series analysis

In [None]:
# For each topic, get the ranking of all the documents
ranking_doc = {}
for i in range(num_topics):
    ranking_doc[topic_label[i]] = []
    for p, d in lmw.get_top_docs(training_data, topic_distributions, topic_index=i, n=10):
        ranking_doc[topic_label[i]].append(str(training_data.index(d)))

# Get the position of each document in the ranking
ranking_pos = {}
for topic in ranking_doc: 
    for doc in ranking_doc[topic]: 
        if doc not in ranking_pos:
            ranking_pos[doc] = [(topic, ranking_doc[topic].index(doc))]
        else: 
            ranking_pos[doc].append((topic, ranking_doc[topic].index(doc)))

ranking_pos = {k: sorted(v, key=lambda x: x[1]) for k, v in ranking_pos.items()}
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda item: int(item[0])))

# For each topic, get its probability of appearing in each document 
ranking_topic_prob = {}
for i in range(num_topics):
    ranking_topic_prob[topic_label[i]] = []
    for p, d in lmw.get_top_docs(training_data, topic_distributions, topic_index=i, n=len(training_data)):
        ranking_topic_prob[topic_label[i]].append((str(training_data.index(d)), p))
# Sort ranking_topic_prob by the first index of the value tuple
ranking_topic_prob = {k: sorted(v, key=lambda x: int(x[0])) for k, v in ranking_topic_prob.items()}