In [1]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
lemmatizer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords
# English stop-word list from NLTK, extended with "amp"
# (presumably residue of HTML-escaped "&amp;" in the scraped text — TODO confirm).
STOP = stopwords.words('english')
STOP.append("amp")

# Location of the MALLET executable (not referenced in the cells shown here).
MALLET_PATH = "~/mallet/bin/mallet"
# Per-user CSV inputs; USER_POS_PATH is loaded into `positive_users` below.
USER_POS_PATH = "../data/user_positive.csv"
USER_SQL_PATH = "../data/user_sqlite.csv"
# CSV of narrative posts, loaded into `positive_nar` below.
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"
# Candidate numbers of topics; one model is trained and inspected per value.
num_top = [5,10,15,20]
# Directory prefix for all topic-model files (string-concatenated, so the trailing "/" matters).
output_directory_path = "../data/output/pos-output/"

## Data Preprocessing

In [2]:
# Positive narrative dataset grouped by user ----
positive_nar = pd.read_csv(NAR_POS_PATH)
positive_users = pd.read_csv(USER_POS_PATH)

# These columns hold Python literals serialized as text in the CSV;
# parse them back into Python objects before use.
for list_col in ('selftext', 'created_utc'):
    positive_users[list_col] = positive_users[list_col].apply(ast.literal_eval)

# One space-joined string per user containing all of that user's posts.
positive_users['merged'] = positive_users['selftext'].apply(' '.join)
print("Number of users with a narrative positive post:", len(positive_users))

# Keep only users with at least 2 and fewer than 50 entries in `created_utc`.
post_counts = positive_users['created_utc'].apply(len)
within_range = (post_counts >= 2) & (post_counts < 50)
positive_users = positive_users[within_range].reset_index(drop=True)
print("Number of users with at least 2 narrative positive posts:", len(positive_users))

Number of users with a narrative positive post: 5161
Number of users with at least 2 narrative positive posts: 901


In [3]:
# Generating training data and groupings ----
# `training_data` is the flat list of processed posts (one document per post),
# with posts that are empty after processing dropped.
# `groupings` holds, for every user, the positions of that user's posts inside
# `training_data`.
#
# Both are built in a single pass on purpose: the document index is taken
# *after* the empty-document filter, so the indices in `groupings` always line
# up with `training_data` (and with anything aligned to it, such as the
# per-document topic distributions). Numbering the unfiltered posts instead
# would shift every index after the first dropped post.
training_data = []
groupings = []
for user_posts in positive_users['selftext']:
    idx = []
    for raw_post in user_posts:
        processed = lmw.process_string(raw_post)
        if not processed.strip():
            # Nothing left after processing: skip it and do not consume an index.
            continue
        idx.append(len(training_data))
        training_data.append(processed)
    # A user whose posts were all dropped keeps an (empty) entry so that
    # `groupings` stays row-aligned with `positive_users`.
    groupings.append(idx)

## Topic Modeling
- Training data:
    + All posts written by users who posted at least twice and fewer than 50 times.
    + Each document represents a single post.
    

### Training 
- Skip the cells in this section if you only want to run the analyses on previously trained models.

In [None]:
# Training ----
# Call `lmw_training` once for every candidate topic count in `num_top`,
# passing the shared output directory and the processed corpus.
for n_topics in num_top:
    lmw_training(n_topics, output_directory_path, training_data)

In [None]:
# Examining all topics (top_view.txt) ----
# For every candidate topic count, load that model's topic keys and write one
# tab-separated line per topic (topic index, then its first ten keys) into a
# single overview file, with a blank line between models.
with open(output_directory_path + "top_view.txt", 'w') as out_file:
    for k in num_top:
        topic_keys = lmw.load_topic_keys(output_directory_path + "mallet.topic_keys." + str(k))
        out_file.write("Topics for k=" + str(k) + "\n")
        out_file.writelines(
            str(topic_idx) + '\t' + ' '.join(words[:10]) + "\n"
            for topic_idx, words in enumerate(topic_keys)
        )
        out_file.write('\n')

### Topic Analysis

Following Akoury 2020, we examine common local topic transitions between entries written by the same user across time. We compute the transition probability from topic A to topic B by counting how many times A and B are the most probable topics for two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table in the paper: Topics with the highest relative importance, which illustrate the diversity of the dataset.

In [4]:
num_topics = 10             # Change this to view different output files 

# Viewing topics ---- 
# Label each topic with its first ten keys joined by spaces; the position in
# `topic_label` is the topic index.
topic_keys = lmw.load_topic_keys(output_directory_path + "mallet.topic_keys." + str(num_topics))
topic_label = [' '.join(words[:10]) for words in topic_keys]
topic_label

['doctor take heart NUM blood taking said laxatives hospital drink',
 'recovery body still restriction hair physical long extreme able anyone',
 'food NUM eat ate foods one sugar chocolate cream today',
 'know like want really feel eating get think even also',
 'life never would people enough love every xNUMb body world',
 'said told people like one didn mom look always never',
 'eat NUM eating day food calories days binge hungry meal',
 'like feel want even know going time fucking get day',
 'NUM work last treatment time get year back years got',
 'weight NUM body gain lose lost look back gained fat']

In [5]:
topic_distributions = lmw.load_topic_distributions(output_directory_path + "mallet.topic_distributions." + str(num_topics))

# Get the most probable topic for each document ---- 
# (index of the largest entry; the first one wins if several entries tie)
most_prob = [dist.index(max(dist)) for dist in topic_distributions]

# Get the most probable topic for each document (grouped by user) ----
# `groupings` lists, per user, the document indices belonging to that user.
most_prob_grouped = [
    [most_prob[doc_idx] for doc_idx in user_docs]
    for user_docs in groupings
]

In [6]:
# Local topic transition between entries within the same user ----
# Pipeline:
#   1. per user, count how often topic B directly follows topic A;
#   2. count how often each topic occurs over all users' documents;
#   3. divide the per-user transition counts by those occurrence counts;
#   4. per user and source topic, pick the most probable next topic;
#   5. per source topic, report the next topic picked by the most users.

# Get the transition counts for each user ----
transition_prob = []
for topic_seq in most_prob_grouped:
    counts = {}
    # Pair every entry with its successor; sequences shorter than 2 yield nothing.
    for src, dst in zip(topic_seq, topic_seq[1:]):
        row = counts.setdefault(src, {})
        row[dst] = row.get(dst, 0) + 1
    transition_prob.append(counts)

# Counting the number of occurrence for each topic ----
topic_count = {}
for topic_seq in most_prob_grouped:
    for src in topic_seq:
        topic_count[src] = topic_count.get(src, 0) + 1

# Normalize the transition probability ----
# NOTE(review): the denominator is the topic's count over *all* users, not the
# current user's; it is constant per source topic, so it cannot change which
# destination is the per-user maximum below.
transition_prob_norm = [
    {
        src: {dst: hits / topic_count[src] for dst, hits in row.items()}
        for src, row in counts.items()
    }
    for counts in transition_prob
]

# Get the most probable transition for each user----
max_top = {}
for user_rows in transition_prob_norm:
    for src, row in user_rows.items():
        max_top.setdefault(src, []).append(max(row, key=row.get))

# Get the most frequent transition across all users ----
max_top_freq = {
    src: max(set(picks), key=picks.count)
    for src, picks in max_top.items()
}

print("Most probable topic transition across all users:", max_top_freq)

Most probable topic transition across all users: {2: 3, 3: 3, 9: 3, 6: 3, 7: 3, 1: 1, 4: 3, 8: 3, 0: 3, 5: 3}


In [None]:
# Antoniak (2019)'s time series analysis ----
#
# Documents are identified by their position in `training_data`, stored as a
# string key (e.g. "42"). Positions are taken directly from
# `topic_distributions` rather than recovered with `training_data.index(text)`:
# `list.index` returns the first match, so posts with identical text would all
# collapse onto one index, and every lookup is a linear scan of the corpus.

n_top_docs = 10  # how many top-ranked documents to keep per topic
# Only positions present in both lists are considered.
n_docs = min(len(training_data), len(topic_distributions))

# For each topic, get the ranking of all the documents
# (indices of the `n_top_docs` documents with the highest probability for that
# topic, best first; the sort is stable, so ties keep corpus order)
ranking_doc = {}
for i in range(num_topics):
    ranked = sorted(
        range(n_docs),
        key=lambda doc_idx: topic_distributions[doc_idx][i],
        reverse=True,
    )
    ranking_doc[topic_label[i]] = [str(doc_idx) for doc_idx in ranked[:n_top_docs]]

# Get the position of each document in the ranking
# doc index -> [(topic label, rank within that topic's top list), ...]
ranking_pos = {}
for topic, docs in ranking_doc.items():
    for position, doc in enumerate(docs):
        ranking_pos.setdefault(doc, []).append((topic, position))

# Best (lowest) rank first within each document, documents in corpus order.
ranking_pos = {k: sorted(v, key=lambda x: x[1]) for k, v in ranking_pos.items()}
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda item: int(item[0])))

# For each topic, get its probability of appearing in each document 
# topic label -> [(doc index, probability), ...] ordered by document index
ranking_topic_prob = {
    topic_label[i]: [
        (str(doc_idx), topic_distributions[doc_idx][i])
        for doc_idx in range(n_docs)
    ]
    for i in range(num_topics)
}