In [1]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *
import textwrap
import nltk
import matplotlib.pyplot as plt
import numpy as np
# WordNet lemmatizer instance (not used by the cells shown in this part of the notebook).
lemmatizer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords
# NLTK's English stop-word list, extended with "amp"
# (presumably residue of the HTML entity "&amp;" in the post text -- TODO confirm).
STOP = stopwords.words('english')
STOP.append("amp")

# Path to the MALLET executable and to the input CSV files.
MALLET_PATH = "~/mallet/bin/mallet"
USER_POS_PATH = "../data/user_positive.csv"
USER_SQL_PATH = "../data/user_sqlite.csv"
NAR_POS_PATH = "../../narrative_detection/narrative_posts_by_trained_classification.csv"
# Topic counts (k) for which a model is trained and summarised.
num_top = [5,10,15,20]
# Directory holding the model output files. File names are appended by plain string
# concatenation below, so the trailing "/" is required.
output_directory_path = "../data/output/pos-output/"

## Data Preprocessing

In [2]:
# Positive narrative dataset grouped by user ----
positive_nar = pd.read_csv(NAR_POS_PATH)
positive_users = pd.read_csv(USER_POS_PATH)

# The list-valued columns are stored in the CSV as Python literals; parse them back into lists.
for list_col in ('selftext', 'created_utc'):
    positive_users[list_col] = positive_users[list_col].apply(ast.literal_eval)
# All of a user's posts joined into a single space-separated string.
positive_users['merged'] = positive_users['selftext'].apply(' '.join)
print("Number of users with a narrative positive post:", len(positive_users))

# Keep users whose number of timestamps is in [2, 50).
posts_per_user = positive_users['created_utc'].apply(len)
in_range = (posts_per_user >= 2) & (posts_per_user < 50)
positive_users = positive_users[in_range].reset_index(drop=True)
print("Number of users with at least 2 narrative positive posts:", len(positive_users))

Number of users with a narrative positive post: 5161
Number of users with at least 2 narrative positive posts: 901


In [3]:
# Generating training data and per-user groupings ----
# training_data: one processed string per post; posts that are empty (or only
#                whitespace) after lmw.process_string are skipped.
# groupings:     one list per row of positive_users (same order) holding the
#                positions of that user's posts inside training_data.
#
# Both are built in a single pass so the positions stay aligned. Previously the
# empty documents were removed from training_data but still counted when the
# groupings were generated, so every index after a dropped post pointed at the
# wrong document (or past the end of the list).
training_data = []
groupings = []
for user_posts in positive_users['selftext']:
    user_doc_ids = []
    for raw_post in user_posts:
        processed = lmw.process_string(raw_post)
        if not processed.strip():
            # Dropped document: it gets no slot in training_data and no index here.
            continue
        user_doc_ids.append(len(training_data))
        training_data.append(processed)
    # A user keeps an entry even if all of their posts were dropped, so that
    # groupings stays row-aligned with positive_users.
    groupings.append(user_doc_ids)

## Topic Modeling
- Training data:
    + All posts in the database from users who posted at least twice and fewer than 50 times.
    + Each document represents a single post.
    

### Training 
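- Do not run the cells in this section if you only want to run the analyses; the analysis cells load the saved model output from `output_directory_path`.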

In [None]:
# Training ----
# Train one model per candidate topic count listed in num_top.
for n_topics in num_top:
    lmw_training(n_topics, output_directory_path, training_data)

In [None]:
# Examining all topics (top_view.txt) ----
# For every topic count: a header line, then one "<index>\t<first 10 key words>"
# line per topic, then a blank separator line.
summary_path = output_directory_path + "top_view.txt"
with open(summary_path, 'w') as summary:
    for k in num_top:
        topic_keys = lmw.load_topic_keys(output_directory_path + "mallet.topic_keys." + str(k))
        topic_lines = [
            str(idx) + '\t' + ' '.join(words[:10]) + "\n"
            for idx, words in enumerate(topic_keys)
        ]
        summary.write("Topics for k=" + str(k) + "\n")
        summary.writelines(topic_lines)
        summary.write('\n')

### Topic Analysis

Following Akoury 2020, we examine common local topic transitions between entries written by the same user across time. We compute the transition probability from topic A to topic B by counting how many times A and B are the most probable topics for two consecutive entries, respectively, and normalizing by the total number of occurrences of topic A.

- Table in the paper: Topics with the highest relative importance, which illustrate the diversity of the dataset.

In [4]:
num_topics = 10             # Change this to view different output files 

# Viewing topics ----
# Each topic is labelled with its first ten key words joined by spaces.
topic_keys = lmw.load_topic_keys(output_directory_path + "mallet.topic_keys." + str(num_topics))
topic_label = [' '.join(words[:10]) for words in topic_keys]
topic_label

['doctor take heart NUM blood taking said laxatives hospital drink',
 'recovery body still restriction hair physical long extreme able anyone',
 'food NUM eat ate foods one sugar chocolate cream today',
 'know like want really feel eating get think even also',
 'life never would people enough love every xNUMb body world',
 'said told people like one didn mom look always never',
 'eat NUM eating day food calories days binge hungry meal',
 'like feel want even know going time fucking get day',
 'NUM work last treatment time get year back years got',
 'weight NUM body gain lose lost look back gained fat']

In [14]:
topic_distributions = lmw.load_topic_distributions(output_directory_path + "mallet.topic_distributions." + str(num_topics))

# Per-document view ----
# topics[d]    : topic distribution of document d
# most_prob[d] : index of the largest entry of that distribution (first one on ties)
topics = list(topic_distributions)
most_prob = [dist.index(max(dist)) for dist in topics]

# Per-user view ----
# Same two quantities, regrouped so that entry u lists the documents of user u
# in the order given by groupings[u].
most_prob_grouped = [[most_prob[d] for d in doc_ids] for doc_ids in groupings]
prob_grouped = [[topics[d] for d in doc_ids] for doc_ids in groupings]

positive_users['Topic Distribution'] = prob_grouped

In [26]:
# Local topic transition between entries within the same user ----
# For every pair of consecutive entries (A, B) of a user, count the transition
# A -> B, and count how often A is the source of a transition. Dividing the
# former by the latter later gives the per-user transition probability.

# Count the transitions for each user ----
# transition_count: list of dictionaries {topic A: {topic B: number of A -> B transitions}}
# topic_count:      list of dictionaries {topic A: number of transitions leaving A}
transition_count, topic_count = [], []
for user_topics in most_prob_grouped:
    pair_counts, source_counts = {}, {}
    for src, dst in zip(user_topics, user_topics[1:]):
        row = pair_counts.setdefault(src, {})
        row[dst] = row.get(dst, 0) + 1
        source_counts[src] = source_counts.get(src, 0) + 1
    transition_count.append(pair_counts)
    topic_count.append(source_counts)

[{2: {2: 2}},
 {3: {2: 1}},
 {3: {3: 1, 7: 1}},
 {9: {3: 1}},
 {6: {6: 1}},
 {6: {6: 1}},
 {7: {7: 1}},
 {7: {6: 1}},
 {1: {8: 1}},
 {9: {6: 1}, 6: {3: 1}},
 {3: {3: 3, 7: 1}},
 {4: {6: 1}},
 {3: {9: 1}},
 {2: {7: 1}},
 {6: {7: 1}},
 {3: {3: 1}},
 {3: {3: 1}},
 {3: {7: 1}, 7: {2: 1}},
 {3: {7: 1}, 7: {7: 1, 3: 1}},
 {3: {3: 2}},
 {6: {3: 1}},
 {3: {7: 1, 3: 2}, 7: {7: 1, 1: 1}, 1: {3: 1}},
 {9: {3: 1}},
 {6: {2: 1}},
 {7: {3: 1}, 3: {6: 1}, 6: {9: 1}},
 {7: {9: 1}},
 {3: {9: 1, 5: 1}, 9: {3: 1}},
 {7: {7: 1}},
 {3: {8: 1}},
 {3: {4: 1}},
 {3: {3: 1}},
 {3: {8: 1}},
 {7: {3: 1, 7: 1}, 3: {7: 1}},
 {7: {3: 1}},
 {6: {5: 1}},
 {3: {9: 1, 3: 2, 2: 1, 7: 2},
  9: {9: 1, 7: 1},
  7: {3: 2, 6: 1},
  2: {8: 1},
  8: {8: 1, 3: 1}},
 {9: {7: 1}},
 {2: {7: 3, 3: 1}, 7: {2: 2, 7: 2}, 3: {2: 1}},
 {7: {3: 1}, 3: {3: 2, 5: 1}},
 {7: {0: 1}, 0: {3: 1}, 3: {5: 1}},
 {3: {0: 1}, 0: {0: 1}},
 {7: {0: 1}, 0: {0: 1}},
 {6: {3: 1}, 3: {3: 2}},
 {3: {3: 1}},
 {3: {7: 1}, 7: {6: 1, 4: 1}, 6: {7: 1}, 4: {4: 1

In [39]:
# Normalize the transition probability ----
# Divide every A -> B count by the number of transitions leaving A, user by user.
transition_prob_norm = [
    {
        src: {dst: hits / sources[src] for dst, hits in row.items()}
        for src, row in pairs.items()
        if row
    }
    for pairs, sources in zip(transition_count, topic_count)
]

[{2: {2: 1.0}},
 {3: {2: 1.0}},
 {3: {3: 0.5, 7: 0.5}},
 {9: {3: 1.0}},
 {6: {6: 1.0}},
 {6: {6: 1.0}},
 {7: {7: 1.0}},
 {7: {6: 1.0}},
 {1: {8: 1.0}},
 {9: {6: 1.0}, 6: {3: 1.0}},
 {3: {3: 0.75, 7: 0.25}},
 {4: {6: 1.0}},
 {3: {9: 1.0}},
 {2: {7: 1.0}},
 {6: {7: 1.0}},
 {3: {3: 1.0}},
 {3: {3: 1.0}},
 {3: {7: 1.0}, 7: {2: 1.0}},
 {3: {7: 1.0}, 7: {7: 0.5, 3: 0.5}},
 {3: {3: 1.0}},
 {6: {3: 1.0}},
 {3: {7: 0.3333333333333333, 3: 0.6666666666666666},
  7: {7: 0.5, 1: 0.5},
  1: {3: 1.0}},
 {9: {3: 1.0}},
 {6: {2: 1.0}},
 {7: {3: 1.0}, 3: {6: 1.0}, 6: {9: 1.0}},
 {7: {9: 1.0}},
 {3: {9: 0.5, 5: 0.5}, 9: {3: 1.0}},
 {7: {7: 1.0}},
 {3: {8: 1.0}},
 {3: {4: 1.0}},
 {3: {3: 1.0}},
 {3: {8: 1.0}},
 {7: {3: 0.5, 7: 0.5}, 3: {7: 1.0}},
 {7: {3: 1.0}},
 {6: {5: 1.0}},
 {3: {9: 0.16666666666666666,
   3: 0.3333333333333333,
   2: 0.16666666666666666,
   7: 0.3333333333333333},
  9: {9: 0.5, 7: 0.5},
  7: {3: 0.6666666666666666, 6: 0.3333333333333333},
  2: {8: 1.0},
  8: {8: 0.5, 3: 0.5}},
 {9: {

In [59]:
# Get the most probable transition for each user----
# For every source topic keep the target topic(s) with the highest probability;
# targets that tie for the highest value are all kept, in dictionary order.
max_top = []
for user_probs in transition_prob_norm:
    best_targets = {}
    for src, row in user_probs.items():
        peak = max([0, *row.values()])
        best_targets[src] = [dst for dst, prob in row.items() if prob == peak]
    max_top.append(best_targets)


# reformat max_top so that it is a list of lists of tuples ----
# all_users[u] holds the (source, target) pairs of user u;
# all_users_expanded is the same pairs flattened across users.
all_users = [
    [(src, dst) for src, targets in best.items() for dst in targets]
    for best in max_top
]
all_users_expanded = [pair for pairs in all_users for pair in pairs]

In [60]:
# Get the most frequent transition across all users ----
# max_top_freq: {topic A: {topic B: number of (A, B) pairs in all_users_expanded}}
max_top_freq = {}
for src, dst in all_users_expanded:
    targets = max_top_freq.setdefault(src, {})
    targets[dst] = targets.get(dst, 0) + 1

# Get the value with the highest count for each topic ----
# Topic     Most Frequent Transition
for src, targets in max_top_freq.items():
    print(src, max(targets, key=targets.get))


2 3
3 3
9 3
6 3
7 3
1 1
4 3
8 3
0 3
5 3


In [None]:
# Antoniak (2019)'s time series analysis ----
# Documents are identified by their position in training_data / topic_distributions,
# stored as a string (as before). The position is taken directly from the ranking
# rather than looked up with training_data.index(d): that lookup returns the FIRST
# document with the same text, so posts with identical processed text were all given
# one id, and it rescanned the whole corpus for every document of every topic.
TOP_DOCS_PER_TOPIC = 10
# Only documents that have both a text and a distribution are ranked.
n_docs = min(len(training_data), len(topic_distributions))

# For each topic, get the ranking of all the documents
# (ids of the TOP_DOCS_PER_TOPIC documents with the highest probability for the topic;
# documents with equal probability keep their corpus order).
ranking_doc = {}
for i in range(num_topics):
    by_prob = sorted(range(n_docs), key=lambda j: topic_distributions[j][i], reverse=True)
    ranking_doc[topic_label[i]] = [str(j) for j in by_prob[:TOP_DOCS_PER_TOPIC]]

# Get the position of each document in the ranking
# ranking_pos: {doc id: [(topic label, rank within that topic's top list), ...]}
ranking_pos = {}
for topic, ranked_ids in ranking_doc.items():
    for position, doc in enumerate(ranked_ids):
        ranking_pos.setdefault(doc, []).append((topic, position))

# Best rank first within each document, then documents in numeric id order.
ranking_pos = {k: sorted(v, key=lambda x: x[1]) for k, v in ranking_pos.items()}
ranking_pos = dict(sorted(ranking_pos.items(), key=lambda item: int(item[0])))

# For each topic, get its probability of appearing in each document
# ranking_topic_prob: {topic label: [(doc id, probability), ...]} in document order.
ranking_topic_prob = {
    topic_label[i]: [(str(j), topic_distributions[j][i]) for j in range(n_docs)]
    for i in range(num_topics)
}