In [None]:
import little_mallet_wrapper as lmw
import pandas as pd 
import ast 
from lmw import *

# Path to the MALLET executable.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
MALLET_PATH = "~/mallet/bin/mallet"
# CSV read into `positive_users`; several of its columns are parsed with ast.literal_eval after loading.
POSITIVE_USER_PATH = "../data/positive_user.csv"
# CSV read into `positive_df`; its 'selftext' column is the corpus for the general topic model.
POSITIVE_NAR_PATH = "../data/narrative_positive.csv"

## Data Preprocessing

In [None]:
# Load the per-user table, then turn the columns that were serialized as Python
# literal strings back into Python objects (presumably one list per user —
# later cells call len() on and iterate over 'selftext' entries).
positive_users = pd.read_csv(POSITIVE_USER_PATH)
for literal_column in ('selftext', 'created_utc', 'link_flair_text', 'title'):
    parsed = positive_users[literal_column].apply(ast.literal_eval)
    positive_users[literal_column] = parsed
positive_users.head(5)

In [None]:
# Keep only the users whose 'selftext' entry holds more than one post.
has_multiple_posts = positive_users['selftext'].apply(lambda posts: len(posts) > 1)
multiple_df = positive_users.loc[has_multiple_posts].reset_index(drop=True)
multiple_df

In [None]:
# Load the narrative-positive posts; the 'selftext' column is used later as the
# corpus for the general topic model.
positive_df = pd.read_csv(POSITIVE_NAR_PATH)

In [None]:
# Distribution of post length in words, over every post of every user.
# Split on whitespace so each entry really is a word count; the previous
# split(".") counted period-delimited fragments instead, which did not match
# the variable name or the 0-500 histogram range.
word_count = pd.Series(
    [len(post.split()) for posts in positive_users['selftext'] for post in posts]
)
ax = word_count.hist(bins=50, range=[0, 500])
ax.set_xlabel("words per post")
ax.set_ylabel("number of posts")
word_count.describe()

## Topic Modeling (temporal)
- Training data:
    + Selected narrative-positive posts from users who post at least twice. 
    + Each document represents a post. 

In [None]:
# Flatten every multi-post user's list of posts into one list (one post per
# document), preprocess each post, and drop posts that end up blank.
# NOTE: dropping blank documents shifts the indices of the documents after them
# relative to `text`.
text = [post for user_posts in multiple_df['selftext'] for post in user_posts]
training_data = [doc for doc in map(lmw.process_string, text) if doc.strip()]

# Show the corpus size and a short preview instead of dumping every document
# into the cell output.
print(len(training_data), "documents")
training_data[:5]

In [None]:
# Settings for the temporal model. Later cells rebuild the MALLET output file
# names ("mallet.topic_keys.<k>", "mallet.topic_distributions.<k>") from these
# two values.
num_topics = 5
output_directory_path = "../data/output/temporal-output/"
# lmw_training is not defined in the visible cells — presumably it comes from
# the `from lmw import *` import and trains the topic model; TODO confirm.
lmw_training(num_topics, output_directory_path, training_data)

In [None]:
# Load the key words of every topic, print the first ten per topic, and use
# each topic's first key word as its label.
topic_keys_path = f"{output_directory_path}mallet.topic_keys.{num_topics}"
topic_keys = lmw.load_topic_keys(topic_keys_path)

topic_label = []
for topic_index, key_words in enumerate(topic_keys):
    preview = ' '.join(key_words[:10])
    print(topic_index, '\t', preview)
    topic_label.append(key_words[0])
topic_label

In [None]:
# For each topic, rank the documents by that topic's probability and keep the
# indices (as strings) of the top-ranked ones whose probability is above a floor.
top_n = 15             # how many top documents to request per topic
min_probability = 0.1  # documents at or below this topic probability are dropped

topic_distributions = lmw.load_topic_distributions(output_directory_path + "mallet.topic_distributions." + str(num_topics))

# Rank document *positions* instead of document texts. Looking a text up again
# with training_data.index(d) is a linear scan per hit and always returns the
# first occurrence, so identical processed posts collapsed onto a single index.
# This assumes get_top_docs only pairs its first argument with the
# distributions and sorts the pairs; exact probability ties are then ordered by
# index rather than by text.
doc_indices = list(range(len(training_data)))

ranking_doc = {}
for i in range(num_topics):
    top_docs = lmw.get_top_docs(doc_indices, topic_distributions, topic_index=i, n=top_n)
    # NOTE(review): topics sharing the same label overwrite each other here —
    # confirm the labels in topic_label are unique.
    ranking_doc[topic_label[i]] = [str(doc_index) for p, doc_index in top_docs if p > min_probability]
ranking_doc

In [None]:
# Get the position of each document in the ranking.
# Maps document id -> list of (topic, position within that topic's ranking).
# enumerate() supplies the position directly; the earlier list.index() lookup
# rescanned the list for every entry and reported the first position for an id
# that appeared more than once.
positions_by_doc = {}
for topic, ranked_docs in ranking_doc.items():
    for position, doc in enumerate(ranked_docs):
        positions_by_doc.setdefault(doc, []).append((topic, position))

# Order the documents by numeric id and, for each document, order its topics by
# position so that the first entry is the topic where it ranks highest.
ranking_pos = {
    doc: sorted(entries, key=lambda entry: entry[1])
    for doc, entries in sorted(positions_by_doc.items(), key=lambda item: int(item[0]))
}

In [None]:
# Print, for every ranked document, the topic in which it ranks highest.
# Documents (0,1), (2,3), (4,5,6), (7,8), (9,10), (11,12), (13,14) belong to the same user.
# (7,8) is about filming a documentary.
for doc_id, ranked_topics in ranking_pos.items():
    print(doc_id, ranked_topics[0])

The results are not very interesting because most users post about the same topic over time. This may be because the number of topics is small; however, k > 5 returns topics of lower quality.

## Topic Modeling (General)
- Training data:
    + Selected narrative-positive posts. 
    + Each document represents a post. 

In [None]:
# Preprocess every narrative-positive post (one post per document) and drop
# posts that end up blank.
processed_posts = map(lmw.process_string, positive_df['selftext'].tolist())
training_data = [doc for doc in processed_posts if doc.strip()]

# Show the corpus size and a short preview instead of dumping every document
# into the cell output.
print(len(training_data), "documents")
training_data[:5]

In [None]:
# Settings for the general model. These reassign the names used by the temporal
# section; later cells rebuild the MALLET output file names
# ("mallet.topic_keys.<k>", "mallet.topic_distributions.<k>") from them.
num_topics = 15
output_directory_path = "../data/output/general-output/"
# lmw_training is not defined in the visible cells — presumably it comes from
# the `from lmw import *` import and trains the topic model; TODO confirm.
lmw_training(num_topics, output_directory_path, training_data)

In [None]:
# Load the key words of every topic, print the first ten per topic, and use
# each topic's first key word as its label.
topic_keys_path = f"{output_directory_path}mallet.topic_keys.{num_topics}"
topic_keys = lmw.load_topic_keys(topic_keys_path)

topic_label = []
for topic_index, key_words in enumerate(topic_keys):
    preview = ' '.join(key_words[:10])
    print(topic_index, '\t', preview)
    topic_label.append(key_words[0])
topic_label

In [None]:
# For each topic, rank the documents by that topic's probability and keep the
# indices (as strings) of the top-ranked ones whose probability is above a floor.
top_n = 15             # how many top documents to request per topic
min_probability = 0.1  # documents at or below this topic probability are dropped

topic_distributions = lmw.load_topic_distributions(output_directory_path + "mallet.topic_distributions." + str(num_topics))

# Rank document *positions* instead of document texts. Looking a text up again
# with training_data.index(d) is a linear scan per hit and always returns the
# first occurrence, so identical processed posts collapsed onto a single index.
# This assumes get_top_docs only pairs its first argument with the
# distributions and sorts the pairs; exact probability ties are then ordered by
# index rather than by text.
doc_indices = list(range(len(training_data)))

ranking_doc = {}
for i in range(num_topics):
    top_docs = lmw.get_top_docs(doc_indices, topic_distributions, topic_index=i, n=top_n)
    # NOTE(review): topics sharing the same label overwrite each other here —
    # confirm the labels in topic_label are unique.
    ranking_doc[topic_label[i]] = [str(doc_index) for p, doc_index in top_docs if p > min_probability]
ranking_doc

In [None]:
# Get the position of each document in the ranking.
# Maps document id -> list of (topic, position within that topic's ranking).
# enumerate() supplies the position directly; the earlier list.index() lookup
# rescanned the list for every entry and reported the first position for an id
# that appeared more than once.
positions_by_doc = {}
for topic, ranked_docs in ranking_doc.items():
    for position, doc in enumerate(ranked_docs):
        positions_by_doc.setdefault(doc, []).append((topic, position))

# Order the documents by numeric id and, for each document, order its topics by
# position so that the first entry is the topic where it ranks highest.
ranking_pos = {
    doc: sorted(entries, key=lambda entry: entry[1])
    for doc, entries in sorted(positions_by_doc.items(), key=lambda item: int(item[0]))
}

In [None]:
# Print, for every ranked document, the topic in which it ranks highest.
for doc_id, ranked_topics in ranking_pos.items():
    print(doc_id, ranked_topics[0])