In [1]:
from datetime import datetime
import json
import os
import re

from bertopic import BERTopic
from bertopic.backend._utils import select_backend
import ftlangdetect
import gensim
import numpy as np
import pandas as pd
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess(raw_text):
    """Normalize one raw comment string before it is fed to the topic model.

    The input is converted to unicode (decoding as UTF-8 with
    ``errors='ignore'``), lower-cased and passed through gensim's ``deaccent``.
    Afterwards every run matching ``@[^ ]+`` is replaced by ``@user`` and every
    run matching ``http[^ ]+`` is replaced by ``http``.

    Args:
        raw_text: the comment text to normalize.

    Returns:
        The normalized string.
    """
    normalized = gensim.utils.deaccent(
        gensim.utils.to_unicode(raw_text, 'utf8', errors='ignore').lower()
    )
    # Mentions are masked before links, in that order.
    for pattern, placeholder in (('@[^ ]+', '@user'), ('http[^ ]+', 'http')):
        normalized = re.sub(pattern, placeholder, normalized)
    return normalized

def check_english(text):
    """Return whether ftlangdetect classifies ``text`` as English.

    Args:
        text: the string handed to ``ftlangdetect.detect``.

    Returns:
        True when the detected ``'lang'`` equals ``'en'``; False otherwise,
        including when the detector reports 'No features in text.'.

    Raises:
        Exception: for any other failure. The original error is chained as
            ``__cause__`` and its message is appended, so the real reason is
            not hidden behind a bare 'Unknown error'.
    """
    try:
        result = ftlangdetect.detect(text)
        return result['lang'] == 'en'
    except Exception as e:
        # The detector signals "nothing to classify" only through this message.
        if str(e) == 'No features in text.':
            return False
        raise Exception(f'Unknown error: {e}') from e

def check_for_repeating_tokens(tokens, max_repeat_ratio=4):
    """Report whether a token sequence consists mostly of repeated tokens.

    Args:
        tokens: a sized collection of hashable tokens.
        max_repeat_ratio: threshold for ``len(tokens) / len(set(tokens))``;
            defaults to 4.

    Returns:
        True when the ratio of total tokens to distinct tokens is strictly
        greater than ``max_repeat_ratio``; False otherwise, including for
        empty input.
    """
    num_tokens = len(tokens)
    if num_tokens == 0:
        # Nothing can repeat in an empty sequence; this also avoids the
        # division by zero below.
        return False
    num_distinct_tokens = len(set(tokens))
    return (num_tokens / num_distinct_tokens) > max_repeat_ratio



def process_comment(comment):
    """Flatten one raw comment record into a tuple of fields.

    ``comment['user']`` may be a plain string (used as both author id and
    author name) or a dict. For a dict, ``unique_id``/``uid`` are preferred,
    then ``uniqueId``/``id``; if neither name key is present the author name
    is empty and ``uid`` supplies the id.

    Args:
        comment: mapping with the keys ``user``, ``text``, ``cid``,
            ``create_time`` and ``aweme_id``.

    Returns:
        ``(cid, created datetime, author_name, author_id, text, aweme_id)``,
        where the datetime comes from ``datetime.fromtimestamp``.

    Raises:
        Exception: when ``comment['user']`` is neither a str nor a dict.
    """
    user = comment['user']
    if not isinstance(user, (str, dict)):
        raise Exception()

    if isinstance(user, str):
        author_id = author_name = user
    elif 'unique_id' in user:
        author_id, author_name = user['uid'], user['unique_id']
    elif 'uniqueId' in user:
        author_id, author_name = user['id'], user['uniqueId']
    else:
        author_name, author_id = '', user['uid']

    text = comment['text']
    comment_id = comment['cid']
    created_at = datetime.fromtimestamp(comment['create_time'])
    video_id = comment['aweme_id']
    return (comment_id, created_at, author_name, author_id, text, video_id)

def load_comments_df(max_comments=500000):
    """Build the cleaned, English-only comment DataFrame from the raw JSON dumps.

    Each entry of ``<data>/comments`` is treated as a directory that may hold a
    ``video_comments.json`` file; entries without that file are skipped. Every
    comment is flattened with ``process_comment``, rows with missing, empty,
    ``'Nan'`` or whitespace-only text are dropped, non-English rows (per
    ``check_english``) are dropped, and the remaining text is normalized with
    ``preprocess``.

    Args:
        max_comments: maximum number of rows to return (default 500000).
            ``None`` returns every row.

    Returns:
        DataFrame with the columns ``comment_id``, ``createtime``,
        ``author_name``, ``author_id``, ``text``, ``video_id``,
        ``text_no_newlines``, ``english`` and ``text_processed``.
    """
    try:
        this_dir_path = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # Notebook kernels do not define __file__; fall back to the working
        # directory, as the top-level cells of this notebook do.
        this_dir_path = os.path.abspath('')
    data_dir_path = os.path.join(this_dir_path, '..', '..', 'data')

    comment_dir_path = os.path.join(data_dir_path, 'comments')

    comments_data = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # truncation at the end of this function reproducible between runs.
    for file_name in tqdm.tqdm(sorted(os.listdir(comment_dir_path))):
        file_path = os.path.join(comment_dir_path, file_name, 'video_comments.json')

        if not os.path.exists(file_path):
            continue

        # Pin the encoding instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            comments = json.load(f)

        comments_data.extend(process_comment(comment) for comment in comments)

    comment_df = pd.DataFrame(comments_data, columns=['comment_id', 'createtime', 'author_name', 'author_id', 'text', 'video_id'])

    comment_df = comment_df[comment_df['text'].notna()]
    comment_df = comment_df[comment_df['text'] != '']
    # .copy() so the column added below lands on an independent frame rather
    # than on a slice of the unfiltered one (avoids SettingWithCopyWarning).
    comment_df = comment_df[comment_df['text'] != 'Nan'].copy()

    comment_df['text_no_newlines'] = comment_df['text'].str.replace(r'\n',  ' ', regex=True)
    # Raw string so that \s is not an invalid string escape. The character
    # class also contains an unusual invisible character seen in the data.
    regex_whitespace = r'^[\s ︎]+$'
    comment_df = comment_df[~comment_df['text_no_newlines'].str.fullmatch(regex_whitespace)].copy()

    # get only english comments
    comment_df['english'] = comment_df['text_no_newlines'].apply(check_english)
    english_comments_df = comment_df[comment_df['english']].copy()

    # normalize the text (lower-casing, accent removal, mention/link masking)
    english_comments_df['text_processed'] = english_comments_df['text_no_newlines'].apply(preprocess)

    english_comments_df = english_comments_df[english_comments_df['text_processed'].notna()]
    english_comments_df = english_comments_df[english_comments_df['text_processed'] != '']

    # keep only the first max_comments rows (half a million by default)
    return english_comments_df.iloc[:max_comments]

In [5]:
# Resolve the data directory relative to the notebook's working directory.
this_dir_path = os.path.abspath('')
data_dir_path = os.path.join(this_dir_path, '..', '..', 'data')

df_path = os.path.join(data_dir_path, 'cache', 'half_mil_english_comments.csv')
# Neither DataFrame.to_csv nor np.save creates missing parent directories.
os.makedirs(os.path.dirname(df_path), exist_ok=True)
if not os.path.exists(df_path):
    final_comments_df = load_comments_df()
    final_comments_df.to_csv(df_path)

# The frame is re-read from the CSV whether or not it was just written.
# keep_default_na=False stops pandas from turning comments whose entire text is
# e.g. "nan", "null" or "n/a" into float NaN, which would otherwise put
# non-string entries into the document lists below.
final_comments_df = pd.read_csv(df_path, keep_default_na=False)

eng_raw_docs = list(final_comments_df['text_no_newlines'].values)
docs = list(final_comments_df['text_processed'].values)
timestamps = list(final_comments_df['createtime'].values)

# Train the model on the corpus.
pretrained_model = 'cardiffnlp/twitter-roberta-base'

# NOTE(review): these seed topics are defined but not used, because BERTopic is
# constructed with seed_topic_list=None below. Confirm whether that is intended.
seed_topic_list = [
    ['zelensky', 'slava', 'ukraine', 'hero'],
    ['china', 'nato', 'biden', 'trump', 'macron', 'boris'],
    ['ura', 'uraa', 'uraaa', 'uraaah', 'putin'],
    ['hilarious', 'love', 'tiktok', 'haha']
]

topic_model = BERTopic(seed_topic_list=None, embedding_model=pretrained_model, nr_topics=100)

# Compute the document embeddings once and cache them on disk, so later runs
# can skip the embedding step.
embeddings_cache_path = os.path.join(data_dir_path, 'cache', 'english_comment_twitter_roberta_embeddings.npy')
if os.path.exists(embeddings_cache_path):
    with open(embeddings_cache_path, 'rb') as f:
        embeddings = np.load(f)
else:
    topic_model.embedding_model = select_backend(pretrained_model,
                                    language=topic_model.language)
    embeddings = topic_model._extract_embeddings(docs,
                                                method="document",
                                                verbose=topic_model.verbose)
    with open(embeddings_cache_path, 'wb') as f:
        np.save(f, embeddings)

# A stale embeddings cache (built from a different comment CSV) would silently
# pair documents with the wrong vectors; fail early instead.
assert len(embeddings) == len(docs), (
    f'{len(embeddings)} cached embeddings for {len(docs)} documents; '
    f'delete {embeddings_cache_path} to recompute'
)

topics, probs = topic_model.fit_transform(docs, embeddings)

No sentence-transformers model found with name /home/ndg/users/bsteel2/.cache/torch/sentence_transformers/cardiffnlp_twitter-roberta-base. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/ndg/users/bsteel2/.cache/torch/sentence_transformers/cardiffnlp_twitter-roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
9331.63s - pydevd:

In [None]:
outputs_dir_path = os.path.join(data_dir_path, 'outputs')
# Neither DataFrame.to_csv nor open() creates a missing directory.
os.makedirs(outputs_dir_path, exist_ok=True)

# Per-topic summary table from the fitted model.
topic_df = topic_model.get_topic_info()
topic_df.to_csv(os.path.join(outputs_dir_path, 'topics.csv'))

# Text rendering of the topic hierarchy.
hierarchical_topics = topic_model.hierarchical_topics(docs)
tree = topic_model.get_topic_tree(hierarchical_topics)
# The tree text can contain non-ASCII characters (tree glyphs, topic words),
# so pin the encoding rather than relying on the platform default.
with open(os.path.join(outputs_dir_path, 'cluster_tree.txt'), 'w', encoding='utf-8') as f:
    f.write(tree)

In [None]:
# Topic representations over the comment timestamps, using 50 time bins.
topics_over_time_df = topic_model.topics_over_time(
    docs,
    timestamps,
    nr_bins=50,
)
