In [1]:
from proj_utils import *

import os
import pickle
from os.path import join
import gc

from copy import deepcopy
from collections import defaultdict, OrderedDict, Counter
from itertools import combinations, accumulate
from datetime import datetime
from dateutil.relativedelta import relativedelta
import ipywidgets as widgets
from IPython.display import display

import numpy as np
import pandas as pd
from tqdm import tqdm

import sklearn as sk
from scipy.optimize import linear_sum_assignment
from scipy.optimize import curve_fit
from sklearn.metrics.pairwise import cosine_similarity

import torch
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

import bertopic
from sentence_transformers import SentenceTransformer

from proj_utils import *

  query = {'raw_message': {"$not": {"$regex": '.*https?:\/\/.*[\r\n]*'}}}     # store only comments that do not have urls


In [2]:
def date_label(start_date, end_date):
    """Return tick positions and year labels for every January in a monthly range.

    The range is sampled at month ends between ``start_date`` and ``end_date``.

    Args:
        start_date: Range start (anything ``pd.date_range`` accepts).
        end_date: Range end (inclusive, anything ``pd.date_range`` accepts).

    Returns:
        tuple[list[int], list[str]]: positions of the January entries within the
        monthly range, and the matching four-digit year strings.
    """
    # An offset object is used instead of the 'M' string alias, which newer
    # pandas versions deprecate; the generated month-end dates are the same.
    month_ends = pd.date_range(start_date, end_date, freq=pd.offsets.MonthEnd())
    january_entries = [(i, date) for i, date in enumerate(month_ends) if date.month == 1]
    jan_indices = [i for i, _ in january_entries]
    date_list = [date.strftime('%Y') for _, date in january_entries]
    return jan_indices, date_list

def power_law(alpha, xmax):
    """Build a discrete power-law distribution on the integers 1..xmax.

    Args:
        alpha: Exponent; the unnormalized weight of ``x`` is ``x ** (-alpha)``.
        xmax: Largest integer in the support.

    Returns:
        tuple: ``(pmf, dist)`` where ``pmf`` is the normalized probability array
        and ``dist`` is a ``scipy.stats.rv_discrete`` over ``range(1, xmax + 1)``.
    """
    # Imported here because no cell visible in this notebook imports `stats` by name.
    from scipy import stats

    x = np.arange(1, xmax+1, dtype='float')
    pmf = x**(-alpha)
    pmf /= pmf.sum()
    return pmf, stats.rv_discrete(values=(range(1, xmax+1), pmf))

def truncated_power_law(alpha, Lambda, xmax):
    """Build a discrete power law with exponential cutoff on the integers 1..xmax.

    Args:
        alpha: Power-law exponent.
        Lambda: Cutoff rate; the unnormalized weight of ``x`` is
            ``x ** (-alpha) * exp(-Lambda * x)``.
        xmax: Largest integer in the support.

    Returns:
        tuple: ``(pmf, dist)`` where ``pmf`` is the normalized probability array
        and ``dist`` is a ``scipy.stats.rv_discrete`` over ``range(1, xmax + 1)``.
    """
    # Imported here because no cell visible in this notebook imports `stats` by name.
    from scipy import stats

    x = np.arange(1, xmax+1, dtype='float')
    pmf = x**(-alpha) * np.exp(-Lambda * x)
    pmf /= pmf.sum()
    return pmf, stats.rv_discrete(values=(range(1, xmax+1), pmf))

def sorted_topic(rank, rd):
    """Return the first position in ``rd`` whose value equals ``rank``."""
    matching_positions = np.where(rd == rank)[0]
    first_match = matching_positions[0]
    return first_match

def get_ranking(array):
    """Return, for each element, its rank when sorted in descending order (0 = largest)."""
    descending_order = array.argsort()[::-1]
    rank_values = np.arange(len(array))
    ranking = np.empty_like(descending_order)
    # The element found at descending_order[k] receives rank k.
    ranking[descending_order] = rank_values
    return ranking

# articles_by_month clear (drop articles with too few comments or an outlier title topic)
def articles_by_month_clear(articles_by_month, comment_threshold=10, exclude_neg_one=True):
    """Filter each month's article frame by comment count and title topic.

    Two filters are applied in order:
      1. keep articles with strictly more than ``comment_threshold`` comments
         (comments labelled -1 are not counted when ``exclude_neg_one`` is True);
      2. keep articles whose first three title topics (column ``topic_num`` if
         present, else ``topic_num_global``) do not contain -1.

    Note: the columns ``n1_num`` and ``comment_num`` are added to the frames
    stored in ``articles_by_month`` themselves, not only to the returned ones.

    Args:
        articles_by_month: Mapping month -> DataFrame with a ``comment_topics``
            column holding a sequence of topic ids per article.
        comment_threshold: Exclusive lower bound on the comment count.
        exclude_neg_one: Whether -1 comments are excluded from the count.

    Returns:
        dict: month -> filtered DataFrame.
    """
    original_article_num = 0
    filtered_article_num_1 = 0
    filtered_article_num_2 = 0

    articles_by_month_new = {}

    for month, tmp_df in articles_by_month.items():
        original_article_num += len(tmp_df)
        # Count via numpy so both Python lists and numpy arrays are accepted.
        tmp_df['n1_num'] = tmp_df['comment_topics'].apply(lambda x: int(np.sum(np.asarray(x) == -1)))
        tmp_df['comment_num'] = tmp_df['comment_topics'].apply(len)
        if exclude_neg_one:
            tmp_df = tmp_df[tmp_df['comment_num']-tmp_df['n1_num'] > comment_threshold]
        else:
            tmp_df = tmp_df[tmp_df['comment_num'] > comment_threshold]

        filtered_article_num_1 += len(tmp_df)
        topic_column = 'topic_num' if 'topic_num' in tmp_df.columns else 'topic_num_global'
        tmp_df = tmp_df[tmp_df[topic_column].apply(lambda x: -1 not in x[:3])]

        filtered_article_num_2 += len(tmp_df)
        articles_by_month_new[month] = tmp_df

    # Guard the ratio so an empty input reports nan instead of raising ZeroDivisionError.
    filtered_ratio = filtered_article_num_2/original_article_num if original_article_num else float('nan')

    print(f'Original article number: {original_article_num}')
    print(f'Filtered article (after comment) number: {filtered_article_num_1}')
    print(f'Filtered article (after article) number: {filtered_article_num_2}')
    print(f'Filtered article ratio: {filtered_ratio:.6f}')
    return articles_by_month_new
    
# ==================================================================

from scipy.stats import kendalltau
from scipy.stats import spearmanr

def kendalltau_dist(x, y):
    """Map Kendall's tau between the argsort orders of ``x`` and ``y`` to [0, 1].

    Returns 0 when tau is 1 and 1 when tau is -1.
    """
    order_x = np.argsort(x)
    order_y = np.argsort(y)
    tau = kendalltau(order_x, order_y).statistic
    return (1 - tau) / 2

def autocorr(data, max_lag=30):
    """Pearson correlation of ``data`` with itself shifted by 0..max_lag steps.

    For each lag the two overlapping windows are flattened and correlated with
    ``np.corrcoef``; lag 0 is set to 1 without computing anything.

    Returns:
        np.ndarray of length ``max_lag + 1``.
    """
    n_steps = data.shape[0]
    result = np.zeros(max_lag + 1)
    result[:1] = 1  # lag 0 (slice assignment is a no-op when the array is empty)

    for lag in range(1, max_lag + 1):
        leading = data[:n_steps - lag].flatten()
        lagged = data[lag:n_steps].flatten()
        result[lag] = np.corrcoef(leading, lagged)[0, 1]

    return result

def crosscorr(data1, data2, max_lag=30):
    """Pearson correlation between ``data1`` and ``data2`` shifted forward by 0..max_lag.

    Both inputs must have the same length along axis 0. For each lag the first
    ``T - lag`` entries of ``data1`` are correlated with ``data2[lag:T]``.

    Returns:
        np.ndarray of length ``max_lag + 1``.
    """
    assert data1.shape[0] == data2.shape[0]  # Not considering the other case

    n_steps = data1.shape[0]
    lags = range(max_lag + 1)
    per_lag = [
        np.corrcoef(data1[:n_steps - lag].flatten(), data2[lag:n_steps].flatten())[0, 1]
        for lag in lags
    ]
    return np.array(per_lag, dtype=float)

# ==================================================================

# Load the per-collection comment topic frequencies.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
topic_freq_path = join('result', 'comment_topic_freq_dict.pkl')
with open(topic_freq_path, 'rb') as f:
    comment_topic_freq_dict = pickle.load(f)

# ranking_dict[name][r]       -> topic id holding rank r (most frequent first)
# ranking_index_dict[name][t] -> position of topic id t inside ranking_dict[name]
ranking_dict = {}
ranking_index_dict = {}
for collection_name in COLLECTION_NAMES + ['global']:
    topic_freq = comment_topic_freq_dict[collection_name][1:] # exclude -1
    descending = np.argsort(topic_freq)[::-1] # descending order
    ranking_dict[collection_name] = descending
    ranking_index_dict[collection_name] = [
        sorted_topic(i, descending) for i in range(NUM_TOPICS[collection_name])
    ]

color_list = ['#e84d8a', '#feb326', '#60bd68', '#64c5eb', '#7f58af', '#808080', '#000000']

# 1. **ARTICLES_BY_DAY**
* Caution: before running this section to create articles_by_day, run [transform] in 'fit-final-models.py' and time-analysis.py.

## 1A. articles_by_day from article_dict

In [None]:
# Load the fitted topic model of the current collection.
# `BERTopic` is reached through the imported `bertopic` module because the class
# itself is not imported by name in the import cell.
# NOTE(review): `collection_name` / `model_name` must already be set by another cell.
topic_model = bertopic.BERTopic.load(join('model', collection_name.lower(), model_name), embedding_model="all-MiniLM-L6-v2")

# open pickle files
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), MODEL_NAMES[collection_name], 'article_dict.pkl'), 'rb') as f:
    article_dict = pickle.load(f)

In [None]:
# from pymongo collection, query articles with given id
def get_articles_from_ids(collection, ids, select_columns):
    """Fetch the documents whose ``_id`` is in ``ids`` as a DataFrame.

    Only the fields named in ``select_columns`` are requested.
    """
    id_filter = {'_id': {'$in': ids}}
    wanted_fields = dict.fromkeys(select_columns, 1)
    return pd.DataFrame(collection.find(id_filter, wanted_fields))

# from pymongo collection, query articles with given date range
def get_articles_from_date(collection, start_date, end_date, select_columns):
    """Fetch documents with ``start_date <= createdAt < end_date`` as a DataFrame.

    Only the fields named in ``select_columns`` are requested.
    """
    date_filter = {'createdAt': {'$gte': start_date, '$lt': end_date}}
    wanted_fields = dict.fromkeys(select_columns, 1)
    return pd.DataFrame(collection.find(date_filter, wanted_fields))

# get comments with given article id
def get_comments_from_id(collection, id, select_columns):
    """Fetch the documents whose ``art_id`` equals ``id`` as a DataFrame.

    Only the fields named in ``select_columns`` are requested.
    """
    article_filter = {'art_id': id}
    wanted_fields = dict.fromkeys(select_columns, 1)
    return pd.DataFrame(collection.find(article_filter, wanted_fields))

In [None]:
def get_articles_by_day(collection_articles, collection_comments, start_date, end_date, select_columns):
    """Query articles one day at a time and embed their titles.

    For every day in ``[start_date, end_date)`` the articles created on that day
    are fetched; when any exist, their ``clean_title`` values are encoded with
    the "all-MiniLM-L6-v2" sentence model and stored in ``title_embeddings``.
    ``collection_comments`` is accepted but not used here.

    Returns:
        defaultdict: 'YYYY-MM-DD' -> DataFrame of that day's articles (possibly empty).
    """
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

    articles_by_day = defaultdict(list)
    current_day = start_date
    while current_day < end_date:
        following_day = current_day + relativedelta(days=1)
        day_articles = get_articles_from_date(collection_articles, current_day, following_day, select_columns)
        if len(day_articles) > 0:
            titles = day_articles['clean_title'].values.tolist()
            title_vectors = sentence_model.encode(titles, show_progress_bar=True, convert_to_tensor=True)
            day_articles['title_embeddings'] = title_vectors.tolist()
        articles_by_day[current_day.strftime("%Y-%m-%d")] = day_articles
        current_day = following_day
    return articles_by_day

In [None]:
# Imported explicitly: a star import skips underscore-prefixed names unless the
# module lists them in __all__.
from proj_utils import _init_mongo_collection

# Fields requested for every article document.
select_columns = ['_id', 'clean_title', 'createdAt']
# One client/collection pair each for the articles and the comments of the
# current collection (`collection_name` must already be set by another cell).
mongo_client_articles, collection_articles = _init_mongo_collection('Articles', collection_name)
mongo_client_comments, collection_comments = _init_mongo_collection('Comments', collection_name)

### 1A-1. Query article title info from mongoDB

In [None]:
# Query every day in [start_date, end_date) and embed the article titles.
articles_by_day = get_articles_by_day(collection_articles, collection_comments, start_date, end_date, select_columns)

# pop element from articles_by_day if there are no articles
# (iterate over a snapshot of the keys because entries are deleted on the way)
for day in tuple(articles_by_day):
    if len(articles_by_day[day]) == 0:
        del articles_by_day[day]

In [None]:
# Persist the day -> articles mapping next to the model artifacts of this collection.
with open(join('article', collection_name.lower(), model_name, 'articles_by_day.pkl'), 'wb') as f:
    pickle.dump(articles_by_day, f)

### 1A-2. The guardians of time (relocating time-traveling articles to their rightful positions)

In [None]:
# load articles_by_day from pickle file
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), model_name, 'articles_by_day.pkl'), 'rb') as f:
    articles_by_day = pickle.load(f)

In [None]:
# Relocate "time travelers": articles with at least one comment created before
# the article's own createdAt. Each one is removed from its day, given a new
# createdAt one day before its earliest comment, and stored under the day of
# that earliest comment. `pd.concat` is used because `DataFrame.append` no longer
# exists in pandas >= 2.0.

# Collects every relocated article as it looked before relocation.
time_traveler_df = pd.DataFrame(columns=articles_by_day[list(articles_by_day.keys())[0]].columns)
move_day_list = []

original_key_list = sorted(list(articles_by_day.keys()), key=lambda x: datetime.strptime(x, '%Y-%m-%d'))

# Indexing a day that does not exist (or was popped below) yields an empty frame.
articles_by_day.default_factory = pd.DataFrame

for article_day in original_key_list:
    print(article_day)
    articles = articles_by_day[article_day]
    article_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')

    # remove time travelers

    for article_id, article_createdAt in articles[['_id', 'createdAt']].values:
        if article_id in article_dict.keys():
            article = article_dict[article_id]

            # sort the comment timestamps, keeping the comment ids aligned
            sorted_index = np.argsort(article['createdAt'])
            ids = np.array(article['id'])[sorted_index]
            comment_createdAt_list = np.array(article['createdAt'])[sorted_index]
            article_dict[article_id]['article_createdAt'] = article_createdAt

            # indices of comments created earlier than the article itself
            # (a negative timedelta always has .days < 0, even for sub-day gaps)
            time_check = np.where(np.array([(comment_createdAt - article_createdAt).days for comment_createdAt in comment_createdAt_list]) < 0)[0]
            if len(time_check) > 0:
                print(article_day + ' ' + article_id)
                # explicit copy so the createdAt assignment below does not target a slice
                time_traveler = articles[articles['_id'] == article_id].copy()

                articles_by_day[article_day] = articles_by_day[article_day][articles_by_day[article_day]['_id'] != article_id] # pop time traveler from current day
                if articles_by_day[article_day].empty:
                    articles_by_day.pop(article_day)
                # recorded before createdAt is rewritten, so the original timestamp is kept here
                time_traveler_df = pd.concat([time_traveler_df, time_traveler])
                first_comment_date = comment_createdAt_list[0]
                move_day_list.append(first_comment_date)

                # change the createdAt of the article
                new_article_createdAt = first_comment_date - relativedelta(days=1)
                time_traveler['createdAt'] = new_article_createdAt
                article_dict[article_id]['article_createdAt'] = new_article_createdAt

                # append time traveler to new day
                new_article_day = first_comment_date.strftime('%Y-%m-%d')
                print(new_article_day, len(articles_by_day[new_article_day]), len(time_traveler))
                articles_by_day[new_article_day] = pd.concat([articles_by_day[new_article_day], time_traveler])
                print(len(articles_by_day[new_article_day]))

                # sort articles_by_day[new_article_day] by createdAt and reindex
                articles_by_day[new_article_day] = articles_by_day[new_article_day].sort_values(by='createdAt').reset_index(drop=True)

time_traveler_df['move_day'] = move_day_list

In [None]:
# load time_traveler_df.feather
# NOTE(review): this reads the file that the following save cell writes, so on a
# fresh run the save cell has to be executed first.
time_traveler_df = pd.read_feather(join('article', collection_name.lower(), model_name, 'time_traveler_df.feather'))
time_traveler_df

In [None]:
# save time_traveler_df (reset_index moves the index into a regular column before writing)
time_traveler_df.reset_index().to_feather(join('article', collection_name.lower(), model_name, 'time_traveler_df.feather'))
# save article_dict again
# NOTE(review): written under `model_name`, while the loading cells read it from
# MODEL_NAMES[collection_name]; these are the same path only if the two agree.
with open(join('article', collection_name.lower(), model_name, 'article_dict.pkl'), 'wb') as f:
    pickle.dump(article_dict, f)
# save articles_by_day to pickle file
with open(join('article', collection_name.lower(), model_name, 'articles_by_day.pkl'), 'wb') as f:
    pickle.dump(articles_by_day, f)

### 1A-3. Add title topic classification / comments data into articles_by_day

In [None]:
# load articles_by_day from pickle file
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), model_name, 'articles_by_day.pkl'), 'rb') as f:
    articles_by_day = pickle.load(f)

In [None]:
# Load the fitted topic model of the current collection. `BERTopic` is reached
# through the imported `bertopic` module because the class itself is not
# imported by name in the import cell.
topic_model = bertopic.BERTopic.load(join('model', collection_name.lower(), model_name), embedding_model="all-MiniLM-L6-v2")

In [None]:
# open pickle files
# article_dict is indexed by article id in the cells that consume it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), MODEL_NAMES[collection_name], 'article_dict.pkl'), 'rb') as f:
    article_dict = pickle.load(f)

In [None]:
def load_embeddings(date):
    """Load the saved sentence embeddings of one month onto the CPU.

    ``date`` is the month key used in the file name; the file name also
    contains ``next_month(date)``. Reads the module-level ``collection_name``.

    Returns:
        The object stored in the ``.pt`` file; its ``'embeddings'`` entry is
        moved to the CPU if it still reports a CUDA device.
    """
    print(date)
    following_month = next_month(date)
    embedding_root = '/data/comments/valentin/sentence-embeddings/'
    embedding_file = f'{collection_name.lower()}/bert-emb-{date}-{following_month}.pt'
    embedding = torch.load(embedding_root + embedding_file, map_location=torch.device('cpu'))
    vectors = embedding['embeddings']
    if vectors.device.type == 'cuda':
        print(date, 'cuda')
        embedding['embeddings'] = vectors.to('cpu')
    return embedding

In [None]:
# Reset the month-tracking state used by the per-day processing loop.
first = True
current_date = datetime.strptime(list(articles_by_day)[0], '%Y-%m-%d').strftime('%m%y')
next_date = next_month(current_date)

# All stored days in chronological order.
original_key_list = sorted(articles_by_day, key=lambda day: datetime.strptime(day, '%Y-%m-%d'))

In [None]:
# Manual override: restrict the run to two hard-coded days of January 2016.
# NOTE(review): running this cell replaces the key list built from articles_by_day.
first=True
current_date = '0116'
next_date = next_month(current_date)

original_key_list = ['2016-01-05','2016-01-06']

In [None]:
# `timedelta` is not imported by name in the import cell; import it where it is used.
from datetime import timedelta

first=True
current_date = datetime.strptime(list(articles_by_day.keys())[0], '%Y-%m-%d').strftime('%m%y')
next_date = next_month(current_date)

# Hard-coded processing window (both ends inclusive).
start_date, end_date = datetime(2020, 11, 1), datetime(2021, 11, 1)

delta = timedelta(days=1)
# NOTE: from here on current_date is a datetime, no longer the 'mmyy' string set above.
current_date = start_date
original_key_list = []

while current_date <= end_date:
    original_key_list.append(current_date.strftime('%Y-%m-%d'))
    current_date += delta

print(original_key_list)

In [None]:
# Make sure the per-day parquet output folder exists.
articles_by_day_folder = join('article', collection_name.lower(), model_name, 'articles_by_day')
if not os.path.exists(articles_by_day_folder):
    os.makedirs(articles_by_day_folder)

# For every day: attach the title topics and the per-comment data (ids, topics,
# timestamps, embeddings) to that day's articles, write the result to
# <articles_by_day_folder>/<day>.parquet and drop the day from articles_by_day.
for article_day in original_key_list:
    
    print(article_day)
    articles = articles_by_day[article_day]
    article_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
    
    # Disabled resume logic, kept as a bare string literal (it is never executed).
    '''
    if os.path.isfile(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet')):
        print('Already processed', article_day)
        articles_by_day.pop(article_day)
        continue
    '''

    # load embeddings

    # First iteration: load this day's month and, unless that month is the one
    # right before end_date, the following month too; the comment ids of the
    # loaded months are merged and sorted so they can be intersected below.
    # presumably two months are merged because comments can fall in the month
    # after the article's — TODO confirm
    if first:
        current_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
        next_date = next_month(current_date)
        current_embeddings = load_embeddings(current_date)
        if current_date == (end_date - relativedelta(months=1)).strftime('%m%y'):
            print('final')
            merged_embeddings = pd.DataFrame()
            mgi = np.array(current_embeddings['_id']).astype(int)
            mgi_argsort = mgi.argsort()
            merged_embeddings['_id'] = mgi[mgi_argsort]
            merged_embeddings['embeddings'] = (current_embeddings['embeddings']).numpy()[mgi_argsort].tolist()
            
        else:
            next_embeddings = load_embeddings(next_date)
            merged_embeddings = pd.DataFrame()
            mgi = np.array(current_embeddings['_id'] + next_embeddings['_id']).astype(int)
            mgi_argsort = mgi.argsort()
            merged_embeddings['_id'] = mgi[mgi_argsort]
            merged_embeddings['embeddings'] = torch.vstack((current_embeddings['embeddings'], next_embeddings['embeddings'])).numpy()[mgi_argsort].tolist()
        
        first = False
        
    # Month rollover: the previously loaded "next" month becomes the current one.
    # NOTE(review): this reuses next_embeddings as-is, so it looks like it assumes
    # the days in original_key_list never skip a whole month — confirm.
    if current_date != article_date:  # next month happened
        current_date = article_date
        next_date = next_month(current_date)
        current_embeddings = next_embeddings
        # Exception case: date is one month before the end_date
        if current_date == (end_date - relativedelta(months=1)).strftime('%m%y'):
            merged_embeddings = pd.DataFrame()
            mgi = np.array(current_embeddings['_id']).astype(int)
            mgi_argsort = mgi.argsort()
            merged_embeddings['_id'] = mgi[mgi_argsort]
            merged_embeddings['embeddings'] = (current_embeddings['embeddings']).numpy()[mgi_argsort].tolist()
        else:
            next_embeddings = load_embeddings(next_date)
            merged_embeddings = pd.DataFrame()
            mgi = np.array(current_embeddings['_id'] + next_embeddings['_id']).astype(int)
            mgi_argsort = mgi.argsort()
            merged_embeddings['_id'] = mgi[mgi_argsort]
            merged_embeddings['embeddings'] = torch.vstack((current_embeddings['embeddings'], next_embeddings['embeddings'])).numpy()[mgi_argsort].tolist()

    # title topic

    class_nums = []
    class_probs = []

    # output[0] / output[1] of find_topics are stored as topic_num / topic_prob.
    for i in range(len(articles)):
        output = topic_model.find_topics(articles['clean_title'].values[i])
        class_nums.append(output[0])
        class_probs.append(output[1])
        
    articles['topic_num'] = class_nums
    articles['topic_prob'] = class_probs
    
    # comments data
    
    id_list = []
    topic_list = []
    createdAt_list = []
    embeddings_list = []
    # NOTE(review): filled below but never stored in the output or deleted.
    original_comment_num_list = []
    
    for article_id, article_createdAt in articles[['_id', 'createdAt']].values:
        if article_id in article_dict.keys():
            article = article_dict[article_id]
            article_ids = np.array(article['id']).astype(int)
            sorted_index = np.argsort(article_ids)
            ids = article_ids[sorted_index]
        
            merged_embeddings_id = np.array(merged_embeddings['_id'])
            # id_indices: positions inside merged_embeddings; indices_exist: positions
            # inside `ids` — only comments that have an embedding are kept.
            id_indices, indices_exist = np.intersect1d(merged_embeddings_id, ids, assume_unique=True, return_indices=True)[1:]
            embeddings_list.append(merged_embeddings.iloc[id_indices]['embeddings'].tolist())
            # Matched rows are dropped and the index renumbered, so labels equal positions again.
            merged_embeddings = merged_embeddings.drop(id_indices)
            merged_embeddings.reset_index(drop=True, inplace=True)

            id_list.append(np.array(article['id'])[sorted_index][indices_exist])
            topic_list.append(np.array(article['topics'])[sorted_index][indices_exist])
            createdAt_list.append(np.array(article['createdAt'])[sorted_index][indices_exist])
            original_comment_num_list.append(ids)
            assert len(embeddings_list[-1]) == len(id_list[-1]) == len(topic_list[-1]) == len(createdAt_list[-1])
            
        else:
            # Article without an entry in article_dict: empty per-comment lists.
            id_list.append([])
            topic_list.append([])
            createdAt_list.append([])
            embeddings_list.append([])
            
    articles['comment_id'] = id_list
    articles['comment_topics'] = topic_list
    articles['comment_createdAt'] = createdAt_list
    articles['comment_embeddings'] = embeddings_list
    
    articles.reset_index(drop=True).to_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'), compression='gzip')
    # The processed day is removed from the in-memory dict and its lists are released.
    articles_by_day.pop(article_day)
    del id_list, topic_list, createdAt_list, embeddings_list, articles

### 1A-4. Add global topic data into articles_by_day (optional)

In [None]:
# Configuration for adding global-model topics to one collection.
collection_name = 'Breitbart'
model_name = MODEL_NAMES[collection_name]
threshold = 10

start_date, end_date = DATE_RANGES[collection_name]
num_topics = NUM_TOPICS[collection_name]
# From here on `topic_model` is the global model, not the collection-specific one.
# `BERTopic` is reached through the imported `bertopic` module because the class
# itself is not imported by name in the import cell.
topic_model = bertopic.BERTopic.load(join('model', 'global', MODEL_NAMES['global']), embedding_model="all-MiniLM-L6-v2")

In [None]:
from functools import reduce  # kept in case later cells rely on `reduce` being in scope

collection_name = 'Breitbart'
model_name = MODEL_NAMES[collection_name]
article_day = '2022-09-01'
previous_articles = pd.read_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'))
# Count how often each global topic id occurs over all comments of that day.
# A single generator replaces reduce(list + list), which copied the growing list
# for every article and raised TypeError on a frame without rows.
Counter(
    topic
    for comment_topics in previous_articles['comment_topics_global'].values
    for topic in comment_topics.tolist()
)

In [None]:
def load_embeddings(date):
    """Load the saved sentence embeddings of one month onto the CPU.

    This redefines, with the same behavior, the ``load_embeddings`` function
    that an earlier cell of this notebook already defines. ``date`` is the month
    key used in the file name; the file name also contains ``next_month(date)``.
    Reads the module-level ``collection_name``.

    Returns:
        The object stored in the ``.pt`` file; its ``'embeddings'`` entry is
        moved to the CPU if it still reports a CUDA device.
    """
    print(date)
    month_after = next_month(date)
    base_folder = '/data/comments/valentin/sentence-embeddings/'
    relative_file = f'{collection_name.lower()}/bert-emb-{date}-{month_after}.pt'
    embedding = torch.load(base_folder + relative_file, map_location=torch.device('cpu'))
    stored = embedding['embeddings']
    if stored.device.type == 'cuda':
        print(date, 'cuda')
        embedding['embeddings'] = stored.to('cpu')
    return embedding

In [None]:
# IF GLOBAL ARTICLE DICT MERGE IS NEEDED
# Merge the per-year global article dicts into one dict and save it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.

article_dict = {}

year_list = [str(year_number) for year_number in range(2012, 2024)]
global_dict_folder = join('article', collection_name.lower(), MODEL_NAMES[collection_name])

for year in year_list:
    print(year)
    with open(join(global_dict_folder, f'article_global_dict_{year}.pkl'), 'rb') as f:
        tmp_article_dict = pickle.load(f)

    for key, value in tmp_article_dict.items():
        if key in article_dict:
            # Article already seen in an earlier year: extend each field with `+=`.
            merged_entry = article_dict[key]
            for key2, value2 in value.items():
                merged_entry[key2] += value2
        else:
            article_dict[key] = value

with open(join(global_dict_folder, 'article_global_dict.pkl'), 'wb') as f:
    pickle.dump(article_dict, f)

In [None]:
# open pickle files
# article_global_dict is indexed by article id in the cells that consume it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), MODEL_NAMES[collection_name], 'article_global_dict.pkl'), 'rb') as f:
    article_global_dict = pickle.load(f)


In [None]:

# load articles_by_day from pickle file
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join('article', collection_name.lower(), model_name, 'articles_by_day.pkl'), 'rb') as f:
    articles_by_day = pickle.load(f)

In [None]:
# check article_dict and article_global_dict have same elements (only different topics)
# NOTE(review): if the merge cell was run, article_dict already holds the merged
# global dict, which makes this comparison trivially true — confirm which dict is meant.

assert article_dict.keys() == article_global_dict.keys()

for key in tqdm(article_dict.keys()):
    old, new = article_dict[key], article_global_dict[key]
    # Both dicts must list the same comment ids for every article.
    assert old['id'] == new['id']

In [None]:
# Reset the month-tracking state before the global-topic pass.
first = True
current_date = datetime.strptime(list(articles_by_day)[0], '%Y-%m-%d').strftime('%m%y')
next_date = next_month(current_date)

# All stored days in chronological order.
original_key_list = sorted(articles_by_day, key=lambda day: datetime.strptime(day, '%Y-%m-%d'))

In [None]:
# Manual override: restrict the run to two hard-coded days of January 2016.
# NOTE(review): running this cell replaces the key list built from articles_by_day.
first=True
current_date = '0116'
next_date = next_month(current_date)

original_key_list = ['2016-01-05','2016-01-06']

In [None]:
 ## OPTIONAL : REMOVE ALL PREVIOUS GLOBAL-RELATED COLUMNS
    ## TARGET : 'topic_num_global', 'topic_prob_global', 'comment_topics_global', 
    
# Columns produced by the global-model pass; they are stripped from each day's file.
global_columns = ['topic_num_global', 'topic_prob_global', 'comment_topics_global']

for article_day in original_key_list:

    print(article_day)
    articles = articles_by_day[article_day]
    article_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
    day_parquet_path = join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet')

    # REMOVE
    previous_articles = pd.read_parquet(day_parquet_path).drop(columns=global_columns)
    previous_articles.to_parquet(day_parquet_path, compression='gzip')
    

In [None]:
# Folder holding one parquet file per day (written by the per-day enrichment step).
articles_by_day_folder = join('article', collection_name.lower(), model_name, 'articles_by_day')

# For every day: recompute the title topics with the currently loaded topic_model,
# attach the per-comment global topics from article_global_dict, and overwrite
# that day's parquet file.
for article_day in original_key_list:
    
    print(article_day)
    articles = articles_by_day[article_day]
    article_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
    # Disabled skip logic, kept as a bare string literal (it is never executed).
    '''
    
    if datetime.strptime(article_day, '%Y-%m-%d') < datetime(2016, 7, 1):
        print(article_day, 'pass')
        continue
    
    '''
    
    previous_articles = pd.read_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'))

    # Start from a clean slate if a previous run already added the global columns.
    if 'topic_num_global' in previous_articles.columns:
        print('global exist')
        previous_articles = previous_articles.drop(columns=['topic_num_global', 'topic_prob_global', 'comment_topics_global'])
    else:
        print('global removed')
        
    class_nums = []
    class_probs = []

    # NOTE(review): titles are read from `articles` (the pickle) but the results are
    # assigned to `previous_articles` (the parquet); this assumes both hold the same
    # rows in the same order — confirm.
    for i in range(len(articles)):
        output = topic_model.find_topics(articles['clean_title'].values[i])
        class_nums.append(output[0])
        class_probs.append(output[1])
        
    previous_articles['topic_num_global'] = class_nums
    previous_articles['topic_prob_global'] = class_probs

    # comments data
    
    topic_global_list = []
    for article_id, comment_id in previous_articles[['_id', 'comment_id']].values:
        if article_id in article_global_dict.keys():
            article = article_global_dict[article_id]
            article_ids = np.array(article['id']).astype(int)
            sorted_index = np.argsort(article_ids)
            ids = article_ids[sorted_index]
        
            # indices_exist: positions inside `ids` of the comments already stored for
            # this article; id_indices (positions inside comment_id) is not used.
            id_indices, indices_exist = np.intersect1d(comment_id.astype(np.int64), ids, assume_unique=True, return_indices=True)[1:]
            topic_global_list.append(np.array(article['topics'])[sorted_index][indices_exist])
        else:
            topic_global_list.append([])

    previous_articles['comment_topics_global'] = topic_global_list
    # Every article must have exactly one global topic per stored comment topic.
    assert previous_articles['comment_topics'].apply(len).values.tolist() == previous_articles['comment_topics_global'].apply(len).values.tolist()

    previous_articles.reset_index(drop=True).to_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'), compression='gzip')
    articles_by_day.pop(article_day)

### 1A-5. Remove duplicate comments (Gatewaypundit specific)

In [None]:
# Configuration for the Gatewaypundit duplicate-comment clean-up.
collection_name = 'Gatewaypundit'
model_name = MODEL_NAMES[collection_name]
comment_threshold = 10

start_date, end_date = DATE_RANGES[collection_name]
num_topics = NUM_TOPICS[collection_name]

# Imported explicitly: a star import skips underscore-prefixed names unless the
# module lists them in __all__.
from proj_utils import _init_mongo_collection
# Fields requested for every article document (here including 'link').
select_columns = ['_id', 'clean_title', 'createdAt', 'link']

mongo_client_articles, collection_articles = _init_mongo_collection('Articles', collection_name)
mongo_client_comments, collection_comments = _init_mongo_collection('Comments', collection_name)

In [None]:
# `timedelta` is not imported by name in the import cell; import it where it is used.
from datetime import timedelta

# Hard-coded processing window (both ends inclusive).
start_date, end_date = datetime(2020, 11, 1), datetime(2021, 11, 1)

delta = timedelta(days=1)
current_date = start_date
key_list = []

# One 'YYYY-MM-DD' key per day of the window.
while current_date <= end_date:
    key_list.append(current_date.strftime('%Y-%m-%d'))
    current_date += delta

print(key_list)

In [None]:
# Remove duplicate comments: within one article, comments that share the same
# createdAt and whose embeddings have a cosine similarity above sim_threshold
# are treated as copies, and every later copy is dropped.
#
# Differences from the previous implementation:
#   * comments are tracked by position instead of being looked up again by id,
#     so repeated comment ids no longer all resolve to the first occurrence;
#   * only the later member of each similar pair is removed, so two unrelated
#     duplicate groups sharing a timestamp each keep their first comment;
#   * the rewritten file replaces the old one in a single os.replace call.
sim_threshold = 0.9

for key in key_list:
    print(key)

    old_filepath = join('article', collection_name.lower(), model_name, 'articles_by_day', key + '.parquet')
    new_filepath = join('article', collection_name.lower(), model_name, 'articles_by_day', key + '_new.parquet')

    articles = pd.read_parquet(old_filepath)
    total_remove_num = 0

    comment_id_list = []
    comment_topics_list = []
    comment_topics_global_list = []
    comment_createdAt_list = []
    comment_embeddings_list = []
    comment_sentiments_list = []

    for article_idx in range(len(articles)):
        test_article = articles.iloc[article_idx]
        remove_index = set()  # positions (within this article) of comments to drop

        if len(test_article['comment_id']) > 1:

            # Group comment positions by their createdAt value.
            positions_by_createdAt = defaultdict(list)
            for position, createdAt in enumerate(test_article['comment_createdAt']):
                positions_by_createdAt[createdAt].append(position)

            for positions in positions_by_createdAt.values():
                num_comment = len(positions)
                if num_comment < 2:
                    continue  # a timestamp used once cannot hold duplicates

                embeddings_array = np.array([test_article['comment_embeddings'][position] for position in positions])

                # Calculate the cosine similarity matrix and inspect each pair once
                # (upper triangle, so `earlier` < `later` inside the group).
                similarity_matrix = cosine_similarity(embeddings_array)
                earlier, later = np.triu_indices(num_comment, k=1)
                is_duplicate_pair = similarity_matrix[earlier, later] > sim_threshold

                # always remove the latter one of a similar pair
                for group_position in later[is_duplicate_pair]:
                    remove_index.add(positions[group_position])

        filtered_index = [position for position in range(len(test_article['comment_id'])) if position not in remove_index]
        total_remove_num += len(remove_index)

        comment_id_list.append([test_article['comment_id'][position] for position in filtered_index])
        comment_topics_list.append([test_article['comment_topics'][position] for position in filtered_index])
        comment_topics_global_list.append([test_article['comment_topics_global'][position] for position in filtered_index])
        comment_createdAt_list.append([test_article['comment_createdAt'][position] for position in filtered_index])
        comment_embeddings_list.append([test_article['comment_embeddings'][position] for position in filtered_index])
        comment_sentiments_list.append([test_article['comment_sentiments'][position] for position in filtered_index])

    print(total_remove_num)

    articles = articles.assign(comment_id=comment_id_list,
                               comment_topics=comment_topics_list,
                               comment_topics_global=comment_topics_global_list,
                               comment_createdAt=comment_createdAt_list,
                               comment_embeddings=comment_embeddings_list,
                               comment_sentiments=comment_sentiments_list)

    # Only rewrite the day's file when something was actually removed.
    if total_remove_num > 0:
        articles.to_parquet(new_filepath, compression='gzip')
        # os.replace swaps the file in one step, so the old file is never deleted
        # without the new one taking its place.
        os.replace(new_filepath, old_filepath)
    

## 1B. Adding sentiments into articles_by_day

In [None]:
def load_sentiments(collection_name, date):
    """Load the per-comment sentiment table for one batch of a collection.

    Args:
        collection_name: Collection name; it is lower-cased to build the folder path.
        date: Batch tag used in the file name (the callers in this notebook pass
            '%m%y' strings).

    Returns:
        The DataFrame read from ``batch-{date}.csv_createdAt``. If that read fails,
        the frame read from the legacy ``_old/batch-{date}.csv`` (relative to the
        current working directory) is returned instead.

    Raises:
        Whatever ``pd.read_csv`` raises when the fallback file cannot be read either.
    """
    print(date)
    sentiments_folder = f'/data/comments/valentin/max-sentiment-analysis/{collection_name.lower()}/sentiments_by_comment/'
    sentiments_path = sentiments_folder + f'batch-{date}.csv_createdAt'
    try:
        # column order: anger, anticipation, disgust, fear, joy, love, optimism,
        # pessimism, sadness, surprise, trust
        sentiments = pd.read_csv(sentiments_path)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # still stop a long batch run instead of silently triggering the fallback.
        print('go to old')
        sentiments = pd.read_csv(f'_old/batch-{date}.csv')
    return sentiments

In [None]:
# Explicit two-day key list.
# NOTE(review): presumably a debugging subset — the following cell rebuilds
# original_key_list from the articles_by_day directory listing; confirm which one is intended.
original_key_list = ['2016-01-05','2016-01-06']

In [None]:
# Day keys are the file names in articles_by_day with everything after the
# first '.' removed, ordered chronologically.
original_key_list = sorted(
    (
        file_name.split('.')[0]
        for file_name in os.listdir(join('article', collection_name.lower(), model_name, 'articles_by_day'))
    ),
    key=lambda day_key: datetime.strptime(day_key, '%Y-%m-%d'),
)

In [None]:
# Build original_key_list as every calendar day from start_date to end_date (inclusive).
first=True
# NOTE(review): `articles_by_day` is not defined in the visible part of the notebook,
# and `current_date` assigned here is overwritten a few lines below — confirm these
# two lines are still needed.
current_date = datetime.strptime(list(articles_by_day.keys())[0], '%Y-%m-%d').strftime('%m%y')
next_date = next_month(current_date)

start_date, end_date = datetime(2020, 11, 1), datetime(2021, 11, 1)

# NOTE(review): `timedelta` is not imported in the notebook's import cell;
# presumably it is re-exported by proj_utils — confirm.
delta = timedelta(days=1)
current_date = start_date
original_key_list = []

# Step one day at a time; `<=` makes end_date itself part of the list.
while current_date <= end_date:
    original_key_list.append(current_date.strftime('%Y-%m-%d'))
    current_date += delta

print(original_key_list)

In [None]:
# Add a `comment_sentiments` column to every daily article file listed in
# original_key_list and write the file back in place.
# Sentiment tables are loaded per '%m%y' batch; the current and the following batch
# are kept merged so that comment ids stored in either one can be looked up.
first=True
# Placeholder values: both are recomputed inside the loop on the first unprocessed day.
current_date = datetime.strptime(original_key_list[0], '%Y-%m-%d').strftime('%m%y')
next_date = next_month(current_date)

for article_day in original_key_list:
    print(article_day) 
    
    # The string literal below is a disabled date-based skip kept by the author;
    # it is evaluated as a no-op expression.
    '''
    
    if datetime.strptime(article_day, '%Y-%m-%d') < datetime(2019, 1, 1):
        print('Already processed', article_day)
        continue
        
    '''
    
    with open(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'), 'rb') as f:
        articles = pd.read_parquet(f)
        # Files that already carry the column are skipped, which makes the cell resumable.
        if 'comment_sentiments' in articles.columns:
            print('Already processed', article_day)
            continue
    
    article_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
    
    # First unprocessed day: load its batch and the following one.
    if first:
        current_date = datetime.strptime(article_day, '%Y-%m-%d').strftime('%m%y')
        next_date = next_month(current_date)
        current_sentiments = load_sentiments(collection_name, current_date)
        next_sentiments = load_sentiments(collection_name, next_month(current_date)) 
        merged_sentiments = pd.concat([current_sentiments, next_sentiments], ignore_index=True)
        merged_sentiments.set_index('id', inplace=True)
        first = False
        
    if current_date != article_date:  # next month happened
        current_date = article_date
        next_date = next_month(current_date)
        # NOTE(review): this reuses the previously loaded "next" batch, so it assumes
        # the processed days move forward exactly one batch at a time — confirm that
        # no whole month can be skipped by the 'Already processed' check above.
        current_sentiments = next_sentiments
        
        # One batch before `end_date` no further batch is loaded.
        # NOTE(review): `end_date` comes from an earlier cell — confirm it still holds
        # the intended value when this cell is run.
        if current_date == (end_date - relativedelta(months=1)).strftime('%m%y'):
            merged_sentiments = current_sentiments
            # In-place: this also re-indexes the frame that `next_sentiments` refers to.
            merged_sentiments.set_index('id', inplace=True)
        else:
            next_sentiments = load_sentiments(collection_name, next_month(current_date)) 
            merged_sentiments = pd.concat([current_sentiments, next_sentiments], ignore_index=True)
            merged_sentiments.set_index('id', inplace=True)
        

    sentiments_list = []
    
    # One entry per article: a list of 11-value sentiment rows, one per comment id.
    for comment_id in articles['comment_id']:
        if len(comment_id):
            id_list = [int(c) for c in comment_id]
            # NOTE(review): `.loc` with a list of labels is expected to raise KeyError
            # when an id is missing from the merged table — confirm that is acceptable.
            sentiments_list.append(merged_sentiments.loc[id_list][['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']].values.tolist())
        else:
            sentiments_list.append([])

    articles['comment_sentiments'] = sentiments_list
    # Overwrites the file that was read above.
    articles.to_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', article_day + '.parquet'), compression='gzip')

# 2. **Aggregation**

In [None]:
def aggregate_save(tmp_df, comment_date_cutoff, is_global=False):
    """Build per-topic comment statistics for one aggregation window.

    Comments whose age relative to their article (``.days`` of the timestamp
    difference) is not below ``comment_date_cutoff`` are discarded first. The
    remaining comments are counted per topic, and their embeddings and
    sentiments are averaged per topic.

    Relies on the notebook-level ``num_topics``. Topic ids from -1 to
    ``num_topics - 1`` are stored at position ``topic_id + 1`` of the summaries.

    Args:
        tmp_df: One row per article, with the list-valued columns
            ``comment_createdAt``, ``comment_id``, ``comment_topics``
            (``comment_topics_global`` when ``is_global``), ``comment_embeddings``
            and ``comment_sentiments``, plus ``createdAt``, ``_id`` and the
            ``topic_num``/``topic_prob`` columns (or their ``*_global`` variants).
        comment_date_cutoff: Exclusive upper bound on the comment age in days.
        is_global: Use the ``*_global`` topic columns instead of the local ones.

    Returns:
        Tuple ``(summary_df, summary_df_csv, topic_dict_tmp_df)``:
        ``summary_df`` holds the topic frequency and the mean embedding /
        sentiment arrays per topic; ``summary_df_csv`` is the same information
        flattened into ``e0..e383`` and ``s0..s10`` columns; ``topic_dict_tmp_df``
        is the article-level view with the filtered ``comment_topics``.
    """
    comment_topic_column = 'comment_topics_global' if is_global else 'comment_topics'

    def _pick(values, positions):
        # Keep only the entries at the surviving comment positions.
        return [values[pos] for pos in positions]

    kept_createdAt, kept_ids, kept_topics = [], [], []
    kept_embeddings, kept_sentiments = [], []
    topic_freq = Counter(dict.fromkeys(range(-1, num_topics), 0))

    for row in tmp_df.itertuples():
        comment_age_days = np.array([(created - row.createdAt).days for created in row.comment_createdAt])
        keep = np.where(comment_age_days < comment_date_cutoff)[0]

        row_topics = _pick(getattr(row, comment_topic_column), keep)
        topic_freq.update(row_topics)

        kept_createdAt.append(_pick(row.comment_createdAt, keep))
        kept_ids.append(_pick(row.comment_id, keep))
        kept_topics.append(row_topics)
        kept_embeddings.append(_pick(row.comment_embeddings, keep))
        kept_sentiments.append(_pick(row.comment_sentiments, keep))

    tmp_df = tmp_df.assign(
        comment_id=kept_ids,
        comment_topics=kept_topics,
        comment_createdAt=kept_createdAt,
        comment_embeddings=kept_embeddings,
        comment_sentiments=kept_sentiments,
    )

    title_columns = ['topic_num_global', 'topic_prob_global'] if is_global else ['topic_num', 'topic_prob']
    topic_dict_tmp_df = tmp_df[['_id'] + title_columns + ['comment_topics']]

    # Slot i collects the comments of topic id i - 1 (slot 0 is topic -1).
    slot_count = num_topics + 1
    topic_embedding_list = [[] for _ in range(slot_count)]
    topic_sentiment_list = [[] for _ in range(slot_count)]

    for row in tmp_df.itertuples():
        row_topics = np.array(row.comment_topics)
        for slot in range(slot_count):
            hits = np.where(row_topics == slot - 1)[0]
            if len(hits) > 0:
                topic_embedding_list[slot].append(np.array(row.comment_embeddings)[hits])
                topic_sentiment_list[slot].append(np.array(row.comment_sentiments)[hits])

    topic_mean_embedding_list = []
    topic_mean_sentiment_list = []
    for embedding_chunks, sentiment_chunks in zip(topic_embedding_list, topic_sentiment_list):
        assert len(embedding_chunks) == len(sentiment_chunks)

        if len(embedding_chunks) > 0:
            topic_mean_embedding_list.append(np.vstack(embedding_chunks).mean(axis=0))
        else:
            topic_mean_embedding_list.append(np.zeros(384))  # embedding shape

        if len(sentiment_chunks) > 0:
            topic_mean_sentiment_list.append(np.vstack(sentiment_chunks).mean(axis=0))
        else:
            topic_mean_sentiment_list.append(np.zeros(11))  # sentiment shape

    # Frequencies ordered by topic id (starting at -1).
    topic_freq = [count for _, count in sorted(topic_freq.items())]
    summary_df = pd.DataFrame({
        'topic_freq': topic_freq,
        'topic_mean_embedding': topic_mean_embedding_list,
        'topic_mean_sentiment': topic_mean_sentiment_list,
    })

    embedding_frame = pd.DataFrame(topic_mean_embedding_list, columns=['e'+str(i) for i in range(384)])
    sentiment_frame = pd.DataFrame(topic_mean_sentiment_list, columns=['s'+str(i) for i in range(11)])
    summary_df_csv = pd.concat([pd.DataFrame({'topic_freq': topic_freq}), embedding_frame, sentiment_frame], axis=1)

    return summary_df, summary_df_csv, topic_dict_tmp_df

In [None]:
# Anchor date from which the '3d' and 'week' windows are stepped forward.
aggregate_start_date = datetime(2012, 5, 28, 0, 0)  ## Monday

# Collection to aggregate and its fitted topic model.
collection_name = 'Atlantic'
model_name = MODEL_NAMES[collection_name]

# True: use the shared 'global' topic set; False: use the collection's own topics.
is_global = True

start_date, end_date = DATE_RANGES[collection_name]
num_topics = NUM_TOPICS['global'] if is_global else NUM_TOPICS[collection_name]

In [None]:
# Aggregate the daily article files of `collection_name` into 3-day, weekly and
# monthly windows, write one summary per window and pickle the bookkeeping dicts.
aggregate_day_list = ['3d', 'week', 'month']
comment_threshold = 10  # only articles with MORE than this many comments are kept
add_dict = {'3d':relativedelta(days=3), 'week':relativedelta(weeks=1), 'month':relativedelta(months=1)}
# Passed to aggregate_save: comments whose age in days is not below this value are dropped.
comment_date_cutoff_dict = {'3d': 1, 'week' : 2, 'month': 7}

topic_dict = {aggregate_day : {} for aggregate_day in aggregate_day_list}
topic_tuple_dict_dict = {aggregate_day : {} for aggregate_day in aggregate_day_list}
title_topic_num_list_dict = {aggregate_day : [] for aggregate_day in aggregate_day_list}
title_constant_list_dict = {aggregate_day : [] for aggregate_day in aggregate_day_list}

tmp_list_df = {aggregate_day : [] for aggregate_day in aggregate_day_list}
len_list_dict = {aggregate_day: {} for aggregate_day in aggregate_day_list}

# make folders under /data/collmind/article
article_folder_name = 'article_global' if is_global else 'article'
output_root = join('/data', 'collmind', article_folder_name, collection_name.lower())
articles_dir = join('article', collection_name.lower(), model_name, 'articles_by_day')

for aggregate_day in aggregate_day_list:
    # makedirs also creates the intermediate <aggregate_day> folder
    os.makedirs(join(output_root, aggregate_day, 'csv'), exist_ok=True)
    os.makedirs(join(output_root, aggregate_day, 'df'), exist_ok=True)


def _flush_window(aggregate_day, window_start, frames, out_root):
    """Aggregate the daily frames collected for one window and save its summaries.

    Reads the notebook globals ``is_global``, ``num_topics`` and
    ``comment_date_cutoff_dict`` and updates ``len_list_dict``,
    ``topic_tuple_dict_dict``, ``title_constant_list_dict``,
    ``title_topic_num_list_dict`` and ``topic_dict`` in place.

    Args:
        aggregate_day: Window type ('3d', 'week' or 'month').
        window_start: Start of the window; its '%Y-%m-%d' form is the window key.
        frames: Daily article DataFrames that fall into the window.
        out_root: Output folder of the collection (contains <aggregate_day>/df and /csv).
    """
    window_key = window_start.strftime('%Y-%m-%d')
    len_list_dict[aggregate_day][window_key] = len(frames)
    if not frames:
        return  # pd.concat raises on an empty list
    tmp_df = pd.concat(frames)
    if len(tmp_df) == 0:
        return

    topic_column = 'topic_num_global' if is_global else 'topic_num'

    # Group the articles by their sorted top-3 title topics.
    topic_tuple_dict = defaultdict(list)
    for article in tmp_df.itertuples():
        topic_tuple = tuple(sorted(getattr(article, topic_column)[:3]))
        # `_1` is the first data column as exposed by itertuples
        # (presumably the renamed `_id` column — confirm).
        topic_tuple_dict[topic_tuple].append((article._1, article.createdAt.strftime('%Y-%m-%d')))
    topic_tuple_dict_dict[aggregate_day][window_key] = topic_tuple_dict

    # counts[t][i]: number of articles whose i-th ranked title topic is t.
    title_constant_list_dict[aggregate_day].append(len(tmp_df))
    top_topics = np.vstack(tmp_df[topic_column])
    title_counts = np.zeros((num_topics, 3))
    for t in range(num_topics):
        for i in range(3):
            title_counts[t][i] = np.count_nonzero(top_topics[:, i] == t)
    title_topic_num_list_dict[aggregate_day].append(title_counts)

    summary_df, summary_df_csv, topic_dict_tmp_df = aggregate_save(tmp_df, comment_date_cutoff_dict[aggregate_day], is_global)
    topic_dict[aggregate_day][window_key] = topic_dict_tmp_df

    # save summary_df (same compression for every window, including the last one)
    summary_df.to_parquet(join(out_root, aggregate_day, 'df', window_key + '.parquet'), compression='gzip')
    summary_df_csv.to_csv(join(out_root, aggregate_day, 'csv', window_key + '.csv'))


final_day = (end_date - relativedelta(days=30)).strftime('%Y-%m-%d')
print(final_day)

# Wait until the daily file for `final_day` exists before aggregating.
# NOTE(review): `time` is not imported in the notebook's import cell; presumably it
# is re-exported by proj_utils — confirm.
file_path = join(articles_dir, f'{final_day}.parquet')
while not os.path.exists(file_path):
    print(datetime.now(), 'still waiting')
    time.sleep(60 * 60)  # Sleep for 60 minutes
print('aggregate start')

file_names = os.listdir(articles_dir)
keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))
start = datetime.strptime(keys[0], '%Y-%m-%d')

# Initial (start, end) window per aggregation type. Dedicated local names are used
# so that the notebook-level start_date / end_date are not overwritten, which keeps
# `final_day` correct when this cell is re-run.
start_end_dict = {}
for aggregate_day in aggregate_day_list:
    step = add_dict[aggregate_day]
    if aggregate_day == 'month':
        window_start = start.replace(day=1)
    else:
        # Step from the anchor date to the last window start that is <= first data day.
        window_start = aggregate_start_date
        while window_start + step <= start:
            window_start += step
    start_end_dict[aggregate_day] = (window_start, window_start + step)

for key in keys:
    print(key)
    key_date = datetime.strptime(key, '%Y-%m-%d')
    articles = pd.read_parquet(join(articles_dir, key + '.parquet'))
    articles = articles[articles['comment_id'].apply(len) > comment_threshold]

    for aggregate_day in aggregate_day_list:
        window_start, window_end = start_end_dict[aggregate_day]
        if key_date >= window_end:
            _flush_window(aggregate_day, window_start, tmp_list_df[aggregate_day], output_root)
            tmp_list_df[aggregate_day] = []
            # Advance until the window contains this day. A single step is not enough
            # when daily files are missing for longer than one window; the articles
            # would otherwise be stored under an outdated window key.
            while key_date >= window_end:
                window_start, window_end = window_end, window_end + add_dict[aggregate_day]
            start_end_dict[aggregate_day] = (window_start, window_end)

        tmp_list_df[aggregate_day].append(articles)

# process leftovers
for aggregate_day in aggregate_day_list:
    _flush_window(aggregate_day, start_end_dict[aggregate_day][0], tmp_list_df[aggregate_day], output_root)
    tmp_list_df[aggregate_day] = []

result_dict = {'topic_tuple_dict_dict': topic_tuple_dict_dict, 'len_list_dict': len_list_dict, 'title_constant_list_dict': title_constant_list_dict, 'title_topic_num_list_dict': title_topic_num_list_dict}

# save num_elements
with open(join(output_root, 'result_dict.pkl'), 'wb') as f:
    pickle.dump(result_dict, f)

for aggregate_day in aggregate_day_list:
    with open(join(output_root, aggregate_day, f'topic_dict_{aggregate_day}.pkl'), 'wb') as f:
        pickle.dump(topic_dict[aggregate_day], f)

In [None]:
# Same aggregation as the single-collection cell, run with the global topic set
# for each of the remaining collections.
is_global = True
num_topics = NUM_TOPICS['global']
aggregate_day_list = ['3d', 'week', 'month']
comment_threshold = 10  # only articles with MORE than this many comments are kept
add_dict = {'3d':relativedelta(days=3), 'week':relativedelta(weeks=1), 'month':relativedelta(months=1)}
# Passed to aggregate_save: comments whose age in days is not below this value are dropped.
comment_date_cutoff_dict = {'3d': 1, 'week' : 2, 'month': 7}


def _flush_window(aggregate_day, window_start, frames, out_root):
    """Aggregate the daily frames collected for one window and save its summaries.

    Reads the notebook globals ``is_global``, ``num_topics`` and
    ``comment_date_cutoff_dict`` and updates ``len_list_dict``,
    ``topic_tuple_dict_dict``, ``title_constant_list_dict``,
    ``title_topic_num_list_dict`` and ``topic_dict`` in place (the dictionaries of
    the collection currently being processed).

    Args:
        aggregate_day: Window type ('3d', 'week' or 'month').
        window_start: Start of the window; its '%Y-%m-%d' form is the window key.
        frames: Daily article DataFrames that fall into the window.
        out_root: Output folder of the collection (contains <aggregate_day>/df and /csv).
    """
    window_key = window_start.strftime('%Y-%m-%d')
    len_list_dict[aggregate_day][window_key] = len(frames)
    if not frames:
        return  # pd.concat raises on an empty list
    tmp_df = pd.concat(frames)
    if len(tmp_df) == 0:
        return

    topic_column = 'topic_num_global' if is_global else 'topic_num'

    # Group the articles by their sorted top-3 title topics.
    topic_tuple_dict = defaultdict(list)
    for article in tmp_df.itertuples():
        topic_tuple = tuple(sorted(getattr(article, topic_column)[:3]))
        # `_1` is the first data column as exposed by itertuples
        # (presumably the renamed `_id` column — confirm).
        topic_tuple_dict[topic_tuple].append((article._1, article.createdAt.strftime('%Y-%m-%d')))
    topic_tuple_dict_dict[aggregate_day][window_key] = topic_tuple_dict

    # counts[t][i]: number of articles whose i-th ranked title topic is t.
    title_constant_list_dict[aggregate_day].append(len(tmp_df))
    top_topics = np.vstack(tmp_df[topic_column])
    title_counts = np.zeros((num_topics, 3))
    for t in range(num_topics):
        for i in range(3):
            title_counts[t][i] = np.count_nonzero(top_topics[:, i] == t)
    title_topic_num_list_dict[aggregate_day].append(title_counts)

    summary_df, summary_df_csv, topic_dict_tmp_df = aggregate_save(tmp_df, comment_date_cutoff_dict[aggregate_day], is_global)
    topic_dict[aggregate_day][window_key] = topic_dict_tmp_df

    # save summary_df (same compression for every window, including the last one)
    summary_df.to_parquet(join(out_root, aggregate_day, 'df', window_key + '.parquet'), compression='gzip')
    summary_df_csv.to_csv(join(out_root, aggregate_day, 'csv', window_key + '.csv'))


for collection_name in ["Motherjones", "Thehill", "Breitbart", "Gatewaypundit"]:
    print(collection_name)
    model_name = MODEL_NAMES[collection_name]
    start_date, end_date = DATE_RANGES[collection_name]

    topic_dict = {aggregate_day : {} for aggregate_day in aggregate_day_list}
    topic_tuple_dict_dict = {aggregate_day : {} for aggregate_day in aggregate_day_list}
    title_topic_num_list_dict = {aggregate_day : [] for aggregate_day in aggregate_day_list}
    title_constant_list_dict = {aggregate_day : [] for aggregate_day in aggregate_day_list}

    tmp_list_df = {aggregate_day : [] for aggregate_day in aggregate_day_list}
    len_list_dict = {aggregate_day: {} for aggregate_day in aggregate_day_list}

    # make folders under /data/collmind/article
    article_folder_name = 'article_global' if is_global else 'article'
    output_root = join('/data', 'collmind', article_folder_name, collection_name.lower())
    articles_dir = join('article', collection_name.lower(), model_name, 'articles_by_day')

    for aggregate_day in aggregate_day_list:
        # makedirs also creates the intermediate <aggregate_day> folder
        os.makedirs(join(output_root, aggregate_day, 'csv'), exist_ok=True)
        os.makedirs(join(output_root, aggregate_day, 'df'), exist_ok=True)

    final_day = (end_date - relativedelta(days=30)).strftime('%Y-%m-%d')
    print(final_day)

    # Wait until the daily file for `final_day` exists before aggregating.
    # NOTE(review): `time` is not imported in the notebook's import cell; presumably
    # it is re-exported by proj_utils — confirm.
    file_path = join(articles_dir, f'{final_day}.parquet')
    while not os.path.exists(file_path):
        print(datetime.now(), 'still waiting')
        time.sleep(60 * 60)  # Sleep for 60 minutes
    print('aggregate start')

    file_names = os.listdir(articles_dir)
    keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))
    start = datetime.strptime(keys[0], '%Y-%m-%d')

    # Initial (start, end) window per aggregation type. Dedicated local names keep
    # the collection's start_date / end_date from being overwritten.
    # `aggregate_start_date` is the anchor defined in the aggregation config cell.
    start_end_dict = {}
    for aggregate_day in aggregate_day_list:
        step = add_dict[aggregate_day]
        if aggregate_day == 'month':
            window_start = start.replace(day=1)
        else:
            # Step from the anchor date to the last window start that is <= first data day.
            window_start = aggregate_start_date
            while window_start + step <= start:
                window_start += step
        start_end_dict[aggregate_day] = (window_start, window_start + step)

    for key in keys:
        print(key)
        key_date = datetime.strptime(key, '%Y-%m-%d')
        articles = pd.read_parquet(join(articles_dir, key + '.parquet'))
        articles = articles[articles['comment_id'].apply(len) > comment_threshold]

        for aggregate_day in aggregate_day_list:
            window_start, window_end = start_end_dict[aggregate_day]
            if key_date >= window_end:
                _flush_window(aggregate_day, window_start, tmp_list_df[aggregate_day], output_root)
                tmp_list_df[aggregate_day] = []
                # Advance until the window contains this day. A single step is not
                # enough when daily files are missing for longer than one window; the
                # articles would otherwise be stored under an outdated window key.
                while key_date >= window_end:
                    window_start, window_end = window_end, window_end + add_dict[aggregate_day]
                start_end_dict[aggregate_day] = (window_start, window_end)

            tmp_list_df[aggregate_day].append(articles)

    # process leftovers
    for aggregate_day in aggregate_day_list:
        _flush_window(aggregate_day, start_end_dict[aggregate_day][0], tmp_list_df[aggregate_day], output_root)
        tmp_list_df[aggregate_day] = []

    result_dict = {'topic_tuple_dict_dict': topic_tuple_dict_dict, 'len_list_dict': len_list_dict, 'title_constant_list_dict': title_constant_list_dict, 'title_topic_num_list_dict': title_topic_num_list_dict}

    # save num_elements
    with open(join(output_root, 'result_dict.pkl'), 'wb') as f:
        pickle.dump(result_dict, f)

    for aggregate_day in aggregate_day_list:
        with open(join(output_root, aggregate_day, f'topic_dict_{aggregate_day}.pkl'), 'wb') as f:
            pickle.dump(topic_dict[aggregate_day], f)

# 3. **Auxiliary dataset**

## 3A. Convert_list for topic to frequency-based topics

### 3A-1. Comment frequency ranking

In [None]:
COLLECTION_NAMES = ['Atlantic', 'Breitbart', 'Gatewaypundit', 'Motherjones', 'Thehill']
#COLLECTION_NAMES = ['Gatewaypundit']

# Load every monthly summary frame of the per-collection ('article') output tree,
# keyed by collection and then by the file name without its extension.
aggregate_day = 'month'
topic_df_dict = {collection_name: {} for collection_name in COLLECTION_NAMES}

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    summary_dir = join('/data', 'collmind', 'article', collection_name.lower(), aggregate_day, 'df')
    file_names = os.listdir(summary_dir)
    topic_df_dict[collection_name].update(
        (file_name.split('.')[0], pd.read_parquet(join(summary_dir, file_name)))
        for file_name in file_names
    )

In [None]:
# for each collection, get the sum of topic freq over all loaded summary frames
# (a collection without any frame keeps the value None)
topic_freq_dict = {collection_name: None for collection_name in COLLECTION_NAMES}
for collection_name in COLLECTION_NAMES:
    print(collection_name)
    for day, topic_df in topic_df_dict[collection_name].items():
        if topic_freq_dict[collection_name] is None:
            # Copy the first column: `.values` can share memory with the DataFrame,
            # so accumulating into it with `+=` would overwrite that frame's
            # topic_freq (or fail on a read-only array under pandas Copy-on-Write).
            topic_freq_dict[collection_name] = topic_df['topic_freq'].values.copy()
        else:
            topic_freq_dict[collection_name] += topic_df['topic_freq'].values

In [None]:
# Load every monthly summary frame of the 'article_global' output tree,
# keyed by collection and then by the file name without its extension.
aggregate_day = 'month'
topic_df_dict = {collection_name: {} for collection_name in COLLECTION_NAMES}

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    global_summary_dir = join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, 'df')
    file_names = os.listdir(global_summary_dir)
    topic_df_dict[collection_name].update(
        (file_name.split('.')[0], pd.read_parquet(join(global_summary_dir, file_name)))
        for file_name in file_names
    )

In [None]:
# Sum the topic frequencies of every loaded frame, across all collections, into
# one vector with num_topics + 1 entries, stored under the 'global' key.
num_topics = NUM_TOPICS['global']
global_topic_freq_list = np.zeros(num_topics+1)

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    for monthly_df in topic_df_dict[collection_name].values():
        global_topic_freq_list += monthly_df['topic_freq'].values

topic_freq_dict['global'] = global_topic_freq_list
            

In [None]:
# Persist the comment-topic frequency totals.
comment_topic_freq_path = join('result', 'comment_topic_freq_dict.pkl')
with open(comment_topic_freq_path, 'wb') as out_file:
    pickle.dump(topic_freq_dict, out_file)

### 3A-2. Title frequency ranking

In [None]:
# Collect, per collection, the top title topics of every article that has more
# than `comment_threshold` comments: one row per article, one column per rank.
comment_threshold = 10
topic_num_threshold = 3  # get triplet (top 3) topics
title_topic_dict = {}

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    model_name = MODEL_NAMES[collection_name]
    num_topics = NUM_TOPICS[collection_name]
    start_date, end_date = DATE_RANGES[collection_name]
    
    file_names = os.listdir(join('article', collection_name.lower(), model_name, 'articles_by_day'))
    keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))
    title_topic_list = []
    
    for key in keys:
        print(key)     
        articles = pd.read_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', key +'.parquet'))
        articles = articles[articles['comment_id'].apply(len) > comment_threshold]
        # Days without qualifying articles are skipped (np.vstack needs at least one row).
        if len(articles):
            title_topic_list.append(np.vstack(articles['topic_num'].values)[:, :topic_num_threshold])
    title_topic_dict[collection_name] = np.vstack(title_topic_list)

# Save the title topics built above. The original dumped `topic_freq_dict` here,
# so the file never contained the title topics.
with open('result/title_topic_dict.pkl', 'wb') as f:
    pickle.dump(title_topic_dict, f)

In [None]:
for collection_name in COLLECTION_NAMES:
    print(collection_name)
    model_name = MODEL_NAMES[collection_name]
    num_topics = NUM_TOPICS[collection_name]
    start_date, end_date = DATE_RANGES[collection_name]
    
    file_names = os.listdir(join('article', collection_name.lower(), model_name, 'articles_by_day'))
    keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))
    title_topic_list = []
    
    for key in keys:
        print(key)     
        articles = pd.read_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', key +'.parquet'))
        articles = articles[articles['comment_id'].apply(len) > comment_threshold]
        if len(articles):
            title_topic_list.append(np.vstack(articles['topic_num'].va

## 3B. Number of Article / Comment Dict

In [None]:
#threshold_key = next(key for key in reversed(cumulative_counter_article.keys()) if key >= comment_threshold)

# For every collection and every day, build two cumulative tables keyed by
# "comments per article" (walked from the largest value down):
#   * number of articles having at least that many comments
#   * number of comments contained in those articles
for collection_name in COLLECTION_NAMES:
    print(collection_name)
    model_name = MODEL_NAMES[collection_name]
    comment_threshold = 10

    day_dir = join('article', collection_name.lower(), model_name, 'articles_by_day')
    file_names = os.listdir(day_dir)
    keys = sorted(
        (file_name.split('.')[0] for file_name in file_names),
        key=lambda x: datetime.strptime(x, '%Y-%m-%d'),
    )

    article_num_counter_dict = {}
    comment_num_counter_dict = {}

    for key in keys:
        print(key)
        articles = pd.read_parquet(join(day_dir, key + '.parquet'))

        # comments-per-article -> number of articles, ascending by comment count
        sorted_counter = OrderedDict(sorted(Counter(articles['comment_id'].apply(len).values).items()))

        cumulative_counter_article = OrderedDict()
        cumulative_counter_comment = {}
        articles_so_far = 0
        comments_so_far = 0
        # Descend from the most-commented articles, keeping running totals.
        for n_comments in reversed(sorted_counter):
            n_articles = sorted_counter[n_comments]
            articles_so_far += n_articles
            comments_so_far += n_comments * n_articles
            cumulative_counter_article[n_comments] = articles_so_far
            cumulative_counter_comment[n_comments] = comments_so_far

        article_num_counter_dict[key] = cumulative_counter_article
        comment_num_counter_dict[key] = cumulative_counter_comment

    with open(join('article', collection_name.lower(), model_name, 'article_num_counter_dict.pkl'), 'wb') as f:
        pickle.dump(article_num_counter_dict, f)
    with open(join('article', collection_name.lower(), model_name, 'comment_num_counter_dict.pkl'), 'wb') as f:
        pickle.dump(comment_num_counter_dict, f)

## 3C. Multiplier distribution

In [None]:
from scipy.stats import rankdata

# for each month, calculate the total frequency of comment's topic number
def get_topic_frequency_by_month(articles_by_month, exclude_neg_one=False):
    """Count comment topics per month.

    Args:
        articles_by_month: mapping of month -> table of articles; each row is
            read with ``.iloc`` and must expose an iterable ``'comment_topics'``.
        exclude_neg_one: when true, topic ``-1`` is dropped before counting.

    Returns:
        dict mapping each month to a ``Counter`` of comment topic numbers.
    """
    def _count_month(articles):
        # Flatten the per-article topic lists of this month into one list.
        topics = [
            topic
            for row in range(len(articles))
            for topic in articles.iloc[row]['comment_topics']
        ]
        if exclude_neg_one:
            topics = [topic for topic in topics if topic != -1]
        return Counter(topics)

    return {month: _count_month(articles) for month, articles in articles_by_month.items()}

# sum all counters for all months
def get_topic_frequency(topic_freq_by_month):
    """Merge the per-month topic counters into one overall ``Counter``.

    Only the values of ``topic_freq_by_month`` are used; the month keys are ignored.
    """
    combined = Counter()
    for monthly_counter in topic_freq_by_month.values():
        combined += monthly_counter
    return combined

def get_topic_frequency_by_month_self(articles_by_month, rank, exclude_neg_one=False):
    """Count comment topics per month, grouped by the article's own topic.

    Args:
        articles_by_month: mapping of month -> table of articles; rows are read
            with ``.iloc`` and must expose ``'topic_num'`` (indexable) and
            ``'comment_topics'`` (iterable).
        rank: 1-based position inside ``'topic_num'`` that selects which of the
            article's topics is used as the grouping key.
        exclude_neg_one: when true, articles whose selected topic is ``-1`` are
            skipped and comments with topic ``-1`` are not counted. When false,
            every article and every comment is counted.

    Returns:
        dict ``month -> {article_topic -> Counter(comment topic numbers)}``.
    """
    topic_freq_by_month = {}
    for month, articles in articles_by_month.items():
        comment_topics = defaultdict(list)
        for i in range(len(articles)):
            article = articles.iloc[i]
            article_topic = article['topic_num'][rank-1]
            if exclude_neg_one and article_topic == -1:
                continue
            topics = article['comment_topics']
            if exclude_neg_one:
                topics = [t for t in topics if t != -1]
            # Previously articles were only collected inside the
            # `exclude_neg_one` branch, so the default call returned nothing.
            comment_topics[article_topic].extend(topics)
        topic_freq_by_month[month] = {
            key: Counter(topics) for key, topics in comment_topics.items()
        }
    return topic_freq_by_month

# sum all counter for all topics for all months
def get_topic_frequency_self(topic_freq_by_month_self):
    """Collapse ``month -> {article_topic -> Counter}`` over months.

    Returns a ``defaultdict(Counter)`` mapping each article topic to the sum of
    its per-month counters.
    """
    totals = defaultdict(Counter)
    for per_topic_counters in topic_freq_by_month_self.values():
        for article_topic, counts in per_topic_counters.items():
            totals[article_topic] += counts
    return totals

In [None]:
# Aggregation based on previous (windowed) ranking + comm
#
# For each collection the months are walked in dictionary order. The first
# `window_size` months only seed a rolling window of per-topic comment shares.
# For every later month each article is scored, for each of its first three
# global topics, by
#     (share of its comments on that topic) / (mean share of that topic in the window)
# and filed under the topic's rank in the window. The window then slides by one month.

aggregate_day = 'month'
exclude_neg_one = True
breaker = False  # NOTE(review): not read anywhere in this cell
window_size = 12  # number of months in the rolling frequency window
ratio_list_dict = {}

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    num_topics = NUM_TOPICS['global']
    with open(join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, f'topic_dict_{aggregate_day}.pkl'), 'rb') as f:
        articles_by_month = pickle.load(f)
    
    articles_by_month = articles_by_month_clear(articles_by_month, exclude_neg_one=exclude_neg_one)
    topic_freq_by_month = get_topic_frequency_by_month(articles_by_month, exclude_neg_one)

    ratio_list_list = []

    first = True
    counter = 0
    last_month = None
    
    # topic -> list of that topic's comment share for each month currently in the window
    freq_dict = {i:[] for i in range(num_topics)}

    for month, data in topic_freq_by_month.items():
        print(month)

        if first:  ## constructing rank/freq dict for the first(windowed) month
            counter += 1
    
            total_comment_num = sum(data.values())
            
            for topic, count in data.items():
                freq_dict[topic].append(count / total_comment_num)

            # Topics with no comment this month contribute a share of 0.
            for t in list(set(range(num_topics))-set(data.keys())):
                freq_dict[t].append(0)

            if counter == window_size:
                # Rank 0 = highest mean share over the window (ties share the larger rank).
                ranks = rankdata([-np.mean(freq_dict[key]) for key in freq_dict.keys()], method='max')
                rank_dict = {key: rank-1 for key, rank in zip(freq_dict.keys(), ranks)}

                first = False
                    
            last_month = month
            continue
        
        articles = articles_by_month[month]
        total_comment_num = sum(data.values())

        # ratio_list[tier][rank] -> list of per-article records
        if exclude_neg_one:
            ratio_list = [[[] for _ in range(num_topics)] for _ in range(3)]
        else:
            ratio_list = [[[] for _ in range(num_topics+1)] for _ in range(3)]

        for i in range(len(articles)):
            article = articles.iloc[i]
            # Always compare as an array: `list == scalar` is a single False for a
            # plain Python int, which silently turned every ratio into 0.0.
            if exclude_neg_one:
                comment_topics = np.asarray([t for t in article['comment_topics'] if t != -1])
            else:
                comment_topics = np.asarray(article['comment_topics'])
                
            tmp_ratio_list = []
            for tier in range(3):
                article_topic = article['topic_num_global'][tier]
                comment_ratio = np.mean(comment_topics == article_topic)
                comment_expected_ratio = np.mean(freq_dict[article_topic])
                if comment_expected_ratio > 0:
                    tmp_ratio_list.append(comment_ratio/comment_expected_ratio)
                else:
                    # Topic absent from the whole window: multiplier undefined, flagged as -1.
                    tmp_ratio_list.append(-1)
                
            for tier in range(3):
                rem_list = [0, 1, 2]
                rem_list.remove(tier)
                # Record: [article's share of the month's comments, own multiplier,
                #          ranks of the two other tiers, multipliers of the two other tiers]
                ratio_list[tier][rank_dict[article['topic_num_global'][tier]]].append([len(comment_topics)/total_comment_num, tmp_ratio_list[tier], rank_dict[article['topic_num_global'][rem_list[0]]], rank_dict[article['topic_num_global'][rem_list[1]]], tmp_ratio_list[rem_list[0]], tmp_ratio_list[rem_list[1]]])

        ratio_list_list.append(ratio_list)

        ## constructing rank/freq dict for this month
        for topic, count in data.items():
            freq_dict[topic].pop(0)  # remove the oldest element
            freq_dict[topic].append(count / total_comment_num) 

        for t in list(set(range(num_topics))-set(data.keys())):
            freq_dict[t].pop(0)
            freq_dict[t].append(0)

        ranks = rankdata([-np.mean(freq_dict[key]) for key in freq_dict.keys()], method='max')
        rank_dict = {key: rank-1 for key, rank in zip(freq_dict.keys(), ranks)}

    ratio_list_dict[collection_name] = ratio_list_list
     
with open(join('result', f'freq_mult_rank_w{window_size}_global_dict.pkl'), 'wb') as f:
    pickle.dump(ratio_list_dict, f)

## 3D. On/Off-topic distribution

In [None]:
# Per collection and month, split comment topics into "corr" (the comment's topic
# is among the article's first three global topics) and "off_corr" (it is not),
# then store both the raw per-topic counts and their normalised rank positions.
ranking_month_list_dict = {'off_corr':{}, 'corr':{}}
frequency_month_list_dict = {'off_corr':{}, 'corr':{}}
exclude_neg_one = True

for collection_name in COLLECTION_NAMES:
    aggregate_day = 'month'
    num_topics = NUM_TOPICS['global']

    topic_dict_path = join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, f'topic_dict_{aggregate_day}.pkl')
    with open(topic_dict_path, 'rb') as f:
        articles_by_month = pickle.load(f)

    articles_by_month = articles_by_month_clear(articles_by_month, exclude_neg_one=exclude_neg_one)

    corr_ranking_month_list, off_corr_ranking_month_list = [], []
    corr_frequency_list, off_corr_frequency_list = [], []

    for month, articles in articles_by_month.items():
        print(month)

        corr_counter = Counter()
        off_corr_counter = Counter()

        for row in range(len(articles)):
            article = articles.iloc[row]
            article_topics = article['topic_num_global'][:3]
            comment_topics = np.array(article['comment_topics'])
            comment_topics = comment_topics[comment_topics != -1]
            on_topic = np.isin(comment_topics, article_topics)
            corr_counter.update(comment_topics[on_topic])
            off_corr_counter.update(comment_topics[~on_topic])

        # add 0 to non-exist topics
        for topic in range(num_topics):
            corr_counter.setdefault(topic, 0)
            off_corr_counter.setdefault(topic, 0)

        # Reorder the per-topic counts by the externally defined global ranking.
        global_order = ranking_dict['global']
        corr_list = np.array([corr_counter[topic] for topic in range(num_topics)])[global_order]
        off_corr_list = np.array([off_corr_counter[topic] for topic in range(num_topics)])[global_order]

        # Double argsort turns counts into rank positions (0 = most frequent).
        corr_ranking_month_list.append(np.argsort(np.argsort(-corr_list))/num_topics)
        off_corr_ranking_month_list.append(np.argsort(np.argsort(-off_corr_list))/num_topics)
        corr_frequency_list.append(corr_list)
        off_corr_frequency_list.append(off_corr_list)

    ranking_month_list_dict['corr'][collection_name] = np.array(corr_ranking_month_list)
    ranking_month_list_dict['off_corr'][collection_name] = np.array(off_corr_ranking_month_list)
    frequency_month_list_dict['corr'][collection_name] = np.array(corr_frequency_list)
    frequency_month_list_dict['off_corr'][collection_name] = np.array(off_corr_frequency_list)

with open(join('result', 'corr_rank_freq_global_dict.pkl'), 'wb') as f:
    pickle.dump({'ranking_month_list_dict': ranking_month_list_dict, 'frequency_month_list_dict': frequency_month_list_dict}, f)

## 3E. Comment inter-on-topic similarity

In [12]:
from joblib import Parallel, delayed

def _mean_pairwise_cosine(A, B):
    """Mean cosine similarity over every (row of A, row of B) pair.

    Uses the identity ``sum_ij a_i . b_j == (sum_i a_i) . (sum_j b_j)`` on the
    unit-normalised rows, so no pairwise similarity matrix is ever built.

    Returns the integer 0 when either input is empty.
    """
    if len(A) == 0 or len(B) == 0:
        return 0

    # float64 keeps the row sums accurate even for many low-precision rows.
    A = np.asarray(A, dtype=np.float64)
    B = np.asarray(B, dtype=np.float64)

    # Normalize (an all-zero row yields NaN, as a plain cosine would)
    A_unit = A / np.linalg.norm(A, axis=1, keepdims=True)
    B_unit = B / np.linalg.norm(B, axis=1, keepdims=True)

    return float(A_unit.sum(axis=0) @ B_unit.sum(axis=0)) / (len(A) * len(B))


def cosine_similarity_batch_list(X_list, a_index, b_index, batch_size=1000):
    """Average pairwise cosine similarity between ``X_list[a_index]`` and ``X_list[b_index]``.

    Args:
        X_list: indexable container of row-vector collections.
        a_index, b_index: positions of the two collections to compare.
        batch_size: accepted for backward compatibility; the closed-form
            computation needs no batching, so the value is not used.

    Returns:
        The mean similarity as a float, or 0 if either collection is empty.
    """
    return _mean_pairwise_cosine(X_list[a_index], X_list[b_index])


def cosine_similarity_batch_dict(X_dict, a_index, b_index, i, batch_size=1000):
    """Average pairwise cosine similarity for slot ``i`` of ``X_dict[(a_index, b_index)]``.

    ``X_dict[(a_index, b_index)][i]`` must be a pair ``[A, B]`` of row-vector
    collections; the result is the mean similarity between rows of ``A`` and
    rows of ``B``.

    Args:
        X_dict: mapping keyed by ``(a_index, b_index)`` tuples.
        a_index, b_index: the two components of the key.
        i: slot inside the value stored under that key.
        batch_size: accepted for backward compatibility; not used.

    Returns:
        The mean similarity as a float, or 0 if either collection is empty.
    """
    A, B = X_dict[(a_index, b_index)][i][0], X_dict[(a_index, b_index)][i][1]
    return _mean_pairwise_cosine(A, B)

def compute_embeddings(article, is_global):
    """Sum an article's comment embeddings separately for each of its first three topics.

    Args:
        article: object exposing ``comment_embeddings`` (indexable by an index
            array), ``comment_topics`` and ``topic_num_global`` / ``topic_num``.
        is_global: pick ``topic_num_global`` when true, ``topic_num`` otherwise.

    Returns:
        ``(summed_embeddings, counts)``: two lists with one entry per selected
        topic. A topic without any matching comment gets a 384-long zero vector
        and a count of 0.
    """
    embeddings = article.comment_embeddings
    leading_topics = (article.topic_num_global if is_global else article.topic_num)[:3]

    summed_embeddings = []
    counts = []

    for topic in leading_topics:
        # Positions of the comments assigned to this topic.
        hits = np.nonzero(np.isin(article.comment_topics, topic))[0]
        if len(hits) == 0:
            summed_embeddings.append(np.zeros(384))
            counts.append(0)
            continue
        summed_embeddings.append(np.sum(embeddings[hits], axis=0))
        counts.append(len(hits))

    return summed_embeddings, counts

In [None]:
# comment topic embeddings similarity (chunk)
#
# For each collection and month, and for each pair (a, b) taken from an article's
# first three global topics, compare:
#   * "other": embeddings of comments on topic a vs. topic b, taken only from
#     articles whose first three topics do NOT include that topic;
#   * "self":  per-article summed embeddings of comments on a vs. b, taken from
#     articles that carry the pair, kept separately per pair position (3 slots).

aggregate_day_list = ['month']
num_topics = NUM_TOPICS['global']
N = 1000  # chunk size  (NOTE(review): not used below; the helpers are called with batch_size=5000)
is_global = True

now = datetime.now()

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    final_dict = {}
    
    for aggregate_day in aggregate_day_list:
        print(aggregate_day)

        file_names = os.listdir(join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, 'df'))
        keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))

        # NOTE(review): assumes the generator yields one frame per entry of `keys`, in the same order.
        gen = tmp_df_generator(collection_name, aggregate_day, include_embeddings=True, is_global=is_global)

        similarity_other_list = []
        similarity_self_list_list = [[] for _ in range(3)]

        for month in sorted(keys):
            # (topic_a, topic_b) -> 3 slots (pair position) x 2 sides -> list of summed embeddings
            comments_embeddings_dict_self = defaultdict(lambda: [[[], []] for _ in range(3)])
            comments_embeddings_list_others = [[] for _ in range(num_topics)]
            
            articles = next(gen)
            
            articles["comment_embeddings"] = articles["comment_embeddings"].apply(np.array)
            topic_tuples = np.stack(articles["topic_num_global"].apply(lambda x: x[:3]))
            pair_list = [list(combinations(t, 2)) for t in topic_tuples]

            articles[["topic_tuple_embeddings", "topic_tuple_num"]] = articles.apply(lambda x: pd.Series(compute_embeddings(x, is_global)), axis=1)

            articles = articles.to_numpy() # columns : Index(['_id', 'topic_num_global', 'comment_topics', 'comment_embeddings', 'topic_tuple_embeddings', 'topic_tuple_num'], dtype='object')
            # Position of each pair's two members inside the topic triplet.
            c_order = [[0, 1], [0, 2], [1, 2]]
            
            for i in range(len(pair_list)):
                pairs = pair_list[i]
                topic_tuple_num_list = articles[i, 5]
                topic_tuple_embeddings_list = articles[i, 4]

                for j, (a, b) in enumerate(pairs):
                    if -1 not in (a, b):
                        for k in range(2):
                            if topic_tuple_num_list[c_order[j][k]] > 0:
                                comments_embeddings_dict_self[pairs[j]][j][k].append(topic_tuple_embeddings_list[c_order[j][k]])

            comments_embeddings_dict_self_dict = dict(comments_embeddings_dict_self)

            for i in range(num_topics):
                # One boolean per ARTICLE: True when topic i is in none of its three
                # topics. (A 2-D element-wise mask would repeat rows and let
                # on-topic articles through.)
                off_topic_rows = np.where(~(topic_tuples == i).any(axis=1))[0]

                for idx in off_topic_rows:
                    article = articles[idx]
                    comment_indices = [j for j, topic in enumerate(article[2]) if topic == i]
                    if len(comment_indices) > 0:
                        comments_embeddings_list_others[i].extend(article[3][comment_indices])

            gc.collect()
            
            # Every topic pair seen this month is scored once for "other" and once per "self" slot.
            pair_keys = list(comments_embeddings_dict_self_dict.keys())

            similarity_other_list.extend([cosine_similarity_batch_list(comments_embeddings_list_others, a_index, b_index, batch_size=5000) for a_index, b_index in pair_keys])
            for i in range(3):
                similarity_self_list_list[i].extend([cosine_similarity_batch_dict(comments_embeddings_dict_self_dict, a_index, b_index, i, batch_size=5000) for a_index, b_index in pair_keys])
            print(datetime.now() - now)
            now = datetime.now()
            
        big_dict = {
            'similarity_other_list': np.array(similarity_other_list),
            'similarity_self_list_list': np.array(similarity_self_list_list)
        }
        
        final_dict[aggregate_day] = big_dict

    with open(join('result', f'comment_sim_{collection_name}_global_dict.pkl'), 'wb') as f:
        pickle.dump(final_dict, f)

Gatewaypundit
month
month, 2015-01-01
0:00:06.411923
month, 2015-02-01
0:00:06.488691
month, 2015-03-01
0:00:08.294389
month, 2015-04-01
0:00:05.140286
month, 2015-05-01
0:00:08.943350
month, 2015-06-01
0:00:07.224641
month, 2015-07-01
0:00:10.068584
month, 2015-08-01
0:00:09.363868
month, 2015-09-01
0:00:10.400669
month, 2015-10-01
0:00:09.594250
month, 2015-11-01
0:00:09.867625
month, 2015-12-01
0:00:12.341596
month, 2016-01-01
0:00:11.488747
month, 2016-02-01
0:00:20.326740
month, 2016-03-01
0:00:30.237979
month, 2016-04-01
0:00:29.169660
month, 2016-05-01
0:00:19.166323
month, 2016-06-01
0:00:22.193229
month, 2016-07-01
0:00:23.299880
month, 2016-08-01
0:00:32.761280
month, 2016-09-01
0:00:40.731062
month, 2016-10-01
0:00:39.718204
month, 2016-11-01
0:00:42.545227
month, 2016-12-01
0:00:23.777604
month, 2017-01-01
0:00:41.323773
month, 2017-02-01
0:00:44.206165
month, 2017-03-01
0:00:42.258832
month, 2017-04-01
0:00:32.741119
month, 2017-05-01
0:00:55.005360
month, 2017-06-01
0:00:

In [None]:
# comment topic embeddings similarity (chunk)
#
# Variant of the similarity computation restricted to one collection, using
# sklearn's `cosine_similarity` on explicit chunks of N rows. A topic pair is
# recorded only when both of its "other" embedding lists are non-empty.

aggregate_day_list = ['month']
num_topics = NUM_TOPICS['global']

is_global=True

now = datetime.now()

for collection_name in ['Gatewaypundit']:
    print(collection_name)
    final_dict = {}
    
    for aggregate_day in aggregate_day_list:
        print(aggregate_day)

        file_names = os.listdir(join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, 'df'))
        keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))

        # NOTE(review): assumes the generator yields one frame per entry of `keys`, in the same order.
        gen = tmp_df_generator(collection_name, aggregate_day, include_embeddings=True, is_global=is_global)

        similarity_other_list = []
        similarity_self_list_list = []

        for month in sorted(keys):
            print(month)
            # (topic_a, topic_b) -> 3 slots (pair position) x 2 sides -> list of summed embeddings
            comments_embeddings_dict_self = defaultdict(lambda: [[[], []] for _ in range(3)])
            comments_embeddings_list_others = [[] for _ in range(num_topics)]
            
            articles = next(gen)
            
            articles["comment_embeddings"] = articles["comment_embeddings"].apply(np.array)
            topic_tuples = np.stack(articles["topic_num_global"].apply(lambda x: x[:3]))
            pair_list = [list(combinations(t, 2)) for t in topic_tuples]

            articles[["topic_tuple_embeddings", "topic_tuple_num"]] = articles.apply(lambda x: pd.Series(compute_embeddings(x, is_global)), axis=1)

            articles = articles.to_numpy() # columns : Index(['_id', 'topic_num_global', 'comment_topics', 'comment_embeddings', 'topic_tuple_embeddings', 'topic_tuple_num'], dtype='object')
            # Position of each pair's two members inside the topic triplet.
            c_order = [[0, 1], [0, 2], [1, 2]]
            
            for i in range(len(pair_list)):
                pairs = pair_list[i]
                topic_tuple_num_list = articles[i, 5]
                topic_tuple_embeddings_list = articles[i, 4]

                for j, (a, b) in enumerate(pairs):
                    if -1 not in (a, b):
                        for k in range(2):
                            if topic_tuple_num_list[c_order[j][k]] > 0:
                                comments_embeddings_dict_self[pairs[j]][j][k].append(topic_tuple_embeddings_list[c_order[j][k]])

            comments_embeddings_dict_self_dict = dict(comments_embeddings_dict_self)

            for i in range(num_topics):
                # One boolean per ARTICLE: True when topic i is in none of its three
                # topics. (A 2-D element-wise mask would repeat rows and let
                # on-topic articles through.)
                off_topic_rows = np.where(~(topic_tuples == i).any(axis=1))[0]

                for idx in off_topic_rows:
                    article = articles[idx]
                    comment_indices = [j for j, topic in enumerate(article[2]) if topic == i]
                    if len(comment_indices) > 0:
                        comments_embeddings_list_others[i].extend(article[3][comment_indices])

            gc.collect()
                        
            N = 5000  # chunk size

            for key in comments_embeddings_dict_self_dict.keys():

                others_a = comments_embeddings_list_others[key[0]]
                others_b = comments_embeddings_list_others[key[1]]
                
                if len(others_a) > 0 and len(others_b) > 0:
                    
                    # Mean similarity over all off-topic comment pairs, accumulated chunk by chunk.
                    similarity_other = 0
                    
                    for i in range(0, len(others_a), N):
                        for j in range(0, len(others_b), N):
                            similarity_other += np.sum(cosine_similarity(others_a[i:i+N], others_b[j:j+N]))
                    
                    similarity_other /= (len(others_a) * len(others_b))
                    
                    similarity_self_list = []
                    
                    for slot in range(3):
                        self_a = comments_embeddings_dict_self_dict[key][slot][0]
                        self_b = comments_embeddings_dict_self_dict[key][slot][1]
                        
                        if len(self_a) > 0 and len(self_b) > 0:
                            similarity_self = 0
                            for i in range(0, len(self_a), N):
                                for j in range(0, len(self_b), N):
                                    # Slice per chunk: passing the full vectors here would add
                                    # the complete sum once for every chunk combination.
                                    similarity_self += np.sum(cosine_similarity(self_a[i:i+N], self_b[j:j+N]))
                            
                            similarity_self /= (len(self_a) * len(self_b))        
                            similarity_self_list.append(similarity_self)
                        else:
                            similarity_self_list.append(0)
                        
                    #print(f'{key}, other : {similarity_other}, self : {similarity_self_list}')
                    similarity_other_list.append(similarity_other)
                    similarity_self_list_list.append(similarity_self_list)

            print(datetime.now() - now)
            now = datetime.now()

        big_dict = {
            'similarity_other_list': similarity_other_list,
            'similarity_self_list_list': similarity_self_list_list
        }
        
        final_dict[aggregate_day] = big_dict

    #with open(join('result', f'comment_sim_{collection_name}_global_2_dict.pkl'), 'wb') as f:
    #    pickle.dump(final_dict, f)

## 3F. Individual trajectory

In [None]:
from proj_utils import _init_mongo_collection

# user_id -> four parallel lists: comment ids, article ids, collection index, createdAt
user_dict = defaultdict(lambda: [[], [], [], []])

for coll_counter, collection_name in enumerate(COLLECTION_NAMES):

    print(coll_counter, collection_name)

    select_columns = ['_id', 'user_id', 'createdAt', 'art_id']
    mongo_client, collection= _init_mongo_collection('Comments', collection_name)

    counter = 0

    # Stream every comment of the collection; comments without a user_id are skipped.
    for counter, doc in enumerate(collection.find({}, ['user_id', '_id', 'art_id', 'createdAt']), start=1):

        author = doc['user_id']
        if author is not None:
            comment_ids, article_ids, collection_ids, created_times = user_dict[author]
            comment_ids.append(doc['_id'])
            article_ids.append(doc['art_id'])
            collection_ids.append(coll_counter)
            created_times.append(doc['createdAt'])

        if counter % 1000000 == 0:
            print(counter)

    print(collection_name + ' done')

user_dict = dict(user_dict)
# sort by createdAt
for key, (comment_ids, article_ids, collection_ids, created_times) in user_dict.items():
    indices = np.argsort(created_times)
    times_sorted = np.array(created_times)[indices]
    user_dict[key] = [
        np.array(comment_ids)[indices],
        np.array(article_ids)[indices],
        np.array(collection_ids)[indices],
        times_sorted,
        len(indices),                                # number of comments by this user
        (times_sorted[-1] - times_sorted[0]).days,   # days between first and last comment
    ]

df = pd.DataFrame(
    [[key, *val] for key, val in user_dict.items()],
    columns=['user_id', 'comment_id', 'article_id', 'collection_id', 'createdAt', 'num_comments', 'age'],
)

# save df
df.to_parquet('result/user_trajectory_df.parquet', compression='gzip')

0 Gatewaypundit


1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
Gatewaypundit done
1 Breitbart
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
41000000
42000000
43000000
44000000
45000000
46000000
47000000
48000000
49000000
50000000
51000000
52000000
53000000
54000000
55000000
56000000
57000000
58000000
59000000
60000000
61000000
62000000
63000000
64000000
65000000
66000000
67000000
68000000
69000000
700000

In [85]:
# Per collection: how many comments lack a user_id, how many have one, and the missing share.
for coll_counter, collection_name in enumerate(COLLECTION_NAMES):

    print(coll_counter, collection_name)

    select_columns = ['_id', 'user_id', 'createdAt', 'art_id']
    mongo_client, collection= _init_mongo_collection('Comments', collection_name)

    # find how many of the document in collection has user_id is None
    count1 = collection.count_documents({"user_id": None})
    count2 = collection.count_documents({"user_id": {"$ne": None}})
    missing_share = count1 / (count1 + count2)

    print(count1, count2, missing_share)

1992542

In [86]:
df

Unnamed: 0,user_id,comment_id,article_id,collection_id,createdAt,num_comments,age
0,98984140,"[1296188855, 1538670905, 1538673711, 153867633...","[2479818491, 2919574806, 2919574806, 291957480...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2014-03-22 08:48:35, 2014-08-12 08:07:51, 201...",95544,1775
1,119681868,"[1766699258, 1767767286, 1768599205, 176870299...","[3377750257, 3381059643, 3382088630, 338352429...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2015-01-01 00:03:48, 2015-01-01 22:50:21, 201...",706,2026
2,80705336,"[1766708109, 1766781995, 1767283930, 176779769...","[3378186944, 3378186944, 3379967272, 337818694...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2015-01-01 00:14:33, 2015-01-01 01:49:58, 201...",653,1225
3,46987845,"[848139831, 1318705113, 1353706075, 1399594770...","[1178114936, 2586046961, 2636003224, 270476032...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, ...","[2013-03-31 23:44:39, 2014-04-04 17:14:44, 201...",29862,3703
4,137569935,[1766728393],[3377148162],[0],[2015-01-01 00:33:40],1,0
...,...,...,...,...,...,...,...
1992537,386700367,[5945168033],[9303859903],[4],[2022-08-12 06:15:40],1,0
1992538,380909193,[5710779204],[8989006764],[4],[2022-01-27 21:08:27],1,0
1992539,345270447,[5716815397],[8998684079],[4],[2022-02-01 01:18:27],1,0
1992540,381459559,[5730145969],[9013121563],[4],[2022-02-11 13:52:03],1,0


In [100]:
COLLECTION_NAMES

['Gatewaypundit', 'Breitbart', 'Thehill', 'Atlantic', 'Motherjones']

In [108]:
# Spot check: open the 'Articles' collection of Breitbart so that an article can be looked up by id.
collection_name = 'Breitbart'

mongo_client_articles, collection = _init_mongo_collection('Articles', collection_name)

def get_articles_from_ids(collection, ids, select_columns):
    """Fetch the documents whose ``_id`` is in ``ids`` and return them as a DataFrame.

    Parameters
    ----------
    collection : object exposing ``find(query, projection)`` (e.g. a MongoDB collection).
    ids : a single id given as ``str``/``bytes``, or any iterable of ids
        (list, tuple, generator, pandas Series, numpy array, ...).
    select_columns : iterable of field names to include in the projection.

    Returns
    -------
    pandas.DataFrame with one row per matching document (empty when nothing matches).
    """
    # `$in` needs a plain array. A bare string must be wrapped rather than split into
    # characters, and other iterables are materialised so they can be serialised.
    if isinstance(ids, (str, bytes)):
        id_list = [ids]
    else:
        id_list = list(ids)

    query = {'_id': {'$in': id_list}}
    projection = {k: 1 for k in select_columns}
    articles = collection.find(query, projection)
    # Materialise the cursor explicitly so the DataFrame is built from concrete records.
    return pd.DataFrame(list(articles))

# Spot-check a single article id and show its title and creation time.
art = get_articles_from_ids(collection, ['690516888'], ['_id', 'clean_title', 'createdAt'])
art

Unnamed: 0,_id,createdAt,clean_title
0,690516888,2012-05-15 15:26:38,Obama Inserts Himself in White House Bios of P...


# 4. ETC

## Global ↔ Community model comparison

In [None]:
# Topic embeddings of each community model, in COLLECTION_NAMES order.
topic_embeddings_list = []

for collection_name in COLLECTION_NAMES:
    model_name = MODEL_NAMES[collection_name]
    output_path = join("result", collection_name.lower(), model_name + '_output.pkl')
    with open(output_path, 'rb') as f:
        output = pickle.load(f)
    topic_embeddings_list.append(output['topic_embeddings'])

# Topic embeddings of the global model, read from the same kind of output file.
global_output_path = join("result", "global", MODEL_NAMES["global"] + "_output.pkl")
with open(global_output_path, 'rb') as f:
    output = pickle.load(f)
global_topic_embedding = output['topic_embeddings']

In [None]:
# For every community model, match its selected topics against the global model's topics.
cos_sim_list_embedding = []
matching_list_embedding = []
cutoff = 20

# NOTE(review): communities are addressed by position, so this assumes `convert_list_dict`
# has the same key order as `topic_embeddings_list` - confirm.
convert_keys = list(convert_list_dict.keys())

for i in range(len(topic_embeddings_list)):
    # First `cutoff` entries of the community's index list, shifted by one before indexing
    # the embedding matrix (presumably to skip a leading outlier row - TODO confirm).
    comment_freq_top_indices = convert_list_dict[convert_keys[i]][:cutoff] + 1
    topic_embeddings = topic_embeddings_list[i][comment_freq_top_indices]

    dense_list = [topic_embeddings, global_topic_embedding]
    matching, score_list = cos_sim_between_models(dense_list, None, None, 'embeddings', 0, 1)
    #matching = [matching[j] for j in np.argsort(score_list)[::-1]]
    #cos_sim_list_embedding.append(sorted(score_list, reverse=True))
    matching_list_embedding.append(matching)
    cos_sim_list_embedding.append(score_list)

In [None]:
# Plot the first 20 entries of each community's score list, one line per community.
# The community count (5) is hard-coded here.
for i in range(5):
    plt.plot(cos_sim_list_embedding[i][:20], label=COLLECTION_NAMES[i])
plt.legend()
plt.title('Comment, Full matching')

In [None]:
# Plot the first 30 entries of each community's score list, one line per community.
# NOTE(review): the title says "Top 20" but the slice is [:30]; confirm which one is intended.
for i in range(5):
    plt.plot(cos_sim_list_embedding[i][:30], label=COLLECTION_NAMES[i])
plt.legend()
plt.title('Comment, Top 20 matching')

In [None]:
# Same thing for title
# Show keywords for each matching

In [None]:
# Overlay every community's score curve in its own colour, zoomed to the 0.9-1.0 band.
for i, scores in enumerate(cos_sim_list_embedding):
    plt.plot(scores, label=COLLECTION_NAMES[i], alpha=0.5)
plt.legend()
plt.xlim([0, 50])
plt.ylim([0.9, 1])

In [None]:
# Persist both result lists in one pickle under result/global so the matching does not have to be recomputed.
with open(join('result', 'global', 'global_matching_dict.pkl'), 'wb') as f:
    pickle.dump({'cos_sim_list_embedding': cos_sim_list_embedding, 'matching_list_embedding': matching_list_embedding}, f)

In [None]:
# Reload the dictionary written by the save cell (keys: 'cos_sim_list_embedding', 'matching_list_embedding').
# pickle.load executes arbitrary code from the file, so only load files produced by this project.
with open(join('result', 'global', 'global_matching_dict.pkl'), 'rb') as f:
    df_dict = pickle.load(f)

In [None]:
# Restore the score lists from the reloaded dictionary so the plotting cells can run without recomputation.
cos_sim_list_embedding = df_dict['cos_sim_list_embedding']

In [None]:
# Show the stored matchings (one entry per community).
df_dict['matching_list_embedding']

In [None]:
# Load result/matching_dict.pkl into the scratch name `x`.
# NOTE(review): `x` is not used by the cells that follow in this section - confirm this cell is still needed.
with open(join('result', 'matching_dict.pkl'), 'rb') as f:
    x = pickle.load(f)

## Check transform inconsistency

In [None]:
def _first_of_month(*year_months):
    """Return a tuple of midnight datetimes for the 1st day of each (year, month) pair."""
    return tuple(datetime(year, month, 1, 0, 0) for year, month in year_months)


# Five probe months per community at which the transform consistency check is run.
test_date_dict = {
    'Atlantic': _first_of_month((2013, 3), (2014, 4), (2015, 5), (2016, 6), (2017, 7)),
    'Gatewaypundit': _first_of_month((2015, 5), (2017, 7), (2019, 9), (2021, 11), (2023, 1)),
    'Breitbart': _first_of_month((2013, 3), (2015, 5), (2017, 7), (2019, 9), (2021, 11)),
    'Thehill': _first_of_month((2013, 3), (2015, 5), (2017, 7), (2019, 9), (2021, 11)),
    'Motherjones': _first_of_month((2013, 3), (2014, 4), (2015, 5), (2016, 6), (2017, 7)),
}

In [None]:
# Check whether BERTopic.transform assigns the same topics to the same leading embeddings
# when the batch size changes. For each community and probe month, the first `n` embeddings
# are transformed for every `n` in `test_samples`, and the topic lists are stored per `n`.
inconsistency_dict = {collection_name: [] for collection_name in COLLECTION_NAMES}

# Batch sizes to compare; identical for every community, so defined once outside the loops.
test_samples = [100, 101, 110, 500, 900, 990, 999, 1000, 1001, 1010, 1100, 5000, 9000, 9900, 9990, 9999, 10000, 10001, 10010, 10100, 11000]

for collection_name in COLLECTION_NAMES:
    topic_model = BERTopic.load(join('model', collection_name.lower(), MODEL_NAMES[collection_name]), embedding_model="all-MiniLM-L6-v2")

    for test_point, start_date in enumerate(test_date_dict[collection_name]):
        print(collection_name, test_point)
        end_date = start_date + relativedelta(months=1)

        # Only the first yielded month is needed. Fail loudly when nothing is yielded instead
        # of silently reusing the embeddings left over from the previous probe.
        first_batch = next(iter(gen_sent_embeddings(collection_name, start_date, end_date)), None)
        if first_batch is None:
            raise RuntimeError(f'no embeddings yielded for {collection_name} starting {start_date}')
        embeddings_month, current_date = first_batch

        # Convert once per probe month rather than once per batch size.
        month_embeddings = embeddings_month["embeddings"].numpy()

        test_dict = {}
        for i in test_samples:
            print(i)
            batch = month_embeddings[:i]
            # Documents are placeholders because precomputed embeddings are supplied; their
            # count follows the rows actually available so both inputs always line up.
            topics, _ = topic_model.transform([None] * len(batch), embeddings=batch)
            test_dict[i] = topics

        inconsistency_dict[collection_name].append(test_dict)

# Save the variable using pickle
with open('inconsistency_dict.pkl', 'wb') as f:
    pickle.dump(inconsistency_dict, f)

## Constructing sampling_dict for global / title models

### A. global sampling dict

In [None]:
# File suffix of the per-community sample to draw from (named seed_dict, so presumably
# the seed that produced each file - confirm).
seed_dict = {'Atlantic': 4,
             'Breitbart': 3,
             'Gatewaypundit': 4,
             'Motherjones': 5,
             'Thehill': 2}

# Number of comments taken from each community for one global sample.
sampling_size = int(2e6/5)

for i in range(5):
    seed_global = i+1
    print(seed_global)

    sampled_global_embeddings_dict = {}
    for collection_name in seed_dict.keys():
        print(collection_name)
        source_path = join('search', collection_name.lower(), 'sampled_embeddings_dict', f'sampled_embeddings_dict_{seed_dict[collection_name]}.pkl')
        with open(source_path, 'rb') as f:
            sampled_embeddings_dict = pickle.load(f)

        # The generator is re-seeded for every community, so the draw depends only on
        # `seed_global` and on the size of that community's sample.
        np.random.seed(seed_global)
        sampled_indices = np.random.choice(len(sampled_embeddings_dict['_id']), sampling_size, replace=False)

        if not sampled_global_embeddings_dict:
            # First community: create the three parallel fields.
            sampled_global_embeddings_dict['_id'] = [sampled_embeddings_dict['_id'][t] for t in sampled_indices]
            sampled_global_embeddings_dict['embeddings'] = sampled_embeddings_dict['embeddings'][sampled_indices]
            sampled_global_embeddings_dict['raw_message'] = [sampled_embeddings_dict['raw_message'][t] for t in sampled_indices]
        else:
            # Later communities: append to the fields in the same order.
            sampled_global_embeddings_dict['_id'].extend(sampled_embeddings_dict['_id'][t] for t in sampled_indices)
            sampled_global_embeddings_dict['embeddings'] = torch.vstack((sampled_global_embeddings_dict['embeddings'], sampled_embeddings_dict['embeddings'][sampled_indices]))
            sampled_global_embeddings_dict['raw_message'].extend(sampled_embeddings_dict['raw_message'][t] for t in sampled_indices)

        # Release the per-community sample before the next one is loaded.
        del sampled_embeddings_dict
        gc.collect()

    target_path = join("search", "global", "sampled_embeddings_dict", f"sampled_embeddings_dict_{seed_global}.pkl")
    with open(target_path, 'wb') as f:
        pickle.dump(sampled_global_embeddings_dict, f)

### B. Title sampling dict

In [None]:
# For each community, gather the title embeddings of all sufficiently commented articles from the
# per-day parquet files and write them to a single feather file next to the model's article data.
for collection_name in COLLECTION_NAMES:
    print(collection_name)
    # NOTE(review): `articles_by_day` and `sampled_title_embeddings_dict` are reset here but never
    # read in this cell; the data itself is read from per-day parquet files, not from a pickle.
    articles_by_day = {}
    # Articles are kept only when they have strictly more than `threshold` comments.
    # NOTE(review): an earlier note in this notebook speaks of dropping "<10 comments"; `>` with
    # threshold = 10 also drops articles with exactly 10 - confirm the intended boundary.
    threshold = 10
    model_name = MODEL_NAMES[collection_name]
    file_names = os.listdir(join('article', collection_name.lower(), model_name, 'articles_by_day'))
    # File stems are dates ('YYYY-MM-DD'); process them chronologically.
    file_names = sorted(file_names, key=lambda x: datetime.strptime(x.split('.')[0], '%Y-%m-%d'))

    sampled_title_embeddings_dict = {}
    title_embeddings_list = []

    for file_name in file_names:
        day = file_name.split('.')[0]
        print(day)
        articles = pd.read_parquet(join('article', collection_name.lower(), model_name, 'articles_by_day', file_name))
        # `comment_id` holds one collection of ids per article, so its length is the comment count.
        articles = articles[articles['comment_id'].apply(len) > threshold]
        if len(articles)>0:
            title_embeddings_list.append(articles[['_id', 'clean_title', 'title_embeddings', 'createdAt']])

    # pd.concat raises on an empty list, i.e. when no day of this community passes the filter.
    title_embeddings_df = pd.concat(title_embeddings_list, ignore_index=True)
    title_embeddings_df.to_feather(join('article', collection_name.lower(), model_name, 'title_embeddings_df.feather'))
    print(len(title_embeddings_df))

# Atlantic : 32032
# Breitbart : 284801
# Gatewaypundit : 83807
# Motherjones : 30967
# Thehill : 307959

In [None]:
# Build one balanced title sample per seed: the same number of articles from every community.
collection_names = list(DATE_RANGES.keys())
seed_list = [1, 2, 3, 4, 5]
sample_size = 30967 # Motherjones (minimum number of articles)

for seed in seed_list:
    title_df_list = []

    for collection_name in collection_names:
        print(collection_name)
        model_name = MODEL_NAMES[collection_name]
        feather_path = join('article', collection_name.lower(), model_name, 'title_embeddings_df.feather')
        title_embeddings_df = pd.read_feather(feather_path)
        # Re-seed per community so each draw depends only on `seed` and the table size.
        np.random.seed(seed)
        sampled_indices = np.random.choice(len(title_embeddings_df['_id']), sample_size, replace=False)
        title_embeddings_df = title_embeddings_df.iloc[sampled_indices]
        title_df_list.append(title_embeddings_df)

    title_df = pd.concat(title_df_list, ignore_index=True)
    # Stored as a dict of column lists, under an absolute data path.
    sample_path = join('/data', 'collmind', 'search', 'title', 'sampled_embeddings_dict', f'sampled_embeddings_dict_{seed}.pkl')
    with open(sample_path, 'wb') as f:
        pickle.dump(title_df.to_dict(orient='list'), f)


In [None]:
def df_to_dict(df):
    """Map each value of df['_id'] to the matching value of df['title_embeddings'].

    When an id occurs more than once, the last occurrence wins.
    """
    return dict(zip(df['_id'], df['title_embeddings']))


### C. Title model transform

In [None]:
## merging title data
# Stack every community's title table into one frame, tagging each row with its community.

collection_names = list(DATE_RANGES.keys())
title_df_list = []

for collection_name in collection_names:
    print(collection_name)
    model_name = MODEL_NAMES[collection_name]
    feather_path = join('article', collection_name.lower(), model_name, 'title_embeddings_df.feather')
    # `assign` adds the 'collection' column while the table is being loaded.
    title_embeddings_df = pd.read_feather(feather_path).assign(collection=collection_name)
    title_df_list.append(title_embeddings_df)

title_df = pd.concat(title_df_list, ignore_index=True)

In [None]:
# Collapse each timestamp to a month + two-digit-year string ('%m%y', e.g. March 2015 -> '0315').
# NOTE(review): the column is overwritten in place, so this cell cannot be run twice: on a second
# run the values are already strings, which have no `strftime`.
title_df['createdAt'] = title_df['createdAt'].apply(lambda x: x.strftime('%m%y'))

In [None]:
# Load the fitted title topic model from model/title.
topic_model = BERTopic.load(join('model', 'title', MODEL_NAMES['title']), embedding_model="all-MiniLM-L6-v2")
# Accumulator filled by the chunked transform loop in the next cell.
chunk_list = []

In [None]:
# Assign a title topic to every row of `title_df`, transforming `chunk_size` titles at a time.
chunk_size = 10000

# Start from an empty accumulator inside this cell so that running it again cannot append a
# second copy of every chunk.
chunk_list = []

for i in tqdm(range(0, len(title_df), chunk_size)):
    # Work on an explicit copy: adding a column to a slice of `title_df` would otherwise be an
    # assignment on a possible view (SettingWithCopyWarning).
    chunk = title_df.iloc[i:i+chunk_size].copy()

    # Precomputed title embeddings are stacked into one 2-D array and passed alongside the
    # titles; the probabilities returned by transform are not needed here.
    topics, _ = topic_model.transform(chunk['clean_title'], embeddings=np.vstack(chunk['title_embeddings'].values))
    chunk['title_topics'] = topics
    chunk_list.append(chunk)

title_df = pd.concat(chunk_list, ignore_index=True)

In [None]:
# Persist the topic-annotated title table under transform/title.
title_df.to_feather(join('transform', 'title', 'title_df.feather'))

## Prepare real-world dataset settings for simulation

In [None]:
# Per community: monthly topic rankings and normalised topic frequencies under the global model.
title_topic_num_list_dict_dict = {}
ranking_month_list_dict = {}
frequency_month_list_dict = {}

aggregate_day = 'month'

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    collection_dir = join('/data', 'collmind', 'article_global', collection_name.lower())

    with open(join(collection_dir, 'result_dict.pkl'), 'rb') as f:
        result_dict = pickle.load(f)
    title_topic_num_list_dict_dict[collection_name] = result_dict['title_topic_num_list_dict']['month']

    # One parquet file per month, named by date; process them chronologically.
    file_names = os.listdir(join(collection_dir, 'month', 'df'))
    keys = sorted((file_name.split('.')[0] for file_name in file_names), key=lambda x: datetime.strptime(x, '%Y-%m-%d'))

    ranking_month_list = []
    frequency_list = []

    for key in keys:
        data = pd.read_parquet(join(collection_dir, aggregate_day, 'df', key+'.parquet'))['topic_freq'].values
        # Row 0 must carry the largest count; it is excluded from ranking and normalisation
        # (presumably the outlier topic - confirm).
        assert data[0] == np.max(data)
        topic_counts = data[1:]
        # Reorder both quantities by the global ranking so columns are comparable across communities.
        ranking_month_list.append(get_ranking(topic_counts)[ranking_dict['global']])
        frequency_list.append(topic_counts[ranking_dict['global']]/np.sum(topic_counts))

    ranking_month_list_dict[collection_name] = np.array(ranking_month_list)
    frequency_month_list_dict[collection_name] = np.array(frequency_list)

In [None]:
# Snapshot for one month: per community, the normalised topic frequencies and the
# topic-by-topic cosine similarity of the mean topic embeddings.
key = '2015-06-01'
aggregate_day = 'month'
frequency_list = []
sim_matrix_list = []

for collection_name in COLLECTION_NAMES:
    print(collection_name)

    # Read the month's table once; both the frequencies and the embeddings come from it.
    data = pd.read_parquet(join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, 'df', key+'.parquet'))

    # Row 0 is left out of both quantities (presumably the outlier topic - confirm).
    topic_freq = data['topic_freq'].values
    frequency_list.append(topic_freq[1:][ranking_dict['global']] / np.sum(topic_freq[1:]))

    # NOTE(review): frequencies are reordered by ranking_dict['global'] but the similarity
    # matrix keeps the stored row order - confirm that consumers expect this.
    embedding = np.vstack(data['topic_mean_embedding'].values)
    sim_matrix_full = cosine_similarity(embedding[1:], embedding[1:])
    sim_matrix_list.append(sim_matrix_full)

frequency_list = np.array(frequency_list)
sim_matrix_list = np.array(sim_matrix_list)

In [None]:
# Store both arrays in one compressed archive; numpy appends the '.npz' extension to the given name.
np.savez_compressed(join('comp_model', f'freq_sim_data_{key}'), frequency_list=frequency_list, sim_matrix_list=sim_matrix_list)

In [None]:
# Reload the archive written by the save cell; `key` must still hold the same date string.
empirical_data = np.load(join('comp_model', f'freq_sim_data_{key}.npz'))
frequency_list = empirical_data['frequency_list']
sim_matrix_list = empirical_data['sim_matrix_list']

## Prepare distant initial condition

In [None]:
import scipy.spatial as spatial
from scipy.stats import kendalltau
from comp_model.model import *

def kendalltau_dist(x, y):
    """Kendall-tau distance between the descending rankings of two score vectors.

    Both vectors are converted to rank positions (0 = largest value) before the correlation
    is taken. The result is (1 - tau) / 2: 0 for identical orderings, 1 for reversed ones.
    """
    rank_x = np.argsort(np.argsort(-x))
    rank_y = np.argsort(np.argsort(-y))
    tau, _ = kendalltau(rank_x, rank_y)
    return (1 - tau) / 2

In [None]:
# Container for the fixed initial conditions, keyed by perturbation strength.
fixed_dict = {}
# Number of topics in the simulated topic graph.
topic_num = 250
# Intervention settings passed to topic_graph; every field is zero here.
IV_dict = dict.fromkeys(('iv_type', 'iv_time', 'iv_s1', 'iv_s2', 'iv_rank', 'iv_tier'), 0)

In [None]:
## for fixed 1.0
# Draw perturbed frequency vectors until one is far from the unperturbed one in ranking
# (Kendall distance) and distribution (Jensen-Shannon) while still correlated with it.
# The loop has no iteration cap and no seed is set in this cell.

counter = 0
initial_freq_std = 1.0

while True:
    tg = topic_graph(topic_num, filter_strength=0.0, memory_strength=0.0, sampling_ratio_list=[0.5, 0.5], IV_dict=IV_dict, frequency=None, weight=None)
    o_freq = deepcopy(tg.frequency)
    tg.perturb_frequency_lognormal(initial_freq_std)
    p_freq = deepcopy(tg.frequency)

    kd = kendalltau_dist(o_freq, p_freq)
    js = spatial.distance.jensenshannon(o_freq, p_freq)
    corr = np.corrcoef(o_freq, p_freq)[0, 1]

    # Reject the draw unless all three criteria hold.
    if not ((kd > 0.3) and (js > 0.3) and (corr > 0.6)):
        counter += 1
        continue

    print(counter, kd, js, corr)
    break

print(get_ranking(p_freq))

In [None]:
# Start a fresh container and store the std=1.0 initial condition: the accepted perturbed
# frequencies plus a single weight draw wrapped in a length-1 array.
fixed_dict = {}
fixed_dict[1.0] = {}
fixed_dict[1.0]['freq'] = p_freq
fixed_dict[1.0]['weight'] = np.array([weight_dist(topic_num)])

In [None]:
## for fixed 0.2
# Same rejection sampling as for std=1.0, but the only acceptance criterion is the Kendall
# distance. Every attempt is printed, and the accepted one is printed a second time.

counter = 0
initial_freq_std = 0.2

while True:
    tg = topic_graph(topic_num, filter_strength=0.0, memory_strength=0.0, sampling_ratio_list=[0.5, 0.5], IV_dict=IV_dict, frequency=None, weight=None)
    o_freq = deepcopy(tg.frequency)
    tg.perturb_frequency_lognormal(initial_freq_std)
    p_freq = deepcopy(tg.frequency)

    kd = kendalltau_dist(o_freq, p_freq)
    js = spatial.distance.jensenshannon(o_freq, p_freq)
    corr = np.corrcoef(o_freq, p_freq)[0, 1]
    print(counter, kd, js, corr)

    if not (kd > 0.095):
        counter += 1
        continue

    print(counter, kd, js, corr)
    break

print(get_ranking(p_freq))

In [None]:
# Store the std=0.2 initial condition alongside the entries already in `fixed_dict`.
# The container is deliberately not re-created here: doing so would discard the std=1.0
# entry stored earlier, and the dictionary is pickled as a whole further down.
fixed_dict[0.2] = {}
fixed_dict[0.2]['freq'] = p_freq
fixed_dict[0.2]['weight'] = np.array([weight_dist(topic_num)])

In [None]:
# for fixed 0.0
# Unperturbed case: keep the graph's own frequencies and draw ten independent weight samples.

tg = topic_graph(topic_num, filter_strength=0.0, memory_strength=0.0, sampling_ratio_list=[0.5, 0.5], IV_dict=IV_dict, frequency=None, weight=None)
fixed_dict[0.0] = {
    'freq': tg.frequency,
    'weight': np.array([weight_dist(topic_num) for _ in range(10)]),
}

In [None]:
# Sanity check: for each of the ten weight draws, print the three largest-weight indices in row 25.
for i in range(10):
    x = fixed_dict[0.0]['weight'][i][25]
    print(np.argsort(-x)[:3])

In [None]:
# Persist the fixed initial conditions under comp_model/data.
with open(join('comp_model', 'data', 'fixed_dict.pkl'), 'wb') as f:
    pickle.dump(fixed_dict, f)

In [None]:
# Reload the stored initial conditions (replaces the in-memory `fixed_dict`).
with open(join('comp_model', 'data', 'fixed_dict.pkl'), 'rb') as f:
    fixed_dict = pickle.load(f)

In [None]:
# Overwrite the std=0.2 entry of the reloaded dictionary.
# NOTE(review): this depends on `p_freq` still holding the std=0.2 draw in kernel memory, and
# the result is not written back to disk in this cell.
fixed_dict[0.2] = {}
fixed_dict[0.2]['freq'] = p_freq
fixed_dict[0.2]['weight'] = np.array([weight_dist(topic_num)])

## Legacy

In [None]:


# comment topic embeddings similarity (chunk)
#
# For every community and month, and for every pair of topics that co-occur in an article's
# top-3 topics, two kinds of mean cosine similarity between comment embeddings are collected:
#   * "other": between comments of the two topics taken from articles whose top-3 topics do
#     not contain the respective topic;
#   * "self":  between the per-article embeddings gathered for the pair, kept separately for
#     each of the three positions a pair can take within the top-3 tuple.
# All similarity sums are accumulated chunk by chunk so no full pairwise matrix is held at once.


def _mean_pairwise_cosine(vectors_a, vectors_b, chunk_size):
    """Mean cosine similarity over all (a, b) pairs, accumulated in chunk_size x chunk_size tiles."""
    total = 0
    for start_a in range(0, len(vectors_a), chunk_size):
        for start_b in range(0, len(vectors_b), chunk_size):
            # Each tile must use only its own slices; the tile sums then add up to the sum
            # over the full pairwise matrix.
            total += np.sum(cosine_similarity(vectors_a[start_a:start_a+chunk_size], vectors_b[start_b:start_b+chunk_size]))
    return total / (len(vectors_a) * len(vectors_b))


aggregate_day_list = ['month']
num_topics = NUM_TOPICS['global']
N = 5000  # chunk size

# Positions inside the top-3 tuple that make up pair 0, 1 and 2 (same order as combinations()).
c_order = [[0, 1], [0, 2], [1, 2]]

for collection_name in COLLECTION_NAMES:
    print(collection_name)
    final_dict = {}

    for aggregate_day in aggregate_day_list:
        print(aggregate_day)

        file_names = os.listdir(join('/data', 'collmind', 'article_global', collection_name.lower(), aggregate_day, 'df'))
        keys = sorted([file_name.split('.')[0] for file_name in file_names], key=lambda x: datetime.strptime(x, '%Y-%m-%d'))

        # NOTE(review): the generator is advanced once per key, so it is assumed to yield the
        # months in the same chronological order as `keys` - confirm.
        gen = tmp_df_generator(collection_name, aggregate_day, include_embeddings=True, is_global=True)

        similarity_other_list = []
        similarity_self_list_list = []

        for month in sorted(keys):
            print(month)
            # pair -> three slots (one per pair position), each holding two embedding lists.
            comments_embeddings_dict_self = defaultdict(lambda: [[[], []] for _ in range(3)])
            # topic -> comment embeddings from articles that do not have it in their top 3.
            comments_embeddings_dict_others = defaultdict(list)

            articles = next(gen)

            articles["comment_embeddings"] = articles["comment_embeddings"].apply(np.array)
            # One row per article holding its first three topic numbers.
            topic_tuples = np.stack(articles["topic_num"].apply(lambda x: x[:3]))
            pair_list = [list(combinations(t, 2)) for t in topic_tuples]

            # `compute_embeddings` is defined elsewhere; its two outputs are indexed below by
            # position within the top-3 tuple (an embedding and a count per position).
            articles[["topic_tuple_embeddings", "topic_tuple_num"]] = articles.apply(lambda x: pd.Series(compute_embeddings(x)), axis=1)

            for row, pairs in enumerate(pair_list):
                topic_tuple_num_list = articles["topic_tuple_num"].iloc[row]
                topic_tuple_embeddings_list = articles["topic_tuple_embeddings"].iloc[row]

                for j, (a, b) in enumerate(pairs):
                    # Pairs involving topic -1 are ignored.
                    if -1 not in (a, b):
                        for k in range(2):
                            position = c_order[j][k]
                            if topic_tuple_num_list[position] > 0:
                                comments_embeddings_dict_self[(a, b)][j][k].append(topic_tuple_embeddings_list[position])

            articles = articles.to_numpy()  # columns : ['_id', 'topic_num', 'topic_prob', 'comment_topics', 'comment_embeddings', 'topic_tuple_embeddings', 'topic_tuple_num'],

            for topic_id in range(num_topics):
                # Select each article at most once, and only when none of its top-3 topics is
                # `topic_id`. The reduction over axis 1 is required because `topic_tuples` is
                # 2-D: an element-wise mask would return every article two or three times and
                # would not exclude the articles that do contain the topic.
                not_in_top3 = ~np.isin(topic_tuples, topic_id).any(axis=1)

                for idx in np.where(not_in_top3)[0]:
                    article = articles[idx]
                    # Column 3 holds the comment topics, column 4 the comment embeddings.
                    comment_indices = [j for j, topic in enumerate(article[3]) if topic == topic_id]
                    if len(comment_indices) > 0:
                        comments_embeddings_dict_others[topic_id].extend(article[4][comment_indices])

            # Plain-dict snapshots of both collections (not read again inside this cell).
            comments_embeddings_dict_self_dict = dict(comments_embeddings_dict_self)
            comments_embeddings_dict_others_dict = dict(comments_embeddings_dict_others)

            gc.collect()

            for topic_pair in comments_embeddings_dict_self.keys():
                if -1 in topic_pair:
                    continue
                # Both topics need "other" comments, otherwise the pair is skipped.
                if topic_pair[0] not in comments_embeddings_dict_others or topic_pair[1] not in comments_embeddings_dict_others:
                    continue

                similarity_other = _mean_pairwise_cosine(
                    comments_embeddings_dict_others[topic_pair[0]],
                    comments_embeddings_dict_others[topic_pair[1]],
                    N,
                )

                similarity_self_list = []
                for slot in range(3):
                    vector1, vector2 = comments_embeddings_dict_self[topic_pair][slot]

                    if len(vector1) > 0 and len(vector2) > 0:
                        similarity_self_list.append(_mean_pairwise_cosine(vector1, vector2, N))
                    else:
                        # No data for this pair position: recorded as 0.
                        similarity_self_list.append(0)

                #print(f'{topic_pair}, other : {similarity_other}, self : {similarity_self_list}')
                similarity_other_list.append(similarity_other)
                similarity_self_list_list.append(similarity_self_list)

        big_dict = {
            'similarity_other_list': similarity_other_list,
            'similarity_self_list_list': similarity_self_list_list
        }

        final_dict[aggregate_day] = big_dict

    with open(join('result', f'comment_sim_{collection_name}_global_dict.pkl'), 'wb') as f:
        pickle.dump(final_dict, f)