In [2]:
from data_collection.reddit_user_dataset import RedditUserDataset
from classification.feature_computing import Embedder
from utils.file_sort import path_sort
import os
import datetime
import time
import pickle as pkl
from os import listdir
from os.path import isfile, join
from argparse import ArgumentParser
import argparse
import gzip
import sys
from tqdm import tqdm


In [3]:
# Overall date range from which the analysis timeframes are generated.
start_date, end_date = datetime.date(2020, 1, 1), datetime.date(2021, 4, 30)

In [4]:
# Generate timeframes
# Each timeframe is a (window_start, window_end) tuple of dates that spans
# `delta_days`; consecutive windows start `offset_days` apart.
delta_days = 30
offset_days = 30
window_length = datetime.timedelta(days=delta_days)
window_stride = datetime.timedelta(days=offset_days)

timeframes = []
curr_date = start_date
# Keep emitting windows while the window's end is strictly before end_date.
while curr_date + window_length < end_date:
    print(curr_date)
    window = (curr_date, curr_date + window_length)
    timeframes.append(window)
    curr_date += window_stride

2020-01-01
2020-01-31
2020-03-01
2020-03-31
2020-04-30
2020-05-30
2020-06-29
2020-07-29
2020-08-28
2020-09-27
2020-10-27
2020-11-26
2020-12-26
2021-01-25
2021-02-24
2021-03-26


In [5]:
# Load the base Reddit user dataset from its gzip-compressed file.
# Later cells build their per-timeframe datasets from `base_dataset.data_frame`.
base_dataset_path = '../data/reddit_dataset/reddit_corpus_unbalanced_filtered.gzip'
base_dataset = RedditUserDataset.load_from_file(base_dataset_path, compression='gzip')


In [6]:
# usr2vec embedder configuration.
embed_type = 'usr2vec'
dim = 200  # passed to Embedder as `dim`; presumably the embedding vector size
embeddings_filepath = ['../data/embeddings/usr2vec/']  # list of embedding directories

# NOTE: a later cell rebinds `embedder` (and the settings above) for BERT,
# so re-run this cell before re-running the usr2vec similarity loop.
embedder = Embedder(embeddings_filepath, embed_type, dim=dim)


In [8]:

# Build the source graph dataset (`framed`) using the currently configured
# `embedder`. Only the first timeframe is processed (see the `break` below).
graph_type = 'linguistic'   # 'linguistic' or 'social'
source_threshold = 0.75     # threshold handed to generate_similarity_triplet_list
embed_mode = 'avg'
similarity_metric = 'cosine_similarity'

# Columns stripped from the dataset before the graph is built.
columns_to_drop = ['documents', 'embedding_file', 'annotation', 'bias_counter', 'factual_counter']

# Wrap the list itself (not the enumerate generator) so tqdm can read its
# length and show real progress instead of "0it [..., ?it/s]".
for time_index, tf in enumerate(tqdm(timeframes)):
    print("Generating source timeframe...")
    start = time.time()
    if graph_type == 'linguistic':
        # Time-framing is currently disabled: the dataset is built from the
        # whole base data frame, so `tf` is not used in this branch.
        #framed = base_dataset.timeframed_documents(tf, inplace=False)
        framed = RedditUserDataset(base_dataset.data_frame.drop(columns=columns_to_drop))
        framed.generate_similarity_triplet_list(
            embedder, source_threshold, embed_mode, time_index,
            similarity_metric=similarity_metric,
        )
    elif graph_type == 'social':
        # Format ('2020-09-01', '2020-09-30')
        # NOTE(review): `tf` holds datetime.date objects, not strings as the
        # format comment above suggests -- confirm what the cache expects.
        formatted_timeframe = (tf[0], tf[1])
        print(formatted_timeframe)
        social_framed = base_dataset.load_social_graph_from_cache(formatted_timeframe, inplace=False)
        # Build from the loaded social-graph dataset; previously the loaded
        # result was overwritten by a dataset built from `base_dataset`.
        # Assumes the loader returns an object exposing `.data_frame` -- TODO confirm.
        framed = RedditUserDataset(social_framed.data_frame.drop(columns=columns_to_drop))
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError("Invalid graph type")
    # Persisting is disabled; `source_graph_path` must be defined before re-enabling.
    #framed.store_instance_to_file(source_graph_path + 'source_graph_' + str(time_index) + '.pkl')
    end = time.time()
    print("Elapsed time:" + str(end - start))
    # Exploratory run: stop after the first timeframe.
    break

0it [00:00, ?it/s]

Generating source timeframe...
Exception while embedding user f4c297d70c985a246ce50bb9c2554d0e2a7743e1488dcf07bed4b4e990de342a
'f4c297d70c985a246ce50bb9c2554d0e2a7743e1488dcf07bed4b4e990de342a'
Exception while embedding user 638b517f673fe4e9cf6bb08405812c3f423fa8dceef0f273fa1e5260bb8652e4
'638b517f673fe4e9cf6bb08405812c3f423fa8dceef0f273fa1e5260bb8652e4'
Exception while embedding user b2e4450e64e3e19ef5a4dea085959c572f22fa4fa2f2c6b7d3f7c47aeb1c7ee4
'b2e4450e64e3e19ef5a4dea085959c572f22fa4fa2f2c6b7d3f7c47aeb1c7ee4'
Exception while embedding user 285d52d8050a872b6c95e6b2971586f312765ae76fe8047a2f3554ab75d76bed
'285d52d8050a872b6c95e6b2971586f312765ae76fe8047a2f3554ab75d76bed'


0it [06:19, ?it/s]


NameError: name 'source_graph_path' is not defined

In [19]:
# Sanity check: print every similarity triplet whose last element (taken to be
# the similarity score) is below the threshold that was passed to
# generate_similarity_triplet_list. Uses `source_threshold` rather than a
# repeated literal so the check follows the configured value.
for sim_triple in framed.similarity_triplets:
    sim = sim_triple[-1]
    if sim < source_threshold:
        print(sim_triple)
    

# BERT Linguistic Similarities

In [20]:
# BERT embedder configuration.
embed_type = 'bert'
dim = 768  # passed to Embedder as `dim`; presumably the embedding vector size
embeddings_filepath = ['../data/embeddings/bert/']  # list of embedding directories

# NOTE: this rebinds `embedder`, `dim`, `embed_type` and `embeddings_filepath`
# from the usr2vec cell; cells run after this one see the BERT embedder.
embedder = Embedder(embeddings_filepath, embed_type, dim=dim)


  user_embedding.append(torch.tensor(embedding))


In [21]:

# Build the source graph dataset (`framed_bert`) using the currently configured
# `embedder`. Only the first timeframe is processed (see the `break` below).
graph_type = 'linguistic'   # 'linguistic' or 'social'
source_threshold = 0.75     # threshold handed to generate_similarity_triplet_list
embed_mode = 'avg'
similarity_metric = 'cosine_similarity'

# Columns stripped from the dataset before the graph is built.
columns_to_drop = ['documents', 'embedding_file', 'annotation', 'bias_counter', 'factual_counter']

# Wrap the list itself (not the enumerate generator) so tqdm can read its
# length and show real progress instead of "0it [..., ?it/s]".
for time_index, tf in enumerate(tqdm(timeframes)):
    print("Generating source timeframe...")
    start = time.time()
    if graph_type == 'linguistic':
        # Time-framing is currently disabled: the dataset is built from the
        # whole base data frame, so `tf` is not used in this branch.
        #framed_bert = base_dataset.timeframed_documents(tf, inplace=False)
        framed_bert = RedditUserDataset(base_dataset.data_frame.drop(columns=columns_to_drop))
        framed_bert.generate_similarity_triplet_list(
            embedder, source_threshold, embed_mode, time_index,
            similarity_metric=similarity_metric,
        )
    elif graph_type == 'social':
        # Format ('2020-09-01', '2020-09-30')
        # NOTE(review): `tf` holds datetime.date objects, not strings as the
        # format comment above suggests -- confirm what the cache expects.
        formatted_timeframe = (tf[0], tf[1])
        print(formatted_timeframe)
        social_framed = base_dataset.load_social_graph_from_cache(formatted_timeframe, inplace=False)
        # Build from the loaded social-graph dataset; previously the loaded
        # result was overwritten by a dataset built from `base_dataset`.
        # Assumes the loader returns an object exposing `.data_frame` -- TODO confirm.
        framed_bert = RedditUserDataset(social_framed.data_frame.drop(columns=columns_to_drop))
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError("Invalid graph type")
    # Persisting is disabled; `source_graph_path` must be defined before re-enabling.
    #framed_bert.store_instance_to_file(source_graph_path + 'source_graph_' + str(time_index) + '.pkl')
    end = time.time()
    print("Elapsed time:" + str(end - start))
    # Exploratory run: stop after the first timeframe.
    break

0it [00:00, ?it/s]

Generating source timeframe...
Exception while embedding user b2e4450e64e3e19ef5a4dea085959c572f22fa4fa2f2c6b7d3f7c47aeb1c7ee4
'b2e4450e64e3e19ef5a4dea085959c572f22fa4fa2f2c6b7d3f7c47aeb1c7ee4'


0it [06:25, ?it/s]

Elapsed time:385.6063551902771



