In [None]:
import datetime
import pandas as pd
import json
import time
import re
import os
import zipfile
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# English stopword list from NLTK, held as a set; execute_merge uses it to
# test whether a lower-cased keyword is a stopword.
nltk_stopwords = {*stopwords.words('english')}

# Load one JSON file and hand back whatever object it decodes to
def read_json_file(file_name):
    """Parse the JSON document stored at ``file_name``.

    The file is opened in text mode as UTF-8. The return value is the decoded
    Python object exactly as ``json.load`` produces it (a list, a dict, ...).
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Flatten a sequence of per-file collections into one combined list
def merge_json_data(data_list):
    """Concatenate the elements of every iterable in ``data_list``.

    Elements keep their original order: all items of the first iterable,
    then all items of the second, and so on. A new list is always returned;
    an empty ``data_list`` yields ``[]``.
    """
    return [record for chunk in data_list for record in chunk]

def execute_merge(file_names, change_fieldName=False):
    """Load keyword-match chunk files and combine them into one cleaned list.

    Every file is read with ``read_json_file`` and the per-file results are
    concatenated with ``merge_json_data``. The combined entries are ordered
    by their ``'keyword'`` value, entries whose keyword is an NLTK English
    stopword (compared lower-cased) or has length <= 1 are dropped, and each
    surviving entry's ``'matching_posts'`` list is ordered by its ``'date'``
    field, parsed with the format ``%m-%d-%Y``.

    Args:
        file_names: Iterable of paths to the JSON chunk files.
        change_fieldName: When true, each entry's ``'matching posts'`` key is
            renamed to ``'matching_posts'`` before the date sort.

    Returns:
        A tuple ``(entries, total_keywords)``. ``total_keywords`` is the sum
        of ``len(...)`` over the files as loaded, so it is counted *before*
        the stopword/length filter is applied.
    """
    def post_date(post):
        # Dates are stored as month-day-year strings.
        return datetime.datetime.strptime(post["date"], "%m-%d-%Y")

    # Read every chunk, tallying how many entries each one contributed.
    chunks = []
    total_keywords = 0
    for path in tqdm(file_names, desc='Merging'):
        loaded = read_json_file(path)
        chunks.append(loaded)
        total_keywords += len(loaded)

    # One flat list, ordered by keyword.
    entries = merge_json_data(chunks)
    entries = sorted(entries, key=lambda entry: entry['keyword'])

    # Drop stopwords and keywords of at most one character.
    kept = []
    for entry in tqdm(entries, desc='Removing meaningless matches'):
        keyword = entry["keyword"]
        if keyword.lower() in nltk_stopwords or len(keyword) <= 1:
            continue
        kept.append(entry)

    # Optionally normalise the posts field name.
    if change_fieldName:
        for entry in tqdm(kept, desc='Changing fieldname'):
            entry["matching_posts"] = entry.pop("matching posts")

    # Order each entry's posts chronologically.
    for entry in tqdm(kept, desc='Sorting'):
        entry["matching_posts"] = sorted(entry["matching_posts"], key=post_date)

    return kept, total_keywords

## Reports

In [None]:
# Paths of the 690 numbered chunk files (suffixes 1..690) to merge
file_names = list(map(
    "/home/anon/input/chunks-keywords-in-datasets/5-Chunks_REP_keywords_in_HF_posts/REP_keywords_in_HF_posts_{0}.json".format,
    range(1, 691),
))

# change_fieldName=True: these chunks use the key 'matching posts', which
# execute_merge renames to 'matching_posts'.
merged_sorted_data, total_keywords = execute_merge(file_names, change_fieldName=True)

# Write the merged result to a single JSON file in the working directory
output_file = "REP_keywords_in_HF_posts.json"
print('Saving', output_file)
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(merged_sorted_data, file, indent=4)

# Summary; total_keywords is the entry count across all chunks as loaded
print(f"\nData from {len(file_names)} files merged and saved to {output_file}.")
print("Total keywords with match: {0}".format(total_keywords))

In [None]:
# Paths of the 697 numbered chunk files (suffixes 1..697) to merge
file_names = list(map(
    "/home/anon/input/chunks-keywords-in-datasets/6-Chunks_REP_keywords_in_REP_articles/REP_keywords_in_REP_articles_{0}.json".format,
    range(1, 698),
))

# No field rename requested for this set of chunks
merged_sorted_data, total_keywords = execute_merge(file_names)

# Write the merged result to a single JSON file in the working directory
output_file = "REP_keywords_in_REP_articles.json"
print('Saving', output_file)
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(merged_sorted_data, file, indent=4)

# Summary; total_keywords is the entry count across all chunks as loaded
print(f"\nData from {len(file_names)} files merged and saved to {output_file}.")
print("Total keywords with match: {0}".format(total_keywords))

## HF

In [None]:
# Paths of the 270 numbered chunk files (suffixes 1..270) to merge
file_names = list(map(
    "/home/anon/input/chunks-keywords-in-datasets/5-Chunks_HF_keywords_in_REP_articles/HF_keywords_in_REP_articles_{0}.json".format,
    range(1, 271),
))

# change_fieldName=True: these chunks use the key 'matching posts', which
# execute_merge renames to 'matching_posts'.
merged_sorted_data, total_keywords = execute_merge(file_names, change_fieldName=True)

# Write the merged result to a single JSON file in the working directory
output_file = "HF_keywords_in_REP_articles.json"
print('Saving', output_file)
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(merged_sorted_data, file, indent=4)

# Summary; total_keywords is the entry count across all chunks as loaded
print(f"\nData from {len(file_names)} files merged and saved to {output_file}.")
print("Total keywords with match: {0}".format(total_keywords))

In [None]:
# Paths of the 280 numbered chunk files (suffixes 1..280) to merge
file_names = [
    "/home/anon/input/chunks-keywords-in-datasets/6-Chunks_HF_keywords_in_HF_posts/HF_keywords_in_HF_posts_{0}.json"
    .format(i) for i in range(1, 281)]

# No field rename requested for this set of chunks
merged_sorted_data, total_keywords = execute_merge(file_names)

# Save the merged data to a new JSON file
output_file = "HF_keywords_in_HF_posts.json"
# Announce the write before it starts, matching the other merge cells in this
# notebook; the indented dump of the full merged list gives no other feedback.
print('Saving', output_file)
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(merged_sorted_data, file, indent=4)

# Summary; total_keywords is the entry count across all chunks as loaded,
# i.e. before execute_merge filters out stopwords and short keywords.
print(f"\nData from {len(file_names)} files merged and saved to {output_file}.")
print("Total keywords with match: {0}".format(total_keywords))