# Combine Word Frequencies

This notebook combines word frequency data from multiple corpora located in `artifacts/word_freq`. Each file is a JSON dictionary of `{word: norm_freq}`.

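The combination is a weighted average of the normalized frequencies for each word across all corpora; a word that does not appear in a given corpus is treated as having frequency 0 in that corpus. The weights default to equal values and can be adjusted in the "Define weights for each corpus" cell below.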

The final combined word frequency list is saved to `artifacts/combined_word_freq.json`.

In [None]:
import json
from pathlib import Path
import pandas as pd
import numpy as np

# --- Locations (relative to the notebook's working directory) ---
# Input: directory holding one JSON file per corpus.
WORD_FREQ_DIR = Path("../artifacts/word_freq")
# Output: the merged frequency table written by the last cell.
OUTPUT_PATH = Path("../artifacts/combined_word_freq.json")

# Create the destination folder (and any missing parents) up front;
# a pre-existing folder is not an error.
output_dir = OUTPUT_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load all word frequency files.
# Directory listings come back in filesystem order, which is not guaranteed to
# be the same on every machine; sorting keeps the corpus (column) order of the
# DataFrame reproducible from run to run.
json_files = sorted(WORD_FREQ_DIR.glob("*.json"))

if not json_files:
    raise FileNotFoundError(f"No JSON files found in {WORD_FREQ_DIR}")

# Map corpus name (file name without extension) -> {word: norm_freq}.
all_freqs = {}
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        corpus_freqs = json.load(f)
    # Fail early with a clear message if a file is not a JSON object: the
    # DataFrame below is keyed by word, so each corpus must be a mapping.
    if not isinstance(corpus_freqs, dict):
        raise ValueError(
            f"{file_path} must contain a JSON object of {{word: norm_freq}}, "
            f"got {type(corpus_freqs).__name__}"
        )
    all_freqs[file_path.stem] = corpus_freqs

# Corpus names in load order (dicts preserve insertion order).
corpus_names = list(all_freqs)

print(f"Loaded {len(corpus_names)} corpora: {corpus_names}")

# One row per word, one column per corpus; a word absent from a corpus
# becomes NaN on construction and is replaced by 0 here.
df = pd.DataFrame(all_freqs).fillna(0)
df.head()

In [None]:
# --- Define weights for each corpus ---
# By default, we use equal weights.
# You can modify the weights here. The weights will be normalized.
weights = {name: 1.0 for name in corpus_names}

# Example of custom weights:
# weights = {
#     'corpus1': 1.0,
#     'corpus2': 1.5,
#     'corpus3': 0.8
# }

# Every loaded corpus (DataFrame column) needs a weight; report all missing
# names at once instead of failing on the first one further down.
missing_weights = [col for col in df.columns if col not in weights]
if missing_weights:
    raise KeyError(f"No weight defined for corpora: {missing_weights}")

# Weights for corpora that were not loaded must not take part in the
# normalization, otherwise the printed values would not sum to 1 over the
# corpora that are actually combined.
unused_weights = [name for name in weights if name not in df.columns]
if unused_weights:
    print(f"Ignoring weights for corpora that were not loaded: {unused_weights}")

active_weights = {col: float(weights[col]) for col in df.columns}

if any(w < 0 for w in active_weights.values()):
    raise ValueError(f"Weights must be non-negative, got: {active_weights}")

# Normalize weights to sum to 1
total_weight = sum(active_weights.values())
# `not (x > 0)` also rejects NaN, which `x <= 0` would let through.
if not total_weight > 0:
    raise ValueError("At least one corpus must have a positive weight")
normalized_weights = {name: w / total_weight for name, w in active_weights.items()}

print("Using normalized weights:")
print(json.dumps(normalized_weights, indent=2))

# Get weights in the same order as dataframe columns
weight_values = [normalized_weights[col] for col in df.columns]

In [None]:
# Calculate the weighted average for each word.
# np.average is applied once to the whole (words x corpora) matrix along the
# corpus axis instead of once per row through DataFrame.apply, which would
# invoke a Python-level function for every word.
combined_freq = pd.Series(
    np.average(df.to_numpy(dtype=float), axis=1, weights=weight_values),
    index=df.index,
)

# Convert the result to a {word: combined_freq} dictionary
combined_freq_dict = combined_freq.to_dict()

# Display first 10 items of the combined frequencies
combined_freq.head(10).to_dict()

In [None]:
# Write the combined {word: frequency} mapping as pretty-printed UTF-8 JSON.
# ensure_ascii=False keeps non-ASCII words readable instead of \u-escaping them.
with OUTPUT_PATH.open("w", encoding="utf-8") as out_file:
    json.dump(
        combined_freq_dict,
        out_file,
        ensure_ascii=False,
        indent=2,
    )

print(f"Combined word frequencies saved to {OUTPUT_PATH}")