# Combine Word Frequencies

This notebook combines word frequency data from multiple corpora located in `artifacts/word_freq`. Each file is a JSON dictionary of `{word: norm_freq}`.

The combination is done using a weighted average of the normalized frequencies for each word across all corpora. The weights can be adjusted in the second cell.

The final combined word frequency list is saved to `artifacts/combined_word_freq.json`.

In [1]:
import json
from pathlib import Path
import pandas as pd
import numpy as np

# Define paths
WORD_FREQ_DIR = Path("../artifacts/word_freq")
OUTPUT_PATH = Path("../artifacts/combined_word_freq.json")

# Ensure output directory exists
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

In [2]:
# Load all word frequency files
json_files = list(WORD_FREQ_DIR.glob("*.json"))

if not json_files:
    raise FileNotFoundError(f"No JSON files found in {WORD_FREQ_DIR}")

all_freqs = {}
corpus_names = []
for file_path in json_files:
    corpus_name = file_path.stem
    corpus_names.append(corpus_name)
    with open(file_path, "r", encoding="utf-8") as f:
        all_freqs[corpus_name] = json.load(f)

print(f"Loaded {len(corpus_names)} corpora: {corpus_names}")

# Create a DataFrame from the loaded data
df = pd.DataFrame(all_freqs).fillna(0)
df.head()

Loaded 4 corpora: ['prachathai', 'wisesight', 'wongnai', 'thwiki']


Unnamed: 0,prachathai,wisesight,wongnai,thwiki
นักวิจัย,5.1e-05,9e-06,0.0,3.08698e-05
หนุน,7.2e-05,1.9e-05,2e-06,1.300043e-05
แม้ว,1.2e-05,6e-06,5e-06,1.962485e-06
เปิด,0.000598,0.001097,0.001616,0.0005004585
จีเอ็มโอ,3.9e-05,0.0,0.0,4.55429e-07


In [3]:
# --- Define weights for each corpus ---
# By default, we use equal weights.
# You can modify the weights here. The weights will be normalized.
weights = {name: 1.0 for name in corpus_names}

# Example of custom weights:
# weights = {
#     'corpus1': 1.0,
#     'corpus2': 1.5,
#     'corpus3': 0.8
# }

# Normalize weights to sum to 1
total_weight = sum(weights.values())
normalized_weights = {name: w / total_weight for name, w in weights.items()}

print("Using normalized weights:")
print(json.dumps(normalized_weights, indent=2))

# Get weights in the same order as dataframe columns
weight_values = [normalized_weights[col] for col in df.columns]

Using normalized weights:
{
  "prachathai": 0.25,
  "wisesight": 0.25,
  "wongnai": 0.25,
  "thwiki": 0.25
}


In [4]:
# Calculate the weighted average for each word
combined_freq = df.apply(
    lambda row: np.average(row, weights=weight_values),
    axis=1
)

# Sort by most common first
combined_freq.sort_values(ascending=False, inplace=True)

# Convert the result to a dictionary
combined_freq_dict = combined_freq.to_dict()

# Display first 10 items of the combined frequencies
combined_freq.head(10)

ที่     0.018033
ไม่     0.014749
ใน      0.013315
มี      0.013078
และ     0.012822
ของ     0.010543
ได้     0.010384
เป็น    0.010020
มา      0.008944
ไป      0.008281
dtype: float64

In [5]:
print(f"Total unique word counts: {len(combined_freq_dict)}")

combined_freq.describe()

Total unique word counts: 109999


count    1.099990e+05
mean     9.090992e-06
std      1.523195e-04
min      6.210395e-09
25%      1.429558e-08
50%      6.003382e-08
75%      5.913346e-07
max      1.803333e-02
dtype: float64

In [6]:
# Save the combined frequencies to a JSON file
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(combined_freq_dict, f, ensure_ascii=False, indent=2)

print(f"Combined word frequencies saved to {OUTPUT_PATH}")

Combined word frequencies saved to ../artifacts/combined_word_freq.json
