In [6]:
import sys
from pathlib import Path

from tensorflow.core.config.flags import config

# Make modules that live two directory levels above the current working
# directory importable from this notebook by putting that directory at the
# front of sys.path. The membership check keeps re-runs of this cell from
# inserting duplicate entries.
root_dir = str(Path.cwd().parent.parent.absolute())
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from config import Config

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')
# NOTE(review): this rebinds `config`, replacing the `config` name imported
# from tensorflow.core.config.flags in the first cell — confirm whether that
# import is still needed.
# `Config` comes from the project's `config` module; presumably `to_dict()`
# returns its settings as a plain dict — verify against that module.
config = Config.to_dict()

In [None]:
def _read_lines(path):
    """Read the UTF-8 text file at ``path`` and return its lines as a list.

    Lines are returned exactly as ``file.readlines()`` yields them, so line
    endings are kept; callers strip them as needed.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


# The two files are combined column-wise into one DataFrame further down, so
# line i of the English file is paired with line i of the Vietnamese file.
english = _read_lines('../data/raw/dataset_en.txt')
vietnamese = _read_lines('../data/raw/dataset_vi.txt')

# Fail here with a clear message rather than later with pandas' generic
# "All arrays must be of the same length" error when the DataFrame is built.
if len(english) != len(vietnamese):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(english):,} English lines "
        f"vs {len(vietnamese):,} Vietnamese lines"
    )

In [None]:
# %% [markdown]
# # 📊 Data Exploration - English-Vietnamese Translation Dataset

# %%




# %% [markdown]
# ## 1. Load Data

# %%
# Assemble the sentence pairs into one DataFrame, one row per input line,
# with leading/trailing whitespace (including the line ending) stripped.
en_sentences = list(map(str.strip, english))
vi_sentences = list(map(str.strip, vietnamese))
df = pd.DataFrame({'english': en_sentences, 'vietnamese': vi_sentences})

print(f"Dataset shape: {df.shape}")
# Preview of the first 10 rows; Jupyter renders it only when this expression
# is the last statement of the executed cell.
df.head(10)

# %% [markdown]
# ## 2. Basic Statistics

# %%
# Word count of every sentence, using plain whitespace tokenisation.
for length_col, text_col in (('en_length', 'english'), ('vi_length', 'vietnamese')):
    df[length_col] = df[text_col].apply(lambda sentence: len(sentence.split()))

# Summary statistics of the word counts, one block per language.
for heading, length_col in (("English sentences:", 'en_length'),
                            ("\nVietnamese sentences:", 'vi_length')):
    print(heading)
    print(df[length_col].describe())

# %% [markdown]
# ## 3. Length Distribution

# %%
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per language: (length column, panel title, bar colour).
# A colour of None leaves matplotlib's default colour in place.
panels = [
    ('en_length', 'English Sentence Length Distribution', None),
    ('vi_length', 'Vietnamese Sentence Length Distribution', 'orange'),
]

for ax, (length_col, title, bar_color) in zip(axes, panels):
    lengths = df[length_col]
    mean_length = lengths.mean()

    ax.hist(lengths, bins=50, edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel('Number of Words')
    ax.set_ylabel('Frequency')
    # Dashed red line marks the mean sentence length; the legend shows its value.
    ax.axvline(mean_length, color='red', linestyle='--',
               label=f'Mean: {mean_length:.1f}')
    ax.legend()

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 4. Vocabulary Analysis

# %%
def count_vocab(texts):
    """Count lower-cased, whitespace-delimited tokens across ``texts``.

    Args:
        texts: Iterable of strings.

    Returns:
        A ``collections.Counter`` mapping each lower-cased token to the
        number of times it occurs over all of ``texts``.
    """
    tokens = (token for sentence in texts for token in sentence.lower().split())
    return Counter(tokens)

# Token frequency tables for each side of the corpus.
en_vocab = count_vocab(df['english'])
vi_vocab = count_vocab(df['vietnamese'])

vocab_by_language = (('English', en_vocab), ('Vietnamese', vi_vocab))

# Number of distinct tokens per language.
for language, vocab in vocab_by_language:
    print(f"{language} vocabulary size: {len(vocab):,}")

# The 20 most frequent tokens per language, as (token, count) pairs.
for language, vocab in vocab_by_language:
    print(f"\nTop 20 {language} words:")
    print(vocab.most_common(20))

# %% [markdown]
# ## 5. Rare Words Analysis

# %%
# Tokens that occur exactly once in the whole corpus, per language.
en_rare = [word for word, count in en_vocab.items() if count == 1]
vi_rare = [word for word, count in vi_vocab.items() if count == 1]

# Report how many such tokens there are and their share of the vocabulary.
for language, rare_words, vocab in (("English", en_rare, en_vocab),
                                    ("Vietnamese", vi_rare, vi_vocab)):
    rare_share = len(rare_words) / len(vocab) * 100
    print(f"{language} rare words (freq=1): {len(rare_words):,} ({rare_share:.1f}%)")

# %% [markdown]
# ## 6. Length Filtering Impact

# %%
# Share of sentence pairs that survive each candidate maximum length.
# A pair is kept only when both its English and Vietnamese side fit the limit.
max_lengths = [20, 30, 40, 50, 60]
total_pairs = len(df)

for max_len in max_lengths:
    within_limit = df['en_length'].le(max_len) & df['vi_length'].le(max_len)
    filtered = df.loc[within_limit]
    kept_pct = len(filtered) / total_pairs * 100
    print(f"Max length {max_len}: {len(filtered):,}/{total_pairs:,} pairs ({kept_pct:.1f}%)")

# %% [markdown]
# ## 7. Sample Pairs

# %%
# Print a few randomly chosen sentence pairs for a qualitative look at the data.
N_SAMPLE_PAIRS = 5
SAMPLE_SEED = None  # set to an int to get the same sample on every run

print("Random sample pairs:\n")
# Cap the sample size at the number of rows: DataFrame.sample raises
# ValueError when asked for more rows than the frame holds (it samples
# without replacement by default).
sample_pairs = df.sample(n=min(N_SAMPLE_PAIRS, len(df)), random_state=SAMPLE_SEED)
# Iterate over the sampled rows directly rather than looking each one up
# again by index label.
for en_sentence, vi_sentence in zip(sample_pairs['english'], sample_pairs['vietnamese']):
    print(f"EN: {en_sentence}")
    print(f"VI: {vi_sentence}")
    print()
