In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt

# =========================
# 1. Load the dataset
# =========================
# Assume the dataset is a comma-separated file with columns 'en' (English) and 'ar' (Arabic).
# For a tab-separated (TSV) file, pass sep="\t" to pd.read_csv.
# Replace 'arzen_multigenre.csv' with your actual path.

file_path = "arzen_multigenre.csv"
try:
    # First attempt: strict decoding with pandas' default encoding.
    df = pd.read_csv(file_path)
except UnicodeDecodeError:
    # Fallback for files containing invalid byte sequences. `read_csv` has no
    # `errors` keyword; the decoder's error policy is set via `encoding_errors`
    # (pandas >= 1.3). "replace" substitutes U+FFFD for undecodable bytes
    # instead of raising, so the load succeeds on a second pass.
    df = pd.read_csv(file_path, encoding="utf-8", encoding_errors="replace")

# Preview the first rows as the cell's rich output.
df.head()

In [None]:
# =========================
# 2. Basic dataset info
# =========================
# Report the frame's dimensions and the number of missing cells per column.
print("Dataset shape:", df.shape)
missing_per_column = df.isna().sum()
print("Null values:\n", missing_per_column)

# Discard, in place, every row that has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# =========================
# 3. Text Cleaning Functions
# =========================
def clean_english(text):
    """Return a lower-cased, punctuation-free version of ``text``.

    The input is coerced with ``str()``, lower-cased, stripped of every
    character that is not an ASCII letter, an ASCII digit or whitespace, and
    finally has each whitespace run collapsed to a single space with leading
    and trailing whitespace removed.
    """
    lowered = str(text).lower()
    # Removed characters are deleted outright, not replaced by a space.
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

def clean_arabic(text):
    """Normalize Arabic text down to bare, whitespace-separated Arabic tokens.

    Steps, in order:
      1. Coerce the input with ``str()``.
      2. Strip diacritics (U+0617-U+061A, U+064B-U+0652).
      3. Strip tatweel (U+0640) and Arabic punctuation. These code points lie
         inside the U+0600-U+06FF block, so the block-based filter in step 4
         would otherwise keep them glued to the neighbouring word.
      4. Strip every character that is neither in U+0600-U+06FF nor whitespace.
      5. Fold alef variants to bare alef, alef maqsura to yaa, and the hamza
         carriers (waw/yaa with hamza above) to plain waw/yaa.
      6. Collapse whitespace runs to one space and trim both ends.

    Returns the cleaned string (possibly empty).
    """
    text = str(text)
    # Remove diacritics
    text = re.sub(r"[\u0617-\u061A\u064B-\u0652]", "", text)
    # Remove tatweel (kashida); it only stretches a word visually
    text = re.sub(r"\u0640", "", text)
    # Remove Arabic punctuation: comma, date separator, semicolon, triple-dot
    # mark, question mark, percent/decimal/thousands signs, star, full stop
    text = re.sub(r"[\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4]", "", text)
    # Remove everything outside the Arabic block (Latin letters, ASCII digits,
    # ASCII punctuation, emoji, ...)
    text = re.sub(r"[^\u0600-\u06FF\s]", "", text)
    # Normalize alef variants (madda, hamza above, hamza below) to bare alef
    text = re.sub(r"[\u0622\u0623\u0625\u0627]", "\u0627", text)
    # Normalize alef maqsura to yaa
    text = re.sub(r"\u0649", "\u064A", text)
    # Normalize hamza carriers: waw-with-hamza -> waw, yaa-with-hamza -> yaa
    text = re.sub(r"\u0624", "\u0648", text)
    text = re.sub(r"\u0626", "\u064A", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Derive one cleaned text column per language from the raw columns
df["en_clean"] = df["en"].map(clean_english)
df["ar_clean"] = df["ar"].map(clean_arabic)

In [None]:
# =========================
# 4. Sentence Length Analysis
# =========================
# Sentence length = number of whitespace-separated tokens in the cleaned text.
df["en_len"] = df["en_clean"].apply(lambda x: len(x.split()))
df["ar_len"] = df["ar_clean"].apply(lambda x: len(x.split()))

print("English avg length:", df["en_len"].mean())
print("Arabic avg length:", df["ar_len"].mean())

# One labelled histogram per language; each figure is created explicitly so
# the two plots cannot leak state into one another.
for length_col, language in (("en_len", "English"), ("ar_len", "Arabic")):
    fig, ax = plt.subplots()
    ax.hist(df[length_col], bins=50)
    ax.set_title(f"{language} sentence length distribution")
    ax.set_xlabel("Sentence length (whitespace-separated tokens)")
    ax.set_ylabel("Number of sentences")
    plt.show()

In [None]:
# =========================
# 5. Filter by sentence length
# =========================
# Lower bound: cleaning can leave a side empty (0 tokens), e.g. when a sentence
# held only characters that the cleaner removes. Such pairs would be written as
# empty CSV fields and read back as NaN, so they are dropped here.
MIN_LEN = 1
# Upper bound: discard overly long sentences (both bounds are inclusive).
MAX_LEN = 50

within_bounds = (
    df["en_len"].between(MIN_LEN, MAX_LEN)
    & df["ar_len"].between(MIN_LEN, MAX_LEN)
)
# .copy() detaches the result from the unfiltered frame so that later column
# assignments do not trigger SettingWithCopyWarning.
df = df[within_bounds].copy()

print("Dataset after filtering:", df.shape)

In [None]:
# =========================
# 6. Save preprocessed dataset
# =========================
output_path = "arzen_multigenre_preprocessed.csv"

# Only the two cleaned text columns are exported; the row index is omitted.
export_columns = ["en_clean", "ar_clean"]
cleaned_pairs = df[export_columns]
cleaned_pairs.to_csv(output_path, index=False, encoding="utf-8")

print(f"Preprocessed dataset saved to {output_path}")