# ITM 454: Khmer News Summarizer - Final Project

This notebook implements an extractive text summarizer for Khmer news articles. The process includes:
1.  **Data Collection**: Scraping a live news article from the web.
2.  **Preprocessing**: Cleaning and preparing the Khmer text for analysis.
3.  **Summarization**: Applying the TextRank algorithm to extract key sentences.
4.  **Evaluation**: Measuring the summary quality with ROUGE scores.


In [51]:
# Step 1: Setup and Imports
# If libraries are missing, install via: !pip install -r requirements.txt

import re
import time
import requests
from bs4 import BeautifulSoup

import nltk
nltk.download('punkt', quiet=True)

# Khmer tokenization (preferred). Fallbacks are provided below.
try:
    from khmernltk import word_tokenize as km_word_tokenize
    from khmernltk import sentence_tokenize as km_sentence_tokenize
except Exception:
    km_word_tokenize = None
    km_sentence_tokenize = None

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

try:
    from rouge_score import rouge_scorer
except Exception:
    rouge_scorer = None

print('✅ Libraries imported (with fallbacks if needed).')


✅ Libraries imported (with fallbacks if needed).


In [52]:
# --- Khmer tokenization utilities and preprocessing helpers ---

# Simple Khmer sentence splitter fallback (splits on Khmer punctuation)
SENT_SPLIT_REGEX = re.compile(r'[។!?៖]+["]?\s*')

# Remove non-Khmer letters/numbers/some punctuation while keeping spaces
# Line 34: Correct
NON_KHMER_CLEAN = re.compile(r"[a-zA-Z0-9\.,!?\(\)\[\]\{\}\"':;]")

# A minimal Khmer stopword list (extend as needed)
KHMER_STOPWORDS = set([
    'និង', 'នៃ', 'ក្នុង', 'ជា', 'នៅ', 'បាន', 'ថា', 'ដោយ', 'ដែរ', 'ទៅ', 'ឲ្យ', 'ពី',
    'មួយ', 'ៗ', 'ដែល', 'មាន', 'ទេ', 'នេះ', 'នោះ', 'គឺ', 'ក៏', 'លោក', 'អ្នក', 'យើង',
    'គេ', 'គាត់', 'ខ្ញុំ', 'វា'
])

def khmer_sentence_tokenize(text: str):
    if not text:
        return []
    if km_sentence_tokenize is not None:
        try:
            return [s.strip() for s in km_sentence_tokenize(text) if s.strip()]
        except Exception:
            pass
    # Fallback: regex-based split
    return [p.strip() for p in SENT_SPLIT_REGEX.split(text) if p and p.strip()]

def khmer_word_tokenize(sent: str):
    if not sent:
        return []
    if km_word_tokenize is not None:
        try:
            return km_word_tokenize(sent, keep_whitespace=False)
        except Exception:
            pass
    # Fallback: whitespace split; if no spaces, split into characters
    return sent.split() if ' ' in sent else list(sent)


## Section 2: Data Collection (Web Scraper)

This function scrapes the title and content of a news article from a given URL. It is designed for Khmer news sites and includes robust fallbacks (AMP, proxy).


In [53]:
def _request_with_retries(url, headers=None, timeout=10, retries=3, backoff=1.3):
    headers = headers or {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'km-KH,km;q=0.9,en;q=0.8'
    }
    last_exc = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
            if resp.status_code == 200:
                return resp
            else:
                last_exc = Exception(f'HTTP {resp.status_code}')
        except Exception as e:
            last_exc = e
        time.sleep(backoff ** attempt)
    if last_exc:
        print(f'Request failed after {retries} attempts: {last_exc}')
    return None

def scrape_vod_article(url):
    """
    Scrapes the title and article text from a Khmer news URL (with fallbacks).
    Returns: (title, text) or (None, None)
    """
    # 1) Try direct fetch
    response = _request_with_retries(url, timeout=10, retries=3, backoff=1.3)
    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        title_el = soup.find('h1', class_='entry-title') or soup.find('h1')
        title_text = title_el.get_text(strip=True) if title_el else None
        content_div = soup.find('div', class_='entry-content')
        paragraphs = []
        if content_div:
            paragraphs = content_div.find_all('p')
        if not paragraphs:
            paragraphs = soup.find_all('p')
        article_text = ' '.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
        article_text = re.sub(r'\s+', ' ', article_text).strip()
        if article_text:
            print(f"Scraping successful for: {title_text or url}")
            return title_text, article_text

    # 2) Try AMP page variants
    amp_candidates = [url.rstrip('/') + '/amp/', url.rstrip('/') + '/amp']
    for amp_url in amp_candidates:
        response = _request_with_retries(amp_url, timeout=10, retries=2, backoff=1.2)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_el = soup.find('h1')
            title_text = title_el.get_text(strip=True) if title_el else None
            paragraphs = soup.find_all('p')
            article_text = ' '.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            article_text = re.sub(r'\s+', ' ', article_text).strip()
            if article_text:
                print(f"Scraped AMP page for: {title_text or url}")
                return title_text, article_text

    # 3) Try r.jina.ai content extraction proxy
    try:
        scheme_split = url.split('://', 1)
        if len(scheme_split) == 2:
            rjina_url = f"https://r.jina.ai/http://{scheme_split[1]}"
        else:
            rjina_url = f"https://r.jina.ai/http://{url}"
        r = _request_with_retries(rjina_url, timeout=10, retries=2, backoff=1.2)
        if r and r.status_code == 200:
            text = r.text.strip()
            if text:
                # Heuristic: first non-empty line as title
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title_text = lines[0] if lines else None
                print(f"Fetched via r.jina.ai proxy: {title_text or url}")
                return title_text, text
    except Exception:
        pass

    print('Error: Could not fetch article content from the URL or fallbacks.')
    return None, None


## Section 3: Text Preprocessing Pipeline

Cleans and prepares Khmer text for summarization.

In [54]:
def preprocess_khmer_text(text):
    """
    Cleans and preprocesses Khmer text for summarization.
    Returns:
      - original_sentences: list of raw sentences
      - processed_sentences: list of tokenized/cleaned sentences used for modeling
      - idx_map: list mapping processed_sentences index -> original_sentences index
    """
    sentences = khmer_sentence_tokenize(text)

    processed_sentences = []
    idx_map = []
    for i, sent in enumerate(sentences):
        sent_clean = NON_KHMER_CLEAN.sub('', sent).strip()
        tokens = khmer_word_tokenize(sent_clean)
        filtered_tokens = [w for w in tokens if w not in KHMER_STOPWORDS and len(w) > 1]
        if filtered_tokens:
            processed_sentences.append(' '.join(filtered_tokens))
            idx_map.append(i)

    return sentences, processed_sentences, idx_map


## Section 4: The Summarization Model (TextRank)

Represent each sentence as a TF-IDF vector, compute similarities, apply PageRank, and select top sentences.

In [55]:
def summarize_textrank(original_sentences, processed_sentences, idx_map, num_sentences=3):
    if not processed_sentences:
        return 'Could not generate summary. No content after preprocessing.'

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    ranked = sorted(((scores[i], idx_map[i]) for i in range(len(idx_map))), reverse=True)
    num_sentences = max(1, min(num_sentences, len(ranked)))
    top = sorted(ranked[:num_sentences], key=lambda x: x[1])
    return ' '.join([original_sentences[j] for _, j in top])


## Section 5: Putting It All Together (Live Demo)

Runs the entire pipeline on a live Khmer news article. If fetching fails, it will try AMP, a text extraction proxy, a local file, or prompt for manual paste.

In [62]:
# --- Main Execution ---

# Example URL (you may replace with any Khmer news URL)
ARTICLE_URL = 'https://khmernews.news/article/archives/141231'
# Optional local text file fallback (set to a valid path to auto-load)
LOCAL_TEXT_FILE = ''  # e.g., r'c:\path\to\article.txt'

print(f'Attempting to scrape: {ARTICLE_URL}')
title, article_text = scrape_vod_article(ARTICLE_URL)

# Local file fallback
if not article_text and LOCAL_TEXT_FILE:
    try:
        with open(LOCAL_TEXT_FILE, 'r', encoding='utf-8') as f:
            article_text = f.read().strip()
        title = title or 'Local file'
        print(f'Loaded article text from: {LOCAL_TEXT_FILE}')
    except Exception as e:
        print(f'Failed to read LOCAL_TEXT_FILE: {e}')

# Manual paste fallback
if not article_text:
    print('Fetching failed. Paste Khmer article text below to proceed.')
    try:
        article_text = input('Paste article text (or leave blank to cancel): ').strip()
    except Exception:
        article_text = ''
    if article_text:
        title = title or 'Manual input'

if article_text:
    original_sentences, processed_sentences, idx_map = preprocess_khmer_text(article_text)
    summary = summarize_textrank(original_sentences, processed_sentences, idx_map, num_sentences=3)

    print('\n' + '='*50)
    print(f'📰 ARTICLE TITLE: {title}')
    print('='*50 + '\n')
    print('✨ GENERATED SUMMARY:')
    print(summary)
    print('\n' + '='*50)
else:
    print('No article text available. Skipping summarization.')

Attempting to scrape: https://khmernews.news/article/archives/141231
Scraping successful for: https://khmernews.news/article/archives/141231

📰 ARTICLE TITLE: None

✨ GENERATED SUMMARY:
ប្រធានាធិបតីបារាំង លោក Emmanuel Macron បាននិយាយនៅថ្ងៃព្រហស្បតិ៍សប្ដាហ៍នេះ នៅលើបណ្ដាញសង្គម X ថា ប្រទេសបារាំងនឹងដាក់ពង្រាយយន្តហោះចម្បាំង Rafale ចំនួន ៣ គ្រឿង ដើម្បីជួយប៉ូឡូញការពារដែនអាកាស បន្ទាប់ពីប្រទេសសមាជិកអង្គការណាតូមួយនេះបានរងការលុកលុយពីសំណាក់យន្តហោះគ្មានមនុស្សបើករបស់រុស្ស៊ីក្នុងសប្តាហ៍នេះ។ គិតត្រឹមថ្ងៃសុក្រនេះ ប្រទេសសម្ព័ន្ធមិត្តអង្គការ ណាតូ កំពុងស្ថិតនៅក្នុងស្ថានភាពប្រឈមុខនឹងរុស្ស៊ីកាន់តែខ្លាំង ខណៈដែលរុស្ស៊ី និងប្រទេស​ប៊េឡារុស កំពុងធ្វើសមយុទ្ធយោធានៅជាប់នឹងមាត់ទ្វារផ្ទះរបស់ណាតូ។ [email protected]070 878884 / 078 878884 អាស័យដ្ឋាន៖



## Section 6: Evaluation (ROUGE Score)

Compare the generated summary against a baseline (first 3 sentences). For Khmer, stemming is disabled.

In [59]:
if 'summary' in globals() and 'article_text' in globals() and article_text:
    reference_summary = ' '.join(original_sentences[:3])
    if rouge_scorer is not None:
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
        scores = scorer.score(reference_summary, summary)
        print('\n📊 EVALUATION (vs. first 3 sentences):')
        print(f"  - ROUGE-1 (Overlap of unigrams): {scores['rouge1'].fmeasure:.4f}")
        print(f"  - ROUGE-2 (Overlap of bigrams): {scores['rouge2'].fmeasure:.4f}")
        print(f"  - ROUGE-L (Longest common subsequence): {scores['rougeL'].fmeasure:.4f}")
        print('\n' + '='*50)
    else:
        print('rouge-score not installed/importable. Install it to run evaluation.')
else:
    print('Evaluation skipped: no summary available.')


📊 EVALUATION (vs. first 3 sentences):
  - ROUGE-1 (Overlap of unigrams): 0.9624
  - ROUGE-2 (Overlap of bigrams): 0.9612
  - ROUGE-L (Longest common subsequence): 0.9203

