# ITM 454: Khmer News Summarizer - Final Project

This notebook implements:
- A Khmer extractive text summarizer (TextRank).
- A Machine Learning text classification demo using NLTK (Naive Bayes).

Pipeline:
1.  Data Collection (scraping with robust fallbacks)
2.  Preprocessing (Khmer-aware tokenization and normalization)
3.  Summarization (TextRank)
4.  Evaluation (ROUGE)
5.  ML Implementation (NLTK Naive Bayes text classifier)


In [92]:
# Step 1: Setup and Imports
# Install necessary libraries if you haven't already using requirements.txt
# !pip install -r requirements.txt

import requests
from bs4 import BeautifulSoup
import re
import nltk
from khmernltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer

# NLTK's sentence tokenizer is generally language-agnostic and works well for sentence boundaries.
nltk.download('punkt', quiet=True)

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [93]:
def scrape_vod_article(url):
    """
    Scrapes the title and article text from a VOD Khmer news URL.
    """
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if response.status_code != 200:
            print(f"Error: Failed to fetch URL with status code {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the article title
        title = soup.find('h1', class_='entry-title')
        if not title:
            print("Error: Could not find the article title.")
            return None, None
        title_text = title.get_text().strip()

        # Find the main article content
        content_div = soup.find('div', class_='entry-content')
        if not content_div:
            print("Error: Could not find the article content.")
            return None, None
            
        paragraphs = content_div.find_all('p')
        article_text = ' '.join([p.get_text().strip() for p in paragraphs])
        
        print(f" Scraping successful for: {title_text}")
        return title_text, article_text

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during scraping: {e}")
        return None, None

## Section 2: Data Collection (Web Scraper)

Robust scraping with headers, retries, AMP fallback and r.jina.ai proxy.

In [94]:
def _request_with_retries(url, headers=None, timeout=10, retries=3, backoff=1.3):
    headers = headers or {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'km-KH,km;q=0.9,en;q=0.8'
    }
    last_exc = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
            if resp.status_code == 200:
                return resp
            else:
                last_exc = Exception(f'HTTP {resp.status_code}')
        except Exception as e:
            last_exc = e
        time.sleep(backoff ** attempt)
    if last_exc:
        print(f'Request failed after {retries} attempts: {last_exc}')
    return None

def scrape_vod_article(url):
    """
    Scrapes the title and article text from a Khmer news URL (with fallbacks).
    Returns: (title, text) or (None, None)
    """
    # 1) Try direct fetch
    response = _request_with_retries(url, timeout=10, retries=3, backoff=1.3)
    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        title_el = soup.find('h1', class_='entry-title') or soup.find('h1')
        title_text = title_el.get_text(strip=True) if title_el else None
        content_div = soup.find('div', class_='entry-content')
        paragraphs = []
        if content_div:
            paragraphs = content_div.find_all('p')
        if not paragraphs:
            paragraphs = soup.find_all('p')
        article_text = ' '.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
        article_text = re.sub(r'\s+', ' ', article_text).strip()
        if article_text:
            print(f"Scraping successful for: {title_text or url}")
            return title_text, article_text

    # 2) Try AMP page variants
    amp_candidates = [url.rstrip('/') + '/amp/', url.rstrip('/') + '/amp']
    for amp_url in amp_candidates:
        response = _request_with_retries(amp_url, timeout=10, retries=2, backoff=1.2)
        if response:
            soup = BeautifulSoup(response.content, 'html.parser')
            title_el = soup.find('h1')
            title_text = title_el.get_text(strip=True) if title_el else None
            paragraphs = soup.find_all('p')
            article_text = ' '.join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
            article_text = re.sub(r'\s+', ' ', article_text).strip()
            if article_text:
                print(f"Scraped AMP page for: {title_text or url}")
                return title_text, article_text

    # 3) Try r.jina.ai content extraction proxy
    try:
        scheme_split = url.split('://', 1)
        if len(scheme_split) == 2:
            rjina_url = f"https://r.jina.ai/http://{scheme_split[1]}"
        else:
            rjina_url = f"https://r.jina.ai/http://{url}"
        r = _request_with_retries(rjina_url, timeout=10, retries=2, backoff=1.2)
        if r and r.status_code == 200:
            text = r.text.strip()
            if text:
                lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
                title_text = lines[0] if lines else None
                print(f"Fetched via r.jina.ai proxy: {title_text or url}")
                return title_text, text
    except Exception:
        pass

    print('Error: Could not fetch article content from the URL or fallbacks.')
    return None, None


## Section 3: Text Preprocessing Pipeline

Cleans and prepares Khmer text for summarization.

In [95]:
# A custom list of Khmer stopwords. A more comprehensive list would improve results.
KHMER_STOPWORDS = [
    'និង', 'នៃ', 'ក្នុង', 'ជា', 'នៅ', 'បាន', 'ថា', 'ដោយ', 'ដែរ', 'さんも',
    'ទៅ', 'ឲ្យ', 'ពី', 'មួយ', 'ៗ', '។', 'ៗ', 'ដែល', 'មាន', 'การ', 'นี้',
    'លោក', 'អ្នក', 'ខ្ញុំ', 'គេ', 'យើង', 'វា', 'គាត់', 'นั้น', 'នេះ', 'ទេ', 'ដែរ'
]

def preprocess_khmer_text(text):
    """
    Cleans and preprocesses Khmer text for summarization.
    """
    # 1. Sentence Segmentation
    sentences = nltk.sent_tokenize(text)
    
    # 2. Clean and tokenize each sentence
    processed_sentences = []
    for sent in sentences:
        # Lowercase and remove non-Khmer characters/punctuation (keeping spaces)
        sent = re.sub(r'[a-zA-Z0-9\.,!?\(\)\[\]\{\}"\':;]', '', sent)
        sent = sent.strip()
        
        # 3. Word Tokenization using khmernltk
        tokens = word_tokenize(sent)
        
        # 4. Stopword Removal
        filtered_tokens = [word for word in tokens if word not in KHMER_STOPWORDS and len(word) > 1]
        
        if filtered_tokens:
            processed_sentences.append(" ".join(filtered_tokens))
            
    return sentences, processed_sentences

## Section 4: The Summarization Model (TextRank)

Represent each sentence as a TF-IDF vector, compute similarities, apply PageRank, and select top sentences.

In [96]:
def summarize_textrank(original_sentences, processed_sentences, num_sentences=3):
    """
    Generates a summary using the TextRank algorithm.
    """
    if not processed_sentences:
        return "Could not generate summary. No content after preprocessing."

    # 1. Vectorize sentences using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    # 2. Calculate sentence similarity (cosine similarity)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # 3. Create graph from similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)

    # 4. Apply PageRank algorithm
    scores = nx.pagerank(nx_graph)

    # 5. Rank sentences and extract top ones
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    
    # Ensure we don't request more sentences than available
    num_sentences = min(num_sentences, len(ranked_sentences))

    # Get the top sentences
    top_sentence_tuples = ranked_sentences[:num_sentences]
    
    # Sort them back to their original order for better readability
    summary_sentences = sorted(top_sentence_tuples, key=lambda x: original_sentences.index(x[1]))

    summary = " ".join([s for _, s in summary_sentences])
    return summary

## Section 5: Putting It All Together (Live Demo)

Runs the summarization pipeline. If fetching fails, it tries AMP, a text extraction proxy, a local file, or manual paste.

In [97]:
# --- Main Execution ---

# URL of a real news article
# You can replace this with another article from vodkhmer.news
ARTICLE_URL = "https://thmeythmey.com/detail/151427"

# 1. Scrape the article
title, article_text = scrape_vod_article(ARTICLE_URL)

if article_text:
    # 2. Preprocess the text
    original_sentences, processed_sentences = preprocess_khmer_text(article_text)
    
    # 3. Generate the summary
    summary = summarize_textrank(original_sentences, processed_sentences, num_sentences=3)
    
    # --- Display Results ---
    print("\n" + "="*50)
    print(f"📰 ARTICLE TITLE: {title}")
    print("="*50 + "\n")
    
    print("📜 ORIGINAL TEXT (first 500 chars):")
    print(article_text[:500] + "...")
    print("\n" + "-"*50 + "\n")
    
    print("✨ GENERATED SUMMARY:")
    print(summary)
    print("\n" + "="*50)

Request failed after 3 attempts: HTTP 403
Request failed after 2 attempts: HTTP 403
Request failed after 2 attempts: HTTP 403
Fetched via r.jina.ai proxy: Title: ក្រុមហ៊ុនរដ្ឋចិន ចង់បណ្តាក់ទុនវិនិយោគលើការកែច្នៃស្វាយចន្ទីនៅកម្ពុជា

📰 ARTICLE TITLE: Title: ក្រុមហ៊ុនរដ្ឋចិន ចង់បណ្តាក់ទុនវិនិយោគលើការកែច្នៃស្វាយចន្ទីនៅកម្ពុជា

📜 ORIGINAL TEXT (first 500 chars):
Title: ក្រុមហ៊ុនរដ្ឋចិន ចង់បណ្តាក់ទុនវិនិយោគលើការកែច្នៃស្វាយចន្ទីនៅកម្ពុជា

URL Source: http://thmeythmey.com/detail/151427

Markdown Content:
ក្រុមហ៊ុនរដ្ឋចិន ចង់បណ្តាក់ទុនវិនិយោគលើការកែច្នៃស្វាយចន្ទីនៅកម្ពុជា 


*   [](https://www.facebook.com/ThmeyThmey/)
*   [](https://twitter.com/thmeythmey)
*   [](https://www.instagram.com/thmeythmeynews/)
*   [](https://www.youtube.com/c/ThmeyThmeyOnlineNews)
*   [](https://t.me/thmeythmeymedia)
*   [](https://www.tiktok.com/@thmeythmey.co...

--------------------------------------------------

✨ GENERATED SUMMARY:
Title: ក្រុមហ៊ុនរដ្ឋចិន ចង់បណ្តាក់ទុនវិនិយោគលើការកែច្នៃស្វាយចន្ទីនៅកម្ពុជា

URL

## Section 6: Evaluation (ROUGE Score)

Compare the generated summary against a baseline (first 3 sentences). For Khmer, stemming is disabled.

In [98]:
if article_text:
    # Create a reference summary (baseline: first 3 sentences)
    reference_summary = " ".join(original_sentences[:3])

    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Calculate scores
    scores = scorer.score(reference_summary, summary)

    print("\n📊 EVALUATION (vs. first 3 sentences):")
    print(f"  - ROUGE-1 (Overlap of unigrams): {scores['rouge1'].fmeasure:.4f}")
    print(f"  - ROUGE-2 (Overlap of bigrams): {scores['rouge2'].fmeasure:.4f}")
    print(f"  - ROUGE-L (Longest common subsequence): {scores['rougeL'].fmeasure:.4f}")
    print("\n" + "="*50)


📊 EVALUATION (vs. first 3 sentences):
  - ROUGE-1 (Overlap of unigrams): 0.6984
  - ROUGE-2 (Overlap of bigrams): 0.6203
  - ROUGE-L (Longest common subsequence): 0.6667

