# ITM 454: Khmer News Summarizer - Final Project

This notebook implements:
- A Khmer extractive text summarizer (TextRank).
- A Machine Learning text classification demo using NLTK (Naive Bayes).

Pipeline:
1.  Data Collection (scraping with robust fallbacks)
2.  Preprocessing (Khmer-aware tokenization and normalization)
3.  Summarization (TextRank)
4.  Evaluation (ROUGE)
5.  ML Implementation (NLTK Naive Bayes text classifier)


In [9]:
# --- Core Libraries ---
import json
import re
import numpy as np
from collections import Counter

# --- NLP Libraries ---
import nltk
from khmernltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from rouge_score import rouge_scorer

# --- Visualization (Optional but Recommended) ---
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties

# Download the NLTK sentence-tokenizer data that nltk.sent_tokenize relies on.
# Older NLTK releases load the pickled 'punkt' model, while newer releases look
# for the 'punkt_tab' resource instead, so request both to work on either.
# quiet=True keeps the download progress out of the cell output.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [10]:
def load_data_from_json(file_path='train.json'):
    """
    Load articles from a JSON file and reshape them into title/full_text records.

    The file is expected to contain a JSON array of objects. Each usable record
    becomes ``{'title': ..., 'full_text': ...}``, where ``title`` is the record's
    ``'text'`` value (``'No Title'`` when the key is absent or null) and
    ``full_text`` is the first element of the record's ``'detail'`` list.

    Records are validated one by one, so a single bad record no longer discards
    the whole dataset:
      * records with a missing or empty ``'detail'`` are ignored silently;
      * records that are not objects, or whose ``'detail'`` is not a list that
        starts with a string, are skipped and counted in a printed warning.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        list[dict]: The transformed records, in file order. An empty list is
        returned (and a message printed, nothing raised) when the file is
        missing, cannot be read or parsed, or its top level is not an array.
    """
    # Only the file read and JSON parse can legitimately fail here; keeping the
    # try block this narrow stops record-level problems from being reported as
    # a failure of the whole load.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: '{file_path}' not found. Please ensure it is in the same directory.")
        return []
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        return []

    if not isinstance(json_data, list):
        print(f"❌ An error occurred: expected a JSON array of articles, got {type(json_data).__name__}")
        return []

    dataset = []
    skipped = 0
    for item in json_data:
        if not isinstance(item, dict):
            skipped += 1
            continue

        detail = item.get('detail')
        if not detail:
            # No body text for this record: nothing to summarize.
            continue

        # Indexing a bare string would keep only its first character, and
        # indexing a dict/number would raise, so insist on a list of strings.
        if not isinstance(detail, list) or not isinstance(detail[0], str):
            skipped += 1
            continue

        title = item.get('text')
        dataset.append({
            'title': 'No Title' if title is None else title,
            'full_text': detail[0]
        })

    if skipped:
        print(f"⚠️ Skipped {skipped} malformed record(s) in {file_path}")
    print(f"✅ Successfully loaded and processed {len(dataset)} articles from {file_path}")
    return dataset

# --- Load the Articles ---
# Parse train.json into a list of {'title', 'full_text'} records.
article_dataset = load_data_from_json(file_path='train.json')

✅ Successfully loaded and processed 1048 articles from train.json


In [11]:
# NOTE(review): this cell re-defines load_data_from_json, which the previous
# cell (In [10]) already defines. After Run All this later definition is the one
# in effect; consider deleting one of the two cells.
def load_data_from_json(file_path='train.json'):
    """
    Load articles from a JSON file and reshape them into title/full_text records.

    The file is expected to contain a JSON array of objects. Each usable record
    becomes ``{'title': ..., 'full_text': ...}``, where ``title`` is the record's
    ``'text'`` value (``'No Title'`` when the key is absent or null) and
    ``full_text`` is the first element of the record's ``'detail'`` list.

    Records are validated one by one, so a single bad record no longer discards
    the whole dataset:
      * records with a missing or empty ``'detail'`` are ignored silently;
      * records that are not objects, or whose ``'detail'`` is not a list that
        starts with a string, are skipped and counted in a printed warning.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        list[dict]: The transformed records, in file order. An empty list is
        returned (and a message printed, nothing raised) when the file is
        missing, cannot be read or parsed, or its top level is not an array.
    """
    # Only the file read and JSON parse can legitimately fail here; keeping the
    # try block this narrow stops record-level problems from being reported as
    # a failure of the whole load.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: '{file_path}' not found. Please ensure it is in the same directory.")
        return []
    except Exception as e:
        print(f"❌ An error occurred: {e}")
        return []

    if not isinstance(json_data, list):
        print(f"❌ An error occurred: expected a JSON array of articles, got {type(json_data).__name__}")
        return []

    dataset = []
    skipped = 0
    for item in json_data:
        if not isinstance(item, dict):
            skipped += 1
            continue

        detail = item.get('detail')
        if not detail:
            # No body text for this record: nothing to summarize.
            continue

        # Indexing a bare string would keep only its first character, and
        # indexing a dict/number would raise, so insist on a list of strings.
        if not isinstance(detail, list) or not isinstance(detail[0], str):
            skipped += 1
            continue

        title = item.get('text')
        dataset.append({
            'title': 'No Title' if title is None else title,
            'full_text': detail[0]
        })

    if skipped:
        print(f"⚠️ Skipped {skipped} malformed record(s) in {file_path}")
    print(f"✅ Successfully loaded and processed {len(dataset)} articles from {file_path}")
    return dataset

# --- Load the Articles ---
# NOTE(review): same load as the previous cell (In [10]); running this cell
# re-reads train.json and rebinds article_dataset.
article_dataset = load_data_from_json(file_path='train.json')

✅ Successfully loaded and processed 1048 articles from train.json


In [12]:
# Khmer function words (plus the punctuation signs '។' and 'ៗ') that are removed
# during preprocessing. Kept as a list so existing uses of the name keep working.
# The demonstrative 'នោះ' replaces a Thai-script entry ('นั้น') that could never
# match a token of Khmer text.
KHMER_STOPWORDS = [
    'និង', 'នៃ', 'ក្នុង', 'ជា', 'នៅ', 'បាន', 'ថា', 'ដោយ', 'ដែរ', 'ទៅ', 'ឲ្យ',
    'ពី', 'មួយ', 'ៗ', '។', 'ដែល', 'មាន', 'លោក', 'អ្នក', 'ខ្ញុំ', 'គេ', 'យើង',
    'វា', 'គាត់', 'នោះ', 'នេះ', 'ទេ'
]

# Khmer sentence terminators: KHAN (។) and BARIYOOSAN (៕). One match is a run of
# other characters together with any terminators that directly follow it.
_KHMER_SENTENCE_PATTERN = re.compile(r'[^។៕]+[។៕]*')


def _split_khmer_sentences(text):
    """Split text into stripped, non-empty sentences, honoring Khmer terminators."""
    sentences = []
    # NLTK's default Punkt model only breaks on the Latin '.', '?' and '!', so
    # every chunk it returns is split again on the Khmer terminators.
    for chunk in nltk.sent_tokenize(text):
        for piece in _KHMER_SENTENCE_PATTERN.findall(chunk):
            piece = piece.strip()
            if piece:
                sentences.append(piece)
    return sentences


def preprocess_khmer_text(text):
    """
    Run the preprocessing pipeline on a Khmer text.

    Steps, per sentence:
      1. Sentence segmentation (NLTK Punkt, then Khmer '។' / '៕' terminators).
      2. Removal of Latin letters, ASCII digits and ASCII punctuation.
      3. Word tokenization with khmernltk.
      4. Removal of stopwords (KHMER_STOPWORDS) and single-character tokens.

    Sentences that yield no tokens after step 4 are dropped from *both* sentence
    lists, so the two lists stay parallel: ``original_sentences[i]`` is the
    source of ``processed_sentences[i]``.

    Args:
        text: The raw article text. ``None`` or an empty string is accepted.

    Returns:
        tuple[list[str], list[str], list[str]]:
            original_sentences  - the untouched sentences that produced tokens;
            processed_sentences - the same sentences as space-joined tokens;
            all_tokens          - every kept token, in document order.
        Three empty lists are returned for empty input.
    """
    if not text:
        return [], [], []

    # Set membership is O(1); the module-level list is left untouched.
    stopwords = set(KHMER_STOPWORDS)

    original_sentences = []
    processed_sentences = []
    all_tokens = []

    for sent in _split_khmer_sentences(text):
        # Strip Latin letters, ASCII digits and ASCII punctuation.
        sent_clean = re.sub(r'[a-zA-Z0-9\.,!?\(\)\[\]\{\}"\':;]', '', sent).strip()
        if not sent_clean:
            continue

        tokens = word_tokenize(sent_clean)

        filtered_tokens = [
            word for word in tokens
            if word not in stopwords and len(word) > 1
        ]

        if filtered_tokens:
            # Append to all three outputs together to keep indices aligned.
            original_sentences.append(sent)
            processed_sentences.append(" ".join(filtered_tokens))
            all_tokens.extend(filtered_tokens)

    return original_sentences, processed_sentences, all_tokens

In [13]:
def analyze_and_plot_word_frequency(all_tokens, num_words=15,
                                    font_path='/usr/share/fonts/truetype/khmeros/KhmerOS.ttf'):
    """
    Print and plot the most frequent tokens using NLTK's FreqDist.

    The chart is drawn directly with Matplotlib rather than through
    ``FreqDist.plot``: that method calls ``plt.show()`` itself by default, so
    labels and a title applied after it would end up on a new, empty figure.

    Args:
        all_tokens: Iterable of tokens to count. If empty, a message is printed
            and nothing else happens.
        num_words: How many of the most common tokens to list and plot.
        font_path: Path to a TrueType font with Khmer glyphs, used for all text
            in the chart. The default is a typical Linux location. If the file
            does not exist (or ``None`` is passed) the chart is still drawn with
            Matplotlib's default font, in which case Khmer labels may not
            render correctly.

    Returns:
        None. Output is printed and, when plotting succeeds, shown as a figure.
    """
    if not all_tokens:
        print("  -> No tokens to analyze.")
        return

    # Function-scope import so this cell does not depend on an edit to the
    # notebook's import cell.
    import os

    # Use NLTK's Frequency Distribution
    freq_dist = nltk.FreqDist(all_tokens)
    top_words = freq_dist.most_common(num_words)

    print(f"\n📊 Top {num_words} Most Common Words:")
    for word, freq in top_words:
        print(f"  - {word}: {freq}")

    # --- Visualization ---
    fig = None
    try:
        text_kwargs = {}
        if font_path and os.path.isfile(font_path):
            text_kwargs['fontproperties'] = FontProperties(fname=font_path)
        else:
            print(f"  -> Khmer font not found at '{font_path}'; Khmer labels may not render correctly.")

        words = [word for word, _ in top_words]
        counts = [freq for _, freq in top_words]
        positions = list(range(len(words)))

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(positions, counts)
        ax.grid(True, color="silver")
        ax.set_xticks(positions)
        ax.set_xticklabels(words, rotation=45, **text_kwargs)
        ax.set_xlabel("Words", **text_kwargs)
        ax.set_ylabel("Frequency", **text_kwargs)
        ax.set_title(f"Top {num_words} Word Frequencies", **text_kwargs)
        plt.show()
    except Exception as e:
        # Do not leave a half-built figure open in the kernel.
        if fig is not None:
            plt.close(fig)
        print(f"\n[Plotting Skipped] Could not generate plot. Matplotlib or font setup might be needed. Error: {e}")