In [3]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import json
import re
from math import sqrt, log
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import matplotlib.pyplot as plt
import seaborn as sns
from khmernltk import sentence_tokenize, word_tokenize as khmer_word_tokenize
import warnings
warnings.filterwarnings('ignore')

class KhmerTextPreprocessor:
    """
    Comprehensive text preprocessing pipeline for Khmer text
    """
    
    def __init__(self, stopwords_file="stopwords.txt"):
        self.stopwords = self.load_stopwords(stopwords_file)
        self.stemmer = PorterStemmer()
        
    def load_stopwords(self, file_path):
        """Load Khmer stopwords from file"""
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                return set(file.read().split("\n"))
        except FileNotFoundError:
            print(f"Warning: {file_path} not found. Using empty stopwords list.")
            return set()
    
    def normalize_khmer_text(self, text):
        """Normalize Khmer text - handle Unicode variations"""
        # Remove extra whitespaces
        text = re.sub(r'\s+', ' ', text)
        # Remove multiple punctuation
        text = re.sub(r'[។]+', '។', text)
        text = re.sub(r'[៕]+', '៕', text)
        return text.strip()
    
    def remove_punctuation(self, text):
        """Remove Khmer and English punctuation"""
        khmer_punct = "។៕៖ៗ៘៙៚៛ៜ៝៞៟"
        english_punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        
        for punct in khmer_punct + english_punct:
            text = text.replace(punct, " ")
        return text
    
    def tokenize_sentences(self, text):
        """Tokenize text into sentences using khmernltk"""
        return sentence_tokenize(text)
    
    def tokenize_words(self, text):
        """Tokenize text into words using khmernltk"""
        return khmer_word_tokenize(text)
    
    def remove_stopwords(self, words):
        """Remove stopwords from word list"""
        return [word for word in words if word.lower() not in self.stopwords]
    
    def preprocess_text(self, text, remove_punct=True, remove_stops=True, normalize=True):
        """Complete preprocessing pipeline"""
        if normalize:
            text = self.normalize_khmer_text(text)
        
        if remove_punct:
            text = self.remove_punctuation(text)
        
        words = self.tokenize_words(text)
        words = [word.lower() for word in words if len(word.strip()) > 0]
        
        if remove_stops:
            words = self.remove_stopwords(words)
        
        return words
    
    def preprocess_document(self, text):
        """Preprocess entire document maintaining sentence structure"""
        sentences = self.tokenize_sentences(text)
        processed_sentences = []
        
        for sentence in sentences:
            processed_words = self.preprocess_text(sentence)
            if processed_words:  # Only add non-empty sentences
                processed_sentences.append(processed_words)
        
        return processed_sentences

class KhmerTextSummarizer:
    """
    Extractive text summarization using TextRank algorithm
    """
    
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
    
    def cosine_distance(self, u, v):
        """Calculate cosine distance between two vectors"""
        dot_product = np.dot(u, v)
        norm_u = sqrt(np.dot(u, u))
        norm_v = sqrt(np.dot(v, v))
        
        if norm_u == 0 or norm_v == 0:
            return 1.0
        
        return 1 - (dot_product / (norm_u * norm_v))
    
    def sentence_similarity(self, sent1, sent2):
        """Calculate similarity between two sentences"""
        all_words = list(set(sent1 + sent2))
        
        if not all_words:
            return 0.0
        
        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)
        
        for word in sent1:
            vector1[all_words.index(word)] += 1
        
        for word in sent2:
            vector2[all_words.index(word)] += 1
        
        return 1 - self.cosine_distance(vector1, vector2)
    
    def build_similarity_matrix(self, sentences):
        """Build sentence similarity matrix"""
        n_sentences = len(sentences)
        similarity_matrix = np.zeros((n_sentences, n_sentences))
        
        for i in range(n_sentences):
            for j in range(n_sentences):
                if i != j:
                    similarity_matrix[i][j] = self.sentence_similarity(
                        sentences[i], sentences[j]
                    )
        
        return similarity_matrix
    
    def summarize(self, text, num_sentences=3):
        """Generate extractive summary"""
        original_sentences = self.preprocessor.tokenize_sentences(text)
        processed_sentences = self.preprocessor.preprocess_document(text)
        
        if len(processed_sentences) <= num_sentences:
            return "។ ".join(original_sentences)
        
        # Build similarity matrix and apply PageRank
        similarity_matrix = self.build_similarity_matrix(processed_sentences)
        similarity_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(similarity_graph)
        
        # Get top sentences
        ranked_sentences = sorted(
            [(scores[i], i, original_sentences[i]) for i in range(len(original_sentences))],
            reverse=True
        )
        
        summary_sentences = [sent[2] for sent in ranked_sentences[:num_sentences]]
        return "។ ".join(summary_sentences).replace("។។", "។")

class KhmerTextClassifier:
    """
    Multi-class text classifier for Khmer documents
    """
    
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
        self.vectorizer = None
        self.classifier = None
        self.is_trained = False
    
    def prepare_features(self, texts, fit_vectorizer=False):
        """Convert texts to feature vectors"""
        processed_texts = []
        
        for text in texts:
            words = self.preprocessor.preprocess_text(text)
            processed_texts.append(" ".join(words))
        
        if fit_vectorizer or self.vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.8
            )
            features = self.vectorizer.fit_transform(processed_texts)
        else:
            features = self.vectorizer.transform(processed_texts)
        
        return features
    
    def train(self, texts, labels, model_type='nb'):
        """Train the classifier"""
        X = self.prepare_features(texts, fit_vectorizer=True)
        
        if model_type == 'nb':
            self.classifier = MultinomialNB()
        elif model_type == 'svm':
            self.classifier = SVC(kernel='linear', probability=True)
        elif model_type == 'lr':
            self.classifier = LogisticRegression(max_iter=1000)
        
        self.classifier.fit(X, labels)
        self.is_trained = True
        
        return self
    
    def predict(self, texts):
        """Predict classes for new texts"""
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        
        X = self.prepare_features(texts)
        return self.classifier.predict(X)
    
    def predict_proba(self, texts):
        """Predict class probabilities"""
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        
        X = self.prepare_features(texts)
        return self.classifier.predict_proba(X)
    
    def evaluate(self, test_texts, test_labels):
        """Evaluate classifier performance"""
        predictions = self.predict(test_texts)
        
        accuracy = accuracy_score(test_labels, predictions)
        report = classification_report(test_labels, predictions)
        conf_matrix = confusion_matrix(test_labels, predictions)
        
        return {
            'accuracy': accuracy,
            'classification_report': report,
            'confusion_matrix': conf_matrix
        }

class KhmerLanguageModel:
    """
    N-gram based language model for Khmer text
    """
    
    def __init__(self, n=3, preprocessor=None):
        self.n = n
        self.preprocessor = preprocessor or KhmerTextPreprocessor()
        self.ngram_counts = defaultdict(Counter)
        self.vocabulary = set()
        self.is_trained = False
    
    def train(self, texts):
        """Train the n-gram language model"""
        print(f"Training {self.n}-gram language model...")
        
        for text in texts:
            words = self.preprocessor.preprocess_text(text, remove_stops=False)
            self.vocabulary.update(words)
            
            # Add start and end tokens
            padded_words = ['<START>'] * (self.n - 1) + words + ['<END>']
            
            # Generate n-grams
            for i in range(len(padded_words) - self.n + 1):
                ngram = tuple(padded_words[i:i + self.n])
                context = ngram[:-1]
                word = ngram[-1]
                self.ngram_counts[context][word] += 1
        
        self.is_trained = True
        print(f"Model trained on {len(texts)} documents")
        print(f"Vocabulary size: {len(self.vocabulary)}")
        print(f"Number of {self.n}-gram contexts: {len(self.ngram_counts)}")
    
    def get_probability(self, context, word, smoothing=1e-6):
        """Calculate probability of word given context with smoothing"""
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        
        context_tuple = tuple(context)
        
        if context_tuple not in self.ngram_counts:
            return smoothing
        
        word_count = self.ngram_counts[context_tuple][word]
        total_count = sum(self.ngram_counts[context_tuple].values())
        
        return (word_count + smoothing) / (total_count + len(self.vocabulary) * smoothing)
    
    def generate_text(self, seed_words=None, max_length=50):
        """Generate text using the trained model"""
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        
        if seed_words is None:
            context = ['<START>'] * (self.n - 1)
        else:
            context = seed_words[-(self.n-1):]
        
        generated = list(context)
        
        for _ in range(max_length):
            context_tuple = tuple(context)
            
            if context_tuple not in self.ngram_counts:
                break
            
            # Sample next word based on probability distribution
            candidates = list(self.ngram_counts[context_tuple].keys())
            weights = [self.ngram_counts[context_tuple][word] for word in candidates]
            
            if '<END>' in candidates and np.random.random() < 0.1:
                break
            
            next_word = np.random.choice(candidates, p=np.array(weights)/sum(weights))
            
            if next_word == '<END>':
                break
            
            generated.append(next_word)
            context = context[1:] + [next_word]
        
        return ' '.join([word for word in generated if word not in ['<START>', '<END>']])
    
    def calculate_perplexity(self, test_texts):
        """Calculate perplexity on test texts"""
        if not self.is_trained:
            raise ValueError("Model must be trained first")
        
        total_log_prob = 0
        total_words = 0
        
        for text in test_texts:
            words = self.preprocessor.preprocess_text(text, remove_stops=False)
            padded_words = ['<START>'] * (self.n - 1) + words + ['<END>']
            
            for i in range(len(padded_words) - self.n + 1):
                ngram = padded_words[i:i + self.n]
                context = ngram[:-1]
                word = ngram[-1]
                
                prob = self.get_probability(context, word)
                total_log_prob += log(prob)
                total_words += 1
        
        avg_log_prob = total_log_prob / total_words
        perplexity = np.exp(-avg_log_prob)
        
        return perplexity

class KhmerNLP:
    """
    Main class integrating all NLP components
    """
    
    def __init__(self, stopwords_file="stopwords.txt"):
        self.preprocessor = KhmerTextPreprocessor(stopwords_file)
        self.summarizer = KhmerTextSummarizer(self.preprocessor)
        self.classifier = KhmerTextClassifier(self.preprocessor)
        self.language_model = KhmerLanguageModel(n=3, preprocessor=self.preprocessor)
    
    def analyze_text(self, text):
        """Comprehensive text analysis"""
        result = {
            'original_text': text,
            'sentences': self.preprocessor.tokenize_sentences(text),
            'words': self.preprocessor.tokenize_words(text),
            'processed_words': self.preprocessor.preprocess_text(text),
            'word_count': len(self.preprocessor.tokenize_words(text)),
            'sentence_count': len(self.preprocessor.tokenize_sentences(text)),
            'summary': self.summarizer.summarize(text, num_sentences=2)
        }
        
        return result
    
    def save_models(self, base_path="models"):
        """Save trained models"""
        import os
        os.makedirs(base_path, exist_ok=True)
        
        # Save classifier
        if self.classifier.is_trained:
            with open(f"{base_path}/classifier.pkl", "wb") as f:
                pickle.dump(self.classifier, f)
        
        # Save language model
        if self.language_model.is_trained:
            with open(f"{base_path}/language_model.pkl", "wb") as f:
                pickle.dump(self.language_model, f)
    
    def load_models(self, base_path="models"):
        """Load pre-trained models"""
        try:
            with open(f"{base_path}/classifier.pkl", "rb") as f:
                self.classifier = pickle.load(f)
        except FileNotFoundError:
            print("Classifier model not found")
        
        try:
            with open(f"{base_path}/language_model.pkl", "rb") as f:
                self.language_model = pickle.load(f)
        except FileNotFoundError:
            print("Language model not found")

def demo_system():
    """
    Demonstration of the complete system
    """
    print("=== Advanced Khmer Text Processing System Demo ===\n")
    
    # Initialize system
    nlp_system = KhmerNLP()
    
    # Sample Khmer text for demonstration
    sample_text = """
    ក្នុង ឱកាស ទទួល ឯកឧត្តម ផាក ជុង វូក (PARK Jung_Wook) ឯក អគ្គរដ្ឋទូត វិសាមញ្ញ និង ពេញ សមត្ថភាព នៃ សាធារណរដ្ឋ កូរ៉េ ប្រចាំ ព្រះរាជាណាចក្រ កម្ពុជា ចូល ជួប សម្តែង កិត្តិយស ។ សម្តេច ប្រធាន រដ្ឋសភា បាន សម្តែង នូវ សេចក្តី និង វាយតម្លៃ ត្រឹមត្រូវ ចំពោះ ទំនាក់ទំនង និង កិច្ចសហប្រតិបត្តិការ រវាង កម្ពុជា - កូរ៉េ ។
    """
    
    # 1. Text Analysis Demo
    print("1. TEXT ANALYSIS")
    print("-" * 50)
    analysis = nlp_system.analyze_text(sample_text)
    print(f"Original text length: {len(analysis['original_text'])} characters")
    print(f"Number of sentences: {analysis['sentence_count']}")
    print(f"Number of words: {analysis['word_count']}")
    print(f"Processed words: {len(analysis['processed_words'])}")
    print(f"Summary: {analysis['summary'][:100]}...")
    print()
    
    # 2. Text Preprocessing Demo
    print("2. TEXT PREPROCESSING PIPELINE")
    print("-" * 50)
    sentences = nlp_system.preprocessor.tokenize_sentences(sample_text)
    print(f"Original sentences: {len(sentences)}")
    for i, sent in enumerate(sentences[:2], 1):
        print(f"Sentence {i}: {sent[:50]}...")
    
    words = nlp_system.preprocessor.tokenize_words(sample_text)
    print(f"\nFirst 10 words: {words[:10]}")
    
    processed_words = nlp_system.preprocessor.preprocess_text(sample_text)
    print(f"First 10 processed words: {processed_words[:10]}")
    print()
    
    # 3. Demonstrate classifier training (with synthetic data)
    print("3. TEXT CLASSIFICATION DEMO")
    print("-" * 50)
    
    # Create sample training data
    sample_texts = [
        "នេះជាអត្ថបទអំពីនយោបាយ រដ្ឋាភិបាលបានធ្វើការកែទម្រង់",
        "ប្រធានក្រុមហ៊ុនបានប្រកាសពីផលិតផលថ្មី នេះជាការរីកចំរើនដ៏ល្អ",
        "កីឡាករបានឈ្នះការប្រកួត ក្រុមជាតិបានរកបានជ័យជំនះ",
        "រដ្ឋាភិបាលបានអនុម័តច្បាប់ថ្មី នយោបាយនេះនឹងប្តូរសង្គម"
    ]
    sample_labels = ["នយោបាយ", "អាជីវកម្ម", "កីឡា", "នយោបាយ"]
    
    # Train classifier
    nlp_system.classifier.train(sample_texts, sample_labels, model_type='nb')
    
    # Test prediction
    test_text = "ក្រុមបាល់ទាត់បានឈ្នះការប្រកួត"
    prediction = nlp_system.classifier.predict([test_text])
    print(f"Test text: {test_text}")
    print(f"Predicted category: {prediction[0]}")
    print()
    
    # 4. Language Model Demo
    print("4. LANGUAGE MODEL DEMO")
    print("-" * 50)
    
    # Train language model
    training_texts = [sample_text] + sample_texts
    nlp_system.language_model.train(training_texts)
    
    # Generate text
    generated = nlp_system.language_model.generate_text(
        seed_words=["សម្តេច"], max_length=10
    )
    print(f"Generated text: {generated}")
    print()
    
    print("=== Demo Complete ===")

# Example usage and testing
if __name__ == "__main__":
    # Run demonstration
    demo_system()
    
    # Additional functionality examples
    print("\n=== Additional Features ===")
    
    # Create system instance
    khmer_nlp = KhmerNLP()
    
    # Example of reading from file and summarizing
    try:
        with open("Khmer.txt", "r", encoding="utf-8") as file:
            content = file.read()
        
        # Analyze document
        analysis = khmer_nlp.analyze_text(content)
        print(f"\nDocument Analysis:")
        print(f"Total sentences: {analysis['sentence_count']}")
        print(f"Total words: {analysis['word_count']}")
        print(f"Summary (3 sentences):")
        summary = khmer_nlp.summarizer.summarize(content, num_sentences=3)
        print(summary[:200] + "...")
        
    except FileNotFoundError:
        print("Khmer.txt file not found - create sample file for testing")
    
    print("\nSystem initialization complete. Ready for use!")

=== Advanced Khmer Text Processing System Demo ===

1. TEXT ANALYSIS
--------------------------------------------------
Original text length: 313 characters
Number of sentences: 2
Number of words: 90
Processed words: 38
Summary: 
    ក្នុង ឱកាស ទទួល ឯកឧត្តម ផាក ជុង វូក (PARK Jung_Wook) ឯក អគ្គរដ្ឋទូត វិសាមញ្ញ និង ពេញ សមត្ថភាព ន...

2. TEXT PREPROCESSING PIPELINE
--------------------------------------------------
Original sentences: 2
Sentence 1: 
    ក្នុង ឱកាស ទទួល ឯកឧត្តម ផាក ជុង វូក (PARK Jun...
Sentence 2: សម្តេច ប្រធាន រដ្ឋសភា បាន សម្តែង នូវ សេចក្តី និង វ...

First 10 words: ['ក្នុង', ' ', 'ឱកាស', ' ', 'ទទួល', ' ', 'ឯកឧត្តម', ' ', 'ផាក', ' ']
First 10 processed words: ['ក្នុង', 'ឱកាស', 'ទទួល', 'ឯកឧត្តម', 'ផាក', 'ជុង', 'វូក', 'park', 'jung', 'wook']

3. TEXT CLASSIFICATION DEMO
--------------------------------------------------
Test text: ក្រុមបាល់ទាត់បានឈ្នះការប្រកួត
Predicted category: កីឡា

4. LANGUAGE MODEL DEMO
--------------------------------------------------
Training 3-gram