In [None]:


import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)



In [None]:
# Install the poppler-utils and tesseract-ocr system packages.
# NOTE(review): presumably required by pdf2image / pytesseract imported below — confirm.
!apt-get update && apt-get install -y poppler-utils tesseract-ocr

In [None]:
# Install the Python packages used for dataset loading, scraping, PDF parsing and text cleaning.
!pip install pandas requests beautifulsoup4 scrapy datasets pdfminer.six clean-text

In [None]:
# Install langdetect (imported below as `detect_langs`).
!pip install langdetect

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
from cleantext import clean
import gc
from langdetect import detect_langs

# Target languages and per-language collection limits.
LANGUAGES = [
    "Hindi",
    "Marathi",
    "Sindhi",
    "Gujarati",
    "Bengali",
    "Tamil",
    "Kannada",
]
# Samples to keep per language (reduced from 750000 to prevent memory issues).
TARGET_PER_LANG = 250_000
# Minimum number of whitespace-separated words a text needs to be kept.
MIN_WORDS = 50

# Keep only texts that are long enough
def filter_text_length(text):
    """Return True when *text* is a string with at least MIN_WORDS whitespace-separated words.

    Non-string values (e.g. NaN or None from a DataFrame column) yield False.
    """
    return isinstance(text, str) and len(text.split()) >= MIN_WORDS

# Preprocess image for OCR
def preprocess_image(image):
    """Binarize a scanned page image to improve OCR accuracy.

    Args:
        image: A PIL image in RGB channel order (as produced by pdf2image).

    Returns:
        A PIL image built from the Otsu-thresholded grayscale page.
    """
    # Arrays built from PIL images are RGB-ordered, so RGB2GRAY is the correct
    # conversion; BGR2GRAY would swap the red and blue luminance weights.
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is computed from the histogram; the 150
    # passed here is ignored by OpenCV.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a 1x1 kernel makes this dilation an identity operation;
    # kept so the output stays identical to the previous pipeline.
    kernel = np.ones((1, 1), np.uint8)
    processed_image = cv2.dilate(binary, kernel, iterations=1)
    return Image.fromarray(processed_image)

# Extract text from PDF using OCR with better memory management
def extract_text_from_pdf(pdf_path, batch_size=10, max_pages=20):
    """Extract text from a scanned PDF using OCR, rasterizing one batch of pages at a time.

    Args:
        pdf_path: Path of the PDF file.
        batch_size: Number of pages rasterized and OCR'd per batch.
        max_pages: Highest page number to process (defaults to the previous
            hard-coded limit of 20).

    Returns:
        The recognized text with pages separated by newlines, or "" when any
        step fails (the error is printed, not raised).
    """
    try:
        page_texts = []
        for first_page in range(1, max_pages + 1, batch_size):
            last_page = min(first_page + batch_size - 1, max_pages)
            # Rasterize only this batch so at most batch_size page images are
            # alive at once (converting every page up front defeats batching).
            batch_images = convert_from_path(
                pdf_path, dpi=200, first_page=first_page, last_page=last_page
            )
            for image in batch_images:
                processed_image = preprocess_image(image)
                page_texts.append(pytesseract.image_to_string(
                    processed_image,
                    # 'chi_sim' is the Tesseract code of the Chinese pack this
                    # notebook installs; plain 'chi' is not a language code.
                    # NOTE(review): no Gujarati ('guj') model is listed although
                    # Gujarati is a target language — confirm.
                    lang='hin+mar+ben+tam+kan+eng+urd+chi_sim',
                    config="--psm 6 --oem 1"
                ))
            # A short batch means the document has no further pages.
            reached_end = len(batch_images) < last_page - first_page + 1
            # Force garbage collection after each batch
            del batch_images
            processed_image = None
            gc.collect()
            if reached_end:
                break
        return "\n".join(page_texts).strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

# Detect language based on Unicode ranges
def detect_language_unicode(text):
    """Detect the dominant language of *text* from Unicode script ranges.

    Languages that share a script get identical counts, and ties are resolved
    by table order: Devanagari text is always reported as Hindi (never
    Marathi) and Arabic-script text as Sindhi (never Urdu). Shared scripts are
    also counted once per language, so the probabilities are shares of the
    summed per-language counts rather than of the number of characters.

    Args:
        text: The string to classify.

    Returns:
        A 3-tuple ``(top_lang, lang_probs, texts)``:
        ``(lang, probs, {lang: text})`` when the top language is in LANGUAGES,
        ``(None, probs, {})`` when it is not, and ``(None, {}, {})`` when no
        character falls in any known range.
    """
    script_ranges = {
        'Hindi': (('\u0900', '\u097F'),),     # Devanagari
        'Marathi': (('\u0900', '\u097F'),),   # Devanagari (overlap with Hindi)
        'Sindhi': (('\u0600', '\u06FF'),),    # Arabic script
        'Gujarati': (('\u0A80', '\u0AFF'),),
        'Bengali': (('\u0980', '\u09FF'),),
        'Tamil': (('\u0B80', '\u0BFF'),),
        'Kannada': (('\u0C80', '\u0CFF'),),
        'Urdu': (('\u0600', '\u06FF'),),      # Arabic script
        'Chinese': (('\u4E00', '\u9FFF'),),
        # Only A-Z and a-z: a single A..z span would also count [ \ ] ^ _ `.
        'English': (('\u0041', '\u005A'), ('\u0061', '\u007A')),
    }
    lang_counts = {
        lang: sum(
            1 for char in text
            if any(low <= char <= high for low, high in ranges)
        )
        for lang, ranges in script_ranges.items()
    }

    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None, {}, {}

    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    # max() keeps the first maximal key, i.e. the earliest table entry on ties.
    top_lang = max(lang_probs, key=lang_probs.get)

    if top_lang in LANGUAGES:
        return top_lang, lang_probs, {top_lang: text}
    return None, lang_probs, {}

# Function to save dataframe in chunks to prevent memory issues
def save_df_in_chunks(df, output_file, chunk_size=10000):
    """Save a large dataframe to CSV in chunks to avoid memory issues.

    Args:
        df: The DataFrame to write.
        output_file: Destination CSV path; any existing file is overwritten.
        chunk_size: Number of rows written per ``to_csv`` call (must be >= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step used to produce an empty range and silently write nothing.
        raise ValueError("chunk_size must be a positive integer")

    csv_options = dict(index=False, escapechar='\\', encoding='utf-8-sig')

    if len(df) == 0:
        # Still create the file (header only) so readers find the expected columns.
        df.to_csv(output_file, mode='w', header=True, **csv_options)

    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]
        is_first = i == 0
        # The first chunk truncates the file and writes the header; later ones append.
        chunk.to_csv(output_file,
                     mode='w' if is_first else 'a',
                     header=is_first,
                     **csv_options)
        # Clear memory
        del chunk
        gc.collect()
    print(f"Saved {len(df)} samples to {output_file} in chunks")

# Function 1: Load from Public Datasets (OSCAR) with memory optimization
def load_from_datasets(output_file="dataset_corpus.csv"):
    """Stream OSCAR samples for each supported language into ``output_file``.

    At most TARGET_PER_LANG rows are scanned per language, in chunks; rows
    failing ``filter_text_length`` are dropped and the remainder is appended
    as ``text,language`` CSV rows. A failure for one language is printed and
    does not stop the others.

    Args:
        output_file: CSV path to (re)create.

    Returns:
        Total number of samples written across all languages.
    """
    print("Loading from public datasets...")
    all_samples = 0
    oscar_langs = {
        "hi": "Hindi",
        "mr": "Marathi",
        "gu": "Gujarati",
        "bn": "Bengali",
        "ta": "Tamil",
        "kn": "Kannada"
    }

    # Clear any existing file
    with open(output_file, 'w', encoding='utf-8-sig'):
        pass

    for code, lang in oscar_langs.items():
        try:
            # NOTE(security): trust_remote_code=True runs the dataset's own
            # loading script; only keep it for sources you trust.
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            chunk_size = min(10000, TARGET_PER_LANG)
            # Walk the range in chunk_size steps instead of floor-dividing into
            # whole chunks, so the final partial chunk (and datasets smaller
            # than one chunk) are no longer dropped.
            rows_to_scan = min(TARGET_PER_LANG, len(dataset))

            total_lang_samples = 0
            for start_idx in range(0, rows_to_scan, chunk_size):
                end_idx = min(start_idx + chunk_size, rows_to_scan)

                chunk_data = dataset[start_idx:end_idx]
                chunk_df = pd.DataFrame(chunk_data)
                filtered_df = chunk_df[chunk_df['text'].apply(filter_text_length)]

                if not filtered_df.empty:
                    filtered_df = filtered_df[['text']].assign(language=lang)
                    # Decide header/mode from what has been written so far, not
                    # from the chunk index: the first chunk may be filtered out
                    # entirely, which previously left the file without a header.
                    is_first_write = all_samples == 0
                    filtered_df.to_csv(output_file,
                                       mode='w' if is_first_write else 'a',
                                       header=is_first_write,
                                       index=False,
                                       escapechar='\\',
                                       encoding='utf-8-sig')

                    samples_added = len(filtered_df)
                    total_lang_samples += samples_added
                    all_samples += samples_added

                # Clear memory
                del chunk_data, chunk_df, filtered_df
                gc.collect()

                # Stop if we've collected enough samples
                if total_lang_samples >= TARGET_PER_LANG:
                    break

            print(f"Loaded {total_lang_samples} {lang} samples from OSCAR")

        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")

    print(f"Saved total of {all_samples} samples from datasets to {output_file}")
    return all_samples

# Function 2: Web Scraping with memory optimization
def scrape_from_web(output_file="scraped_corpus.csv"):
    """Scrape paragraph text from the front pages of per-language news sites.

    For every URL the ``<p>`` elements are cleaned, filtered with
    ``filter_text_length`` and collected, up to ``TARGET_PER_LANG // len(urls)``
    samples per URL. Each language is written to ``output_file`` as soon as
    its URLs are done. A failing URL is printed and skipped.

    Args:
        output_file: CSV path to (re)create.

    Returns:
        Total number of samples written.
    """
    print("Scraping from web...")
    sites = {
        "Hindi": ["https://hindi.bbc.com", "https://www.bhaskar.com"],
        "Marathi": ["https://lokmat.com", "https://maharashtratimes.com"],
        "Sindhi": ["https://awamiawaz.pk"],
        "Gujarati": ["https://divyabhaskar.co.in", "https://sandesh.com"],
        "Bengali": ["https://anandabazar.com", "https://eisamay.com"],
        "Tamil": ["https://dinamalar.com", "https://dailythanthi.com"],
        "Kannada": ["https://prajavani.net", "https://vijaykarnataka.com"]
    }

    # Clear any existing file
    with open(output_file, 'w', encoding='utf-8-sig'):
        pass
    all_samples = 0

    headers = {"User-Agent": "Mozilla/5.0"}

    for lang, urls in sites.items():
        lang_texts = []
        max_samples_per_url = TARGET_PER_LANG // len(urls)

        for url in urls:
            # Count per URL: comparing the per-URL cap against the per-language
            # list stopped scraping as soon as the first URL filled the cap.
            url_samples = 0
            try:
                response = requests.get(url, headers=headers, timeout=10)
                # Treat HTTP error responses as failures rather than scraping
                # the error page.
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")

                for p in soup.find_all("p"):
                    if url_samples >= max_samples_per_url:
                        break

                    text = clean(p.get_text(), no_line_breaks=True, no_urls=True, no_emails=True)
                    if filter_text_length(text):
                        lang_texts.append({"text": text, "language": lang})
                        url_samples += 1

            except Exception as e:
                print(f"Error scraping {url}: {e}")

        # Save language data
        if lang_texts:
            lang_df = pd.DataFrame(lang_texts)
            is_first_write = all_samples == 0
            lang_df.to_csv(output_file,
                           mode='w' if is_first_write else 'a',
                           header=is_first_write,
                           index=False,
                           escapechar='\\',
                           encoding='utf-8-sig')

            all_samples += len(lang_df)
            print(f"Scraped {lang}: {len(lang_df)} samples")

            # Clear memory
            del lang_df, lang_texts
            gc.collect()

    print(f"Saved total of {all_samples} samples from web to {output_file}")
    return all_samples

# Function 3: OCR from PDFs using Kaggle path with memory optimization
def extract_from_pdfs(pdf_dir="/kaggle/input/pdddffs/allpdfs", output_file="pdf_corpus.csv", max_pdfs=10):
    """Extract language-labelled paragraphs from the PDFs in ``pdf_dir``.

    Each PDF is first read with pdfminer; when that fails or yields no text,
    OCR is used instead. Paragraphs (split on blank lines) that pass
    ``filter_text_length`` and whose detected language is in LANGUAGES are
    appended to ``output_file`` after every PDF.

    Args:
        pdf_dir: Directory containing the PDF files.
        output_file: CSV path to (re)create.
        max_pdfs: Maximum number of PDFs to process (defaults to the previous
            hard-coded limit of 10, kept low to prevent memory issues).

    Returns:
        Number of samples written, or 0 when ``pdf_dir`` does not exist (in
        which case ``output_file`` is left untouched).
    """
    print("Extracting from PDFs...")

    if not os.path.exists(pdf_dir):
        print(f"PDF directory {pdf_dir} not found, skipping...")
        return 0

    # Clear any existing file
    with open(output_file, 'w', encoding='utf-8-sig'):
        pass
    all_samples = 0

    # Sort so the selected subset is deterministic (os.listdir order is
    # arbitrary) and match the extension case-insensitively (".PDF").
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))[:max_pdfs]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_corpus = []
        text = None
        paragraphs = None

        try:
            # Try pdfminer for searchable PDFs first; a parser failure must not
            # skip the file, it should fall through to OCR.
            try:
                text = extract_text(pdf_path)
            except Exception as e:
                print(f"Primary PDF extraction failed: {e}, using backup method")
                text = ""

            if not text.strip():  # If empty, use OCR
                text = extract_text_from_pdf(pdf_path, batch_size=5)  # Reduced batch size

            if not text:
                continue

            # Split into paragraphs
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

            for para in paragraphs:
                if filter_text_length(para):
                    top_lang, lang_probs, _ = detect_language_unicode(para)
                    if top_lang in LANGUAGES:  # Only include target languages
                        pdf_corpus.append({"text": para, "language": top_lang})

            # Save immediately after each PDF to prevent memory buildup
            if pdf_corpus:
                pdf_df = pd.DataFrame(pdf_corpus)
                is_first_write = all_samples == 0
                pdf_df.to_csv(output_file,
                              mode='w' if is_first_write else 'a',
                              header=is_first_write,
                              index=False,
                              escapechar='\\',
                              encoding='utf-8-sig')

                all_samples += len(pdf_df)
                print(f"Extracted {len(pdf_df)} samples from {pdf_file}")

                # Clear memory
                del pdf_df, pdf_corpus
                gc.collect()

        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

        # Clear more memory
        text = None
        paragraphs = None
        gc.collect()

    print(f"Saved total of {all_samples} samples from PDFs to {output_file}")
    return all_samples

# Main function to build the corpus with memory optimizations
def build_corpus(output_file="indic_corpus.csv"):
    """Build the balanced multilingual corpus and write it to ``output_file``.

    Runs the three collectors (public datasets, web scraping, PDFs) into
    temporary CSV files, then merges them language by language, keeping at
    most TARGET_PER_LANG samples per language (randomly sampled with a fixed
    seed when more are available). Temporary files are removed afterwards.

    Args:
        output_file: Path of the final CSV.
    """
    print("Building the corpus...")

    dataset_file = "temp_dataset_corpus.csv"
    scraped_file = "temp_scraped_corpus.csv"
    pdf_file = "temp_pdf_corpus.csv"

    # Process each source individually and merge at the end
    dataset_count = load_from_datasets(dataset_file)
    gc.collect()  # Force garbage collection

    scraped_count = scrape_from_web(scraped_file)
    gc.collect()  # Force garbage collection

    pdf_count = extract_from_pdfs(output_file=pdf_file)
    gc.collect()  # Force garbage collection

    # If we have data from any source, merge and balance
    if dataset_count + scraped_count + pdf_count > 0:
        print("Merging and balancing corpus...")

        # Only merge sources that produced rows in this run: a collector that
        # returned 0 may have left a stale file from an earlier run on disk.
        source_files = [
            path
            for path, count in (
                (dataset_file, dataset_count),
                (scraped_file, scraped_count),
                (pdf_file, pdf_count),
            )
            if count > 0 and os.path.exists(path) and os.path.getsize(path) > 0
        ]

        # Create a new output file
        with open(output_file, 'w', encoding='utf-8-sig'):
            pass

        total_written = 0
        header_written = False

        # Process each language separately
        for lang in LANGUAGES:
            lang_samples = []
            collected = 0

            # Process each source file in chunks to avoid memory issues
            for source_file in source_files:
                for chunk in pd.read_csv(source_file,
                                         chunksize=10000,
                                         encoding='utf-8-sig',
                                         escapechar='\\'):
                    # Filter for current language
                    lang_chunk = chunk[chunk['language'] == lang]
                    if not lang_chunk.empty:
                        lang_samples.append(lang_chunk)
                        collected += len(lang_chunk)

                        # If we have enough samples, stop reading this file
                        if collected >= TARGET_PER_LANG:
                            break

                # Clear memory
                gc.collect()

            if not lang_samples:
                print(f"No samples found for {lang}")
                continue

            # Combine all chunks for this language
            combined_lang = pd.concat(lang_samples)

            # Sample if we have more than needed
            if len(combined_lang) > TARGET_PER_LANG:
                combined_lang = combined_lang.sample(n=TARGET_PER_LANG, random_state=42)

            # Save this language to the final file
            combined_lang.to_csv(output_file,
                                 mode='a' if header_written else 'w',
                                 header=not header_written,
                                 index=False,
                                 escapechar='\\',
                                 encoding='utf-8-sig')
            header_written = True
            # Count rows as they are written: counting lines of the CSV is
            # wrong because quoted text fields can span several lines.
            total_written += len(combined_lang)

            print(f"Added {len(combined_lang)} {lang} samples to final corpus")

            # Clear memory
            del combined_lang, lang_samples
            gc.collect()

        print(f"Final corpus saved to {output_file} with {total_written} samples")
    else:
        print("No data collected!")

    # Clean up temporary files (also when nothing was collected)
    for temp_file in (dataset_file, scraped_file, pdf_file):
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Build the corpus when this cell/script is executed directly.
if __name__ == "__main__":
    build_corpus()

# Reload the finished corpus for inspection.
df = pd.read_csv(
    "indic_corpus.csv",
    escapechar='\\',
    encoding='utf-8-sig',
)

In [None]:
# Install Tesseract plus the language packs for Hindi, Marathi, Bengali, Tamil,
# Kannada, English, Urdu and Simplified Chinese (the Chinese pack is `chi-sim`).
!apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-mar tesseract-ocr-ben tesseract-ocr-tam tesseract-ocr-kan tesseract-ocr-eng tesseract-ocr-urd tesseract-ocr-chi-sim

In [None]:
# Install the Python packages used by the OCR, scraping and dataset-loading code.
!pip install pdf2image pytesseract opencv-python pillow clean-text langdetect datasets requests beautifulsoup4

In [None]:
# NOTE(review): `pdfminer` appears to be a different distribution from
# `pdfminer.six` (pinned in the next cell); installing both may be what causes
# the "pdfminer import issue" handled further down — confirm.
!pip install pdfminer pdf2image pytesseract langdetect

In [None]:
# Pin pdfminer.six to a fixed release.
!pip install pdfminer.six==20231228


In [None]:
# NOTE(review): `cleantext` is a different distribution name from the
# `clean-text` installed above; verify which one provides the
# `clean(..., no_line_breaks=..., no_urls=..., no_emails=...)` call used below.
!pip install cleantext

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
# Fix for pdfminer import issue
try:
    from pdfminer.high_level import extract_text
except ImportError:
    # Create a fallback function if pdfminer fails to import properly
    def extract_text(pdf_path):
        """Stand-in for pdfminer's ``extract_text``: print a notice and return "".

        The empty result makes ``extract_from_pdfs`` fall through to the OCR path.
        """
        print(f"Using fallback extraction for {pdf_path} due to pdfminer import issue")
        return ""
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
from cleantext import clean
import gc
from langdetect import detect_langs
import concurrent.futures
import time

# Expanded set of target languages and collection limits.
LANGUAGES = [
    "Hindi",
    "Marathi",
    "Sindhi",
    "Gujarati",
    "Bengali",
    "Tamil",
    "Kannada",
    "Telugu",
    "Malayalam",
    "Punjabi",
    "Odia",
    "Assamese",
]
# Samples to keep per language (reduced for faster runtime).
TARGET_PER_LANG = 250_000
# Minimum number of whitespace-separated words a text needs to be kept.
MIN_WORDS = 50

def filter_text_length(text):
    """Return True when *text* is a string of at least MIN_WORDS whitespace-separated words."""
    if isinstance(text, str):
        word_count = len(text.split())
        return word_count >= MIN_WORDS
    # Non-string values (e.g. NaN from a DataFrame column) never qualify.
    return False

def preprocess_image(image):
    """Binarize a scanned page image to improve OCR accuracy.

    Args:
        image: A PIL image in RGB channel order (as produced by pdf2image).

    Returns:
        A PIL image built from the Otsu-thresholded grayscale page.
    """
    # Arrays built from PIL images are RGB-ordered, so RGB2GRAY is the correct
    # conversion; BGR2GRAY would swap the red and blue luminance weights.
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is computed from the histogram; the 150
    # passed here is ignored by OpenCV.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a 1x1 kernel makes this dilation an identity operation;
    # kept so the output stays identical to the previous pipeline.
    kernel = np.ones((1, 1), np.uint8)
    return Image.fromarray(cv2.dilate(binary, kernel, iterations=1))

def extract_text_from_pdf(pdf_path, batch_size=5, max_pages=10):
    """Extract text from a scanned PDF using OCR, rasterizing one batch of pages at a time.

    Args:
        pdf_path: Path of the PDF file.
        batch_size: Number of pages rasterized and OCR'd per batch.
        max_pages: Highest page number to process (defaults to the previous
            hard-coded limit of 10).

    Returns:
        The recognized text with pages separated by newlines (NUL characters
        removed, each page truncated to 100000 characters), or "" when any
        step fails (the error is printed, not raised).
    """
    try:
        print(f"Processing PDF: {pdf_path}")
        start_time = time.time()
        page_texts = []
        for first_page in range(1, max_pages + 1, batch_size):
            last_page = min(first_page + batch_size - 1, max_pages)
            # Rasterize only this batch so at most batch_size page images are
            # alive at once (converting every page up front defeats batching).
            batch_images = convert_from_path(
                pdf_path, dpi=150, first_page=first_page, last_page=last_page
            )  # Reduced DPI and pages
            for image in batch_images:
                processed_image = preprocess_image(image)
                page_text = pytesseract.image_to_string(
                    processed_image,
                    # 'chi_sim' is the Tesseract code of the Chinese pack this
                    # notebook installs; plain 'chi' is not a language code.
                    # NOTE(review): no Gujarati ('guj') model is listed although
                    # Gujarati is a target language — confirm.
                    lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi_sim',
                    config="--psm 6 --oem 1"
                )
                page_texts.append(page_text.replace('\0', '')[:100000])  # Clean NULL, truncate
            # A short batch means the document has no further pages.
            reached_end = len(batch_images) < last_page - first_page + 1
            del batch_images
            gc.collect()
            if reached_end:
                break
        print(f"Finished {pdf_path} in {time.time() - start_time:.2f}s")
        return "\n".join(page_texts).strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def detect_language_unicode(text):
    """Detect the dominant language of *text* from Unicode script ranges.

    Languages that share a script get identical counts, and ties are resolved
    by table order: Devanagari text is always reported as Hindi (never
    Marathi), Arabic-script text as Sindhi (never Urdu) and Bengali-script
    text as Bengali (never Assamese). Shared scripts are also counted once per
    language, so the probabilities are shares of the summed per-language
    counts rather than of the number of characters.

    Args:
        text: The string to classify.

    Returns:
        A 3-tuple ``(top_lang, lang_probs, texts)``:
        ``(lang, probs, {lang: text})`` when the top language is in LANGUAGES,
        ``(None, probs, {})`` when it is not, and ``(None, {}, {})`` when no
        character falls in any known range.
    """
    script_ranges = {
        'Hindi': (('\u0900', '\u097F'),),
        'Marathi': (('\u0900', '\u097F'),),
        'Sindhi': (('\u0600', '\u06FF'),),
        'Gujarati': (('\u0A80', '\u0AFF'),),
        'Bengali': (('\u0980', '\u09FF'),),
        'Tamil': (('\u0B80', '\u0BFF'),),
        'Kannada': (('\u0C80', '\u0CFF'),),
        'Telugu': (('\u0C00', '\u0C7F'),),
        'Malayalam': (('\u0D00', '\u0D7F'),),
        'Punjabi': (('\u0A00', '\u0A7F'),),
        'Odia': (('\u0B00', '\u0B7F'),),
        'Assamese': (('\u0980', '\u09FF'),),
        'Urdu': (('\u0600', '\u06FF'),),
        'Chinese': (('\u4E00', '\u9FFF'),),
        # Only A-Z and a-z: a single A..z span would also count [ \ ] ^ _ `.
        'English': (('\u0041', '\u005A'), ('\u0061', '\u007A')),
    }
    lang_counts = {
        lang: sum(
            1 for char in text
            if any(low <= char <= high for low, high in ranges)
        )
        for lang, ranges in script_ranges.items()
    }
    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None, {}, {}
    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    # max() keeps the first maximal key, i.e. the earliest table entry on ties.
    top_lang = max(lang_probs, key=lang_probs.get)
    if top_lang in LANGUAGES:
        return top_lang, lang_probs, {top_lang: text}
    return None, lang_probs, {}

def save_df_in_chunks(df, output_file, chunk_size=5000):
    """Save a large dataframe to CSV in chunks to avoid memory issues.

    Args:
        df: The DataFrame to write.
        output_file: Destination CSV path; any existing file is overwritten.
        chunk_size: Number of rows written per ``to_csv`` call (must be >= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step used to produce an empty range and silently write nothing.
        raise ValueError("chunk_size must be a positive integer")

    csv_options = dict(index=False, escapechar='\\', encoding='utf-8-sig')

    if len(df) == 0:
        # Still create the file (header only) so readers find the expected columns.
        df.to_csv(output_file, mode='w', header=True, **csv_options)

    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]
        is_first = i == 0
        # The first chunk truncates the file and writes the header; later ones append.
        chunk.to_csv(output_file, mode='w' if is_first else 'a', header=is_first, **csv_options)
        del chunk
        gc.collect()

def load_from_datasets(output_file="dataset_corpus.csv"):
    """Stream OSCAR samples for each supported language into ``output_file``.

    At most TARGET_PER_LANG rows are scanned per language in chunks of 5000.
    NUL characters are stripped and texts truncated to 100000 characters, rows
    failing ``filter_text_length`` are dropped, and the remainder is appended
    as ``text,language`` CSV rows. A failure for one language is printed and
    does not stop the others.

    Args:
        output_file: CSV path to (re)create.

    Returns:
        Total number of samples written across all languages.
    """
    print("Loading from public datasets...")
    oscar_langs = {
        "hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali",
        "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam",
        "pa": "Punjabi", "or": "Odia", "as": "Assamese"
    }
    with open(output_file, 'w', encoding='utf-8-sig'):
        pass
    all_samples = 0

    for code, lang in oscar_langs.items():
        try:
            print(f"Loading OSCAR for {lang}...")
            # NOTE(security): trust_remote_code=True runs the dataset's own
            # loading script; only keep it for sources you trust.
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            chunk_size = 5000
            # Walk the range in chunk_size steps instead of floor-dividing into
            # whole chunks, so the final partial chunk (and datasets smaller
            # than one chunk) are no longer dropped.
            rows_to_scan = min(TARGET_PER_LANG, len(dataset))
            total_lang_samples = 0

            for start_idx in range(0, rows_to_scan, chunk_size):
                end_idx = min(start_idx + chunk_size, rows_to_scan)
                chunk_data = dataset[start_idx:end_idx]
                chunk_df = pd.DataFrame(chunk_data)
                chunk_df['text'] = chunk_df['text'].apply(lambda x: x.replace('\0', '')[:100000] if isinstance(x, str) else x)
                filtered_df = chunk_df[chunk_df['text'].apply(filter_text_length)]
                if not filtered_df.empty:
                    filtered_df = filtered_df[['text']].assign(language=lang)
                    is_first_write = all_samples == 0
                    filtered_df.to_csv(output_file, mode='w' if is_first_write else 'a', header=is_first_write, index=False, escapechar='\\', encoding='utf-8-sig')
                    total_lang_samples += len(filtered_df)
                    all_samples += len(filtered_df)
                del chunk_data, chunk_df, filtered_df
                gc.collect()
                if total_lang_samples >= TARGET_PER_LANG:
                    break
            print(f"Loaded {total_lang_samples} {lang} samples from OSCAR")
        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")
    print(f"Saved total of {all_samples} samples from datasets to {output_file}")
    return all_samples

def scrape_url(url, lang, max_samples):
    """Scrape up to ``max_samples`` qualifying paragraphs from a single page.

    Args:
        url: Page to download.
        lang: Language label attached to every sample from this page.
        max_samples: Maximum number of samples to return.

    Returns:
        A list of ``{"text": ..., "language": lang}`` dicts; empty when the
        request or parsing fails (the error is printed, not raised).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    lang_texts = []
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Treat HTTP error responses as failures rather than scraping the
        # error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        for p in soup.find_all("p"):
            if len(lang_texts) >= max_samples:
                break
            # Clean the paragraph, drop NUL characters and cap its length.
            text = clean(p.get_text(), no_line_breaks=True, no_urls=True, no_emails=True).replace('\0', '')[:100000]
            if filter_text_length(text):
                lang_texts.append({"text": text, "language": lang})
        print(f"Scraped {url}: {len(lang_texts)} samples")
    except Exception as e:
        print(f"Error scraping {url}: {e}")
    return lang_texts

def scrape_from_web(output_file="scraped_corpus.csv"):
    """Scrape the configured news sites per language and append the samples to ``output_file``.

    The URLs of one language are fetched concurrently via ``scrape_url``; the
    combined result is written before moving on to the next language.

    Args:
        output_file: CSV path to (re)create.

    Returns:
        Total number of samples written.
    """
    print("Scraping from web...")
    sites = {
        "Hindi": ["https://hindi.bbc.com", "https://www.bhaskar.com"],
        "Marathi": ["https://lokmat.com", "https://maharashtratimes.com"],
        "Sindhi": ["https://awamiawaz.pk"],
        "Gujarati": ["https://divyabhaskar.co.in", "https://sandesh.com"],
        "Bengali": ["https://anandabazar.com", "https://eisamay.com"],
        "Tamil": ["https://dinamalar.com", "https://dailythanthi.com"],
        "Kannada": ["https://prajavani.net", "https://vijaykarnataka.com"],
        "Telugu": ["https://eenadu.net", "https://sakshi.com"],
        "Malayalam": ["https://mathrubhumi.com", "https://www.manoramaonline.com"],
        "Punjabi": ["https://punjabitribuneonline.com", "https://www.ajitjalandhar.com"],
        "Odia": ["https://sambad.in", "https://dharitri.com"],
        "Assamese": ["https://asomiyapratidin.in", "https://pratidintime.com"]
    }
    # Start from an empty output file.
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        for lang, urls in sites.items():
            per_url_cap = TARGET_PER_LANG // len(urls)
            pending = [executor.submit(scrape_url, url, lang, per_url_cap) for url in urls]

            # Gather results in completion order.
            collected = []
            for finished in concurrent.futures.as_completed(pending):
                collected.extend(finished.result())
            if not collected:
                continue

            lang_df = pd.DataFrame(collected)
            # The very first write creates the file with a header; later ones append.
            if all_samples == 0:
                mode, header = 'w', True
            else:
                mode, header = 'a', False
            lang_df.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
            all_samples += len(lang_df)
            print(f"Scraped {lang}: {len(lang_df)} samples")
            del lang_df, collected
            gc.collect()
    print(f"Saved total of {all_samples} samples from web to {output_file}")
    return all_samples

def extract_from_pdfs(pdf_dir="/kaggle/input/pdddffs/allpdfs", output_file="pdf_corpus.csv", max_pdfs=5):
    """Extract language-labelled paragraphs from the PDFs in ``pdf_dir``.

    Each PDF is first read with ``extract_text``; when that fails or yields no
    text, OCR is used instead. Paragraphs (split on blank lines) that pass
    ``filter_text_length`` and whose detected language is in LANGUAGES are
    appended to ``output_file`` after every PDF.

    Args:
        pdf_dir: Directory containing the PDF files.
        output_file: CSV path to (re)create.
        max_pdfs: Maximum number of PDFs to process (defaults to the previous
            hard-coded limit of 5).

    Returns:
        Number of samples written, or 0 when ``pdf_dir`` does not exist (in
        which case ``output_file`` is left untouched).
    """
    print("Extracting from PDFs...")
    if not os.path.exists(pdf_dir):
        print(f"PDF directory {pdf_dir} not found, skipping...")
        return 0
    with open(output_file, 'w', encoding='utf-8-sig'):
        pass
    all_samples = 0
    # Sort so the selected subset is deterministic (os.listdir order is
    # arbitrary) and match the extension case-insensitively (".PDF").
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))[:max_pdfs]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_corpus = []
        try:
            # Modified to handle pdfminer error more gracefully
            try:
                text = extract_text(pdf_path)
            except Exception as e:
                print(f"Primary PDF extraction failed: {e}, using backup method")
                text = ""

            if not text.strip():
                text = extract_text_from_pdf(pdf_path)
            if not text:
                continue
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            for para in paragraphs:
                if filter_text_length(para):
                    top_lang, _, _ = detect_language_unicode(para)
                    if top_lang in LANGUAGES:
                        pdf_corpus.append({"text": para, "language": top_lang})
            if pdf_corpus:
                pdf_df = pd.DataFrame(pdf_corpus)
                is_first_write = all_samples == 0
                pdf_df.to_csv(output_file, mode='w' if is_first_write else 'a', header=is_first_write, index=False, escapechar='\\', encoding='utf-8-sig')
                all_samples += len(pdf_df)
                print(f"Extracted {len(pdf_df)} samples from {pdf_file}")
                del pdf_df, pdf_corpus
                gc.collect()
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
    print(f"Saved total of {all_samples} samples from PDFs to {output_file}")
    return all_samples

def build_corpus(output_file="indic_corpus.csv"):
    """Build the balanced multilingual corpus and write it to ``output_file``.

    Runs the three collectors (public datasets, web scraping, PDFs) into
    temporary CSV files, then merges them language by language, keeping at
    most TARGET_PER_LANG samples per language (randomly sampled with a fixed
    seed when more are available). Temporary files are removed afterwards.

    Args:
        output_file: Path of the final CSV.
    """
    print("Building the corpus...")
    dataset_file = "temp_dataset_corpus.csv"
    scraped_file = "temp_scraped_corpus.csv"
    pdf_file = "temp_pdf_corpus.csv"

    dataset_count = load_from_datasets(dataset_file)
    gc.collect()
    scraped_count = scrape_from_web(scraped_file)
    gc.collect()
    pdf_count = extract_from_pdfs(output_file=pdf_file)
    gc.collect()

    if dataset_count + scraped_count + pdf_count > 0:
        print("Merging and balancing corpus...")
        # Only merge sources that produced rows in this run: a collector that
        # returned 0 may have left a stale file from an earlier run on disk.
        source_files = [
            path
            for path, count in (
                (dataset_file, dataset_count),
                (scraped_file, scraped_count),
                (pdf_file, pdf_count),
            )
            if count > 0 and os.path.exists(path) and os.path.getsize(path) > 0
        ]
        with open(output_file, 'w', encoding='utf-8-sig'):
            pass
        total_written = 0
        header_written = False

        for lang in LANGUAGES:
            lang_samples = []
            collected = 0
            for source_file in source_files:
                for chunk in pd.read_csv(source_file, chunksize=5000, encoding='utf-8-sig', escapechar='\\'):
                    lang_chunk = chunk[chunk['language'] == lang]
                    if not lang_chunk.empty:
                        lang_samples.append(lang_chunk)
                        collected += len(lang_chunk)
                        # Enough rows for this language: stop reading this file.
                        if collected >= TARGET_PER_LANG:
                            break
                gc.collect()
            if not lang_samples:
                continue

            combined_lang = pd.concat(lang_samples)
            if len(combined_lang) > TARGET_PER_LANG:
                combined_lang = combined_lang.sample(n=TARGET_PER_LANG, random_state=42)
            combined_lang.to_csv(output_file, mode='a' if header_written else 'w', header=not header_written, index=False, escapechar='\\', encoding='utf-8-sig')
            header_written = True
            # Count rows as they are written: counting lines of the CSV is
            # wrong because quoted text fields can span several lines (and it
            # also left a file handle open).
            total_written += len(combined_lang)
            print(f"Added {len(combined_lang)} {lang} samples to final corpus")
            del combined_lang, lang_samples
            gc.collect()
        print(f"Final corpus saved to {output_file} with {total_written} samples")
    else:
        print("No data collected!")

    # Clean up temporary files (also when nothing was collected)
    for temp_file in (dataset_file, scraped_file, pdf_file):
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Run the full pipeline only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    build_corpus()

In [None]:
import gzip
import pandas as pd

# Decompress and clean NULL bytes
# Stream in fixed-size blocks so the whole decompressed corpus is never held
# in memory at once; NUL is a single byte, so per-block removal is safe.
with gzip.open('/kaggle/working/dataset_corpus.csv.gz', 'rb') as f_in:
    with open('/kaggle/working/dataset_corpus_clean.csv', 'wb') as f_out:
        for block in iter(lambda: f_in.read(1024 * 1024), b''):
            f_out.write(block.replace(b'\0', b''))

# Load cleaned CSV
df = pd.read_csv('/kaggle/working/dataset_corpus_clean.csv', engine='python', escapechar='\\')
print(f"OSCAR samples: {len(df)}")
# NOTE: both helpers return the number of samples written (an int), not a
# DataFrame; the data itself lands in their default output CSV files. The
# variable names are kept because later cells may refer to them.
scraped_df = scrape_from_web()
pdf_df = extract_from_pdfs()

In [None]:
import pandas as pd
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
import gc
import time

# Target languages for the PDF-only extraction run.
LANGUAGES = [
    "Hindi",
    "Marathi",
    "Sindhi",
    "Gujarati",
    "Bengali",
    "Tamil",
    "Kannada",
    "Telugu",
    "Malayalam",
    "Punjabi",
    "Odia",
    "Assamese",
]
# Overall sample target: 500K from PDFs.
TARGET_TOTAL = 500_000
# Minimum number of whitespace-separated words a text needs to be kept.
MIN_WORDS = 50

def filter_text_length(text):
    """Tell whether *text* is a string containing at least MIN_WORDS whitespace-separated words."""
    is_string = isinstance(text, str)
    # Short-circuit so non-string values are rejected without being split.
    return is_string and len(text.split()) >= MIN_WORDS

def preprocess_image(image):
    """Binarize a scanned page image to improve OCR accuracy.

    Args:
        image: A PIL image in RGB channel order (as produced by pdf2image).

    Returns:
        A PIL image built from the Otsu-thresholded grayscale page.
    """
    # Arrays built from PIL images are RGB-ordered, so RGB2GRAY is the correct
    # conversion; BGR2GRAY would swap the red and blue luminance weights.
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is computed from the histogram; the 150
    # passed here is ignored by OpenCV.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a 1x1 kernel makes this dilation an identity operation;
    # kept so the output stays identical to the previous pipeline.
    kernel = np.ones((1, 1), np.uint8)
    return Image.fromarray(cv2.dilate(binary, kernel, iterations=1))

def extract_text_from_pdf(pdf_path, batch_size=5, max_pages=20):
    """OCR the leading pages of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF file to rasterise and OCR.
        batch_size: Number of pages rasterised (and held in memory) at once.
            Values below 1 are treated as 1.
        max_pages: Only pages 1..max_pages are processed.

    Returns:
        str: The per-page OCR output joined with newlines and stripped, or an
        empty string if any step raised (the error is printed, not re-raised).
    """
    try:
        print(f"Processing PDF: {pdf_path}")
        start_time = time.time()
        pages_per_batch = max(1, batch_size)
        page_texts = []
        # Rasterise only one batch of pages per call so that at most
        # `pages_per_batch` page images are alive at any time.
        for first_page in range(1, max_pages + 1, pages_per_batch):
            last_page = min(first_page + pages_per_batch - 1, max_pages)
            batch_images = convert_from_path(
                pdf_path, dpi=150, first_page=first_page, last_page=last_page
            )
            if not batch_images:
                break  # requested range lies past the end of the document
            for image in batch_images:
                processed_image = preprocess_image(image)
                # NOTE(review): 'chi' is not a standard Tesseract traineddata
                # name (chi_sim / chi_tra are), and no Gujarati ('guj') pack is
                # requested although Gujarati is a target language - confirm
                # against the language packs actually installed.
                page_text = pytesseract.image_to_string(
                    processed_image,
                    lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi',
                    config="--psm 6 --oem 1"
                )
                # Drop NUL characters and cap each page at 100000 characters.
                page_texts.append(page_text.replace('\0', '')[:100000])
            reached_end = len(batch_images) < last_page - first_page + 1
            del batch_images
            gc.collect()
            if reached_end:
                break  # short batch: the document has no further pages
        print(f"Finished {pdf_path} in {time.time() - start_time:.2f}s")
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Best-effort extraction: report the failure and let the caller skip the file.
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def detect_language_unicode(text):
    """Guess the language of ``text`` from the Unicode blocks of its characters.

    Each candidate language is scored by the number of characters that fall in
    its script's code-point range. Languages that share a script (Hindi and
    Marathi; Bengali and Assamese; Sindhi and Urdu) always receive identical
    scores, and ties are resolved in favour of the language listed first, so
    the later language of each pair is never returned.

    Args:
        text: The string to classify. Non-string values are treated as empty.

    Returns:
        tuple: ``(top_lang, lang_probs, samples)`` where
            * ``top_lang`` is the best-scoring language if it is in
              ``LANGUAGES``, otherwise ``None``;
            * ``lang_probs`` maps every candidate language to its share of the
              summed scores (``{}`` when no character was counted);
            * ``samples`` is ``{top_lang: text}`` when a language was accepted,
              otherwise ``{}``.
    """
    if not isinstance(text, str):
        return None, {}, {}

    # Inclusive code-point ranges for each script that is counted.
    script_ranges = {
        'Devanagari': (('\u0900', '\u097F'),),
        'Arabic': (('\u0600', '\u06FF'),),
        'Gujarati': (('\u0A80', '\u0AFF'),),
        'Bengali': (('\u0980', '\u09FF'),),
        'Tamil': (('\u0B80', '\u0BFF'),),
        'Kannada': (('\u0C80', '\u0CFF'),),
        'Telugu': (('\u0C00', '\u0C7F'),),
        'Malayalam': (('\u0D00', '\u0D7F'),),
        'Gurmukhi': (('\u0A00', '\u0A7F'),),
        'Odia': (('\u0B00', '\u0B7F'),),
        'CJK': (('\u4E00', '\u9FFF'),),
        # Only A-Z and a-z: the punctuation between 'Z' and 'a' is not a letter.
        'Latin': (('\u0041', '\u005A'), ('\u0061', '\u007A')),
    }
    # Order matters: max() below returns the first language with the top score.
    language_scripts = (
        ('Hindi', 'Devanagari'),
        ('Marathi', 'Devanagari'),
        ('Sindhi', 'Arabic'),
        ('Gujarati', 'Gujarati'),
        ('Bengali', 'Bengali'),
        ('Tamil', 'Tamil'),
        ('Kannada', 'Kannada'),
        ('Telugu', 'Telugu'),
        ('Malayalam', 'Malayalam'),
        ('Punjabi', 'Gurmukhi'),
        ('Odia', 'Odia'),
        ('Assamese', 'Bengali'),
        ('Urdu', 'Arabic'),
        ('Chinese', 'CJK'),
        ('English', 'Latin'),
    )

    # Count each script once, then share the count between its languages.
    script_counts = {
        script: sum(
            1 for char in text
            if any(low <= char <= high for low, high in ranges)
        )
        for script, ranges in script_ranges.items()
    }
    lang_counts = {lang: script_counts[script] for lang, script in language_scripts}

    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None, {}, {}
    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    top_lang = max(lang_probs, key=lang_probs.get)
    if top_lang in LANGUAGES:
        return top_lang, lang_probs, {top_lang: text}
    return None, lang_probs, {}

def extract_from_pdfs(pdf_dir="/kaggle/input/pdddffs/allpdfs", output_file="pdf_corpus.csv.gz"):
    """Build a gzip-compressed CSV corpus of language-labelled paragraphs from PDFs.

    For every PDF in ``pdf_dir`` the embedded text layer is read first; if it
    is empty the file is OCR'd instead. The text is split on blank lines, and
    each paragraph that is long enough and classified as one of ``LANGUAGES``
    is appended to ``output_file`` with columns ``text`` and ``language``.
    Processing stops once ``TARGET_TOTAL`` samples have been written.

    WARNING: before extracting, this deletes every ``.csv`` and ``.gz`` file
    directly inside ``/kaggle/working`` as well as any existing ``output_file``.

    Args:
        pdf_dir: Directory containing the PDF files.
        output_file: Path of the gzip-compressed CSV to (re)create.

    Returns:
        int: Number of samples written (0 if ``pdf_dir`` is not a directory).
    """
    print("Extracting from PDFs...")
    if not os.path.isdir(pdf_dir):
        print(f"PDF directory {pdf_dir} not found, skipping...")
        return 0

    # Remove earlier outputs. Skip quietly when the working directory does not
    # exist, and never try to os.remove() a sub-directory.
    working_dir = "/kaggle/working"
    if os.path.isdir(working_dir):
        for name in os.listdir(working_dir):
            stale_path = os.path.join(working_dir, name)
            if name.endswith((".csv", ".gz")) and os.path.isfile(stale_path):
                os.remove(stale_path)
    if os.path.exists(output_file):
        os.remove(output_file)

    all_samples = 0
    # Sorted so that the set of PDFs processed before TARGET_TOTAL is reached
    # is reproducible; the extension match is case-insensitive.
    pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        try:
            text = extract_text(pdf_path)
            if not text.strip():
                # No usable text layer: fall back to OCR.
                text = extract_text_from_pdf(pdf_path)
            if not text:
                continue

            pdf_corpus = []
            for para in text.split("\n\n"):
                para = para.strip()
                if not para or not filter_text_length(para):
                    continue
                top_lang, _, _ = detect_language_unicode(para)
                if top_lang in LANGUAGES:
                    pdf_corpus.append({"text": para, "language": top_lang})
            if not pdf_corpus:
                continue

            pdf_df = pd.DataFrame(pdf_corpus)
            first_write = all_samples == 0
            pdf_df.to_csv(
                output_file,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False,
                escapechar='\\',
                # 'utf-8-sig' emits a byte-order mark; write it only at the very
                # start of the file so appended chunks cannot inject a stray
                # BOM into the middle of the data.
                encoding='utf-8-sig' if first_write else 'utf-8',
                compression='gzip',
            )
            all_samples += len(pdf_df)
            print(f"Extracted {len(pdf_df)} samples from {pdf_file}")
            del pdf_df, pdf_corpus
            gc.collect()
            if all_samples >= TARGET_TOTAL:
                break
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole run.
            print(f"Error processing {pdf_path}: {e}")
    print(f"Saved total of {all_samples} samples from PDFs to {output_file}")
    return all_samples

# Entry point: run the PDF extraction with its default directory and output
# file only when this code executes as the main module (not when imported).
if __name__ == "__main__":
    extract_from_pdfs()