In [None]:


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
!apt-get update && apt-get install -y poppler-utils tesseract-ocr

In [None]:
!pip install pandas requests beautifulsoup4 scrapy datasets pdfminer.six clean-text

In [None]:
!pip install langdetect

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
from cleantext import clean
import gc
from langdetect import detect_langs

LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada"]
TARGET_PER_LANG = 250000  
MIN_WORDS = 50

def filter_text_length(text):
    if not isinstance(text, str):
        return False
    words = text.split()
    return len(words) >= MIN_WORDS

def preprocess_image(image):
    """Preprocess scanned images to improve OCR accuracy."""
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    kernel = np.ones((1, 1), np.uint8)
    processed_image = cv2.dilate(binary, kernel, iterations=1)
    return Image.fromarray(processed_image)

def extract_text_from_pdf(pdf_path, batch_size=10):  
    """Extract text from scanned PDFs using OCR in batches."""
    try:
        images = convert_from_path(pdf_path, dpi=200, first_page=1, last_page=20)  
        text = ""
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i + batch_size]
            for image in batch_images:
                processed_image = preprocess_image(image)
                text += pytesseract.image_to_string(
                    processed_image,
                    lang='hin+mar+ben+tam+kan+eng+urd+chi',
                    config="--psm 6 --oem 1"
                ) + "\n"
            del batch_images
            processed_image = None
            gc.collect()
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def detect_language_unicode(text):
    """Detects and labels language based on Unicode ranges."""
    lang_counts = {
        'Hindi': sum(1 for char in text if '\u0900' <= char <= '\u097F'),  # Devanagari
        'Marathi': sum(1 for char in text if '\u0900' <= char <= '\u097F'), # Devanagari (overlap with Hindi)
        'Sindhi': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),  # Arabic script
        'Gujarati': sum(1 for char in text if '\u0A80' <= char <= '\u0AFF'),
        'Bengali': sum(1 for char in text if '\u0980' <= char <= '\u09FF'),
        'Tamil': sum(1 for char in text if '\u0B80' <= char <= '\u0BFF'),
        'Kannada': sum(1 for char in text if '\u0C80' <= char <= '\u0CFF'),
        'Urdu': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),    # Arabic script
        'English': sum(1 for char in text if '\u0041' <= char <= '\u007A' or '\u0061' <= char <= '\u007A')
    }
    
    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None, {}, {}
    
    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    top_lang = max(lang_probs, key=lang_probs.get)
    
    if top_lang in LANGUAGES:
        return top_lang, lang_probs, {top_lang: text}
    return None, lang_probs, {}

def save_df_in_chunks(df, output_file, chunk_size=10000):
    """Save large dataframe in chunks to avoid memory issues."""
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]
        mode = 'w' if i == 0 else 'a'
        header = i == 0
        chunk.to_csv(output_file, 
                    mode=mode, 
                    header=header, 
                    index=False, 
                    escapechar='\\', 
                    encoding='utf-8-sig')
        # Clear memory
        del chunk
        gc.collect()
    print(f"Saved {len(df)} samples to {output_file} in chunks")

def load_from_datasets(output_file="dataset_corpus.csv"):
    print("Loading from public datasets...")
    all_samples = 0
    oscar_langs = {
        "hi": "Hindi",
        "mr": "Marathi",
        "gu": "Gujarati",
        "bn": "Bengali",
        "ta": "Tamil",
        "kn": "Kannada"
    }
    
    open(output_file, 'w', encoding='utf-8-sig').close()
    
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            chunk_size = min(10000, TARGET_PER_LANG)
            num_chunks = min(TARGET_PER_LANG // chunk_size, len(dataset) // chunk_size)
            
            total_lang_samples = 0
            for i in range(num_chunks):
                start_idx = i * chunk_size
                end_idx = min((i + 1) * chunk_size, len(dataset))
                
                chunk_data = dataset[start_idx:end_idx]
                chunk_df = pd.DataFrame(chunk_data)
                filtered_df = chunk_df[chunk_df['text'].apply(filter_text_length)]
                
                if not filtered_df.empty:
                    filtered_df = filtered_df[['text']].assign(language=lang)
                    # Write chunk to file
                    mode = 'a' if i > 0 or all_samples > 0 else 'w'
                    header = i == 0 and all_samples == 0
                    filtered_df.to_csv(output_file, 
                                      mode=mode, 
                                      header=header, 
                                      index=False, 
                                      escapechar='\\', 
                                      encoding='utf-8-sig')
                    
                    samples_added = len(filtered_df)
                    total_lang_samples += samples_added
                    all_samples += samples_added
                
                del chunk_data, chunk_df, filtered_df
                gc.collect()
                
                if total_lang_samples >= TARGET_PER_LANG:
                    break
                    
            print(f"Loaded {total_lang_samples} {lang} samples from OSCAR")
            
        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")
    
    print(f"Saved total of {all_samples} samples from datasets to {output_file}")
    return all_samples

def scrape_from_web(output_file="scraped_corpus.csv"):
    print("Scraping from web...")
    sites = {
        "Hindi": ["https://hindi.bbc.com", "https://www.bhaskar.com"],
        "Marathi": ["https://lokmat.com", "https://maharashtratimes.com"],
        "Sindhi": ["https://awamiawaz.pk"],
        "Gujarati": ["https://divyabhaskar.co.in", "https://sandesh.com"],
        "Bengali": ["https://anandabazar.com", "https://eisamay.com"],
        "Tamil": ["https://dinamalar.com", "https://dailythanthi.com"],
        "Kannada": ["https://prajavani.net", "https://vijaykarnataka.com"]
    }
    
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0
    
    headers = {"User-Agent": "Mozilla/5.0"}
    
    for lang, urls in sites.items():
        lang_texts = []
        max_samples_per_url = TARGET_PER_LANG // len(urls)
        
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                soup = BeautifulSoup(response.content, "html.parser")
                paragraphs = soup.find_all("p")
                
                for p in paragraphs:
                    if len(lang_texts) >= max_samples_per_url:
                        break
                        
                    text = clean(p.get_text(), no_line_breaks=True, no_urls=True, no_emails=True)
                    if filter_text_length(text):
                        lang_texts.append({"text": text, "language": lang})
                        
                # Save immediately if we have enough data
                if len(lang_texts) >= max_samples_per_url:
                    break
                    
            except Exception as e:
                print(f"Error scraping {url}: {e}")
        
        # Save language data
        if lang_texts:
            lang_df = pd.DataFrame(lang_texts)
            mode = 'a' if all_samples > 0 else 'w'
            header = all_samples == 0
            lang_df.to_csv(output_file, 
                          mode=mode, 
                          header=header, 
                          index=False, 
                          escapechar='\\', 
                          encoding='utf-8-sig')
            
            all_samples += len(lang_df)
            print(f"Scraped {lang}: {len(lang_df)} samples")
            
            del lang_df, lang_texts
            gc.collect()
    
    print(f"Saved total of {all_samples} samples from web to {output_file}")
    return all_samples

def extract_from_pdfs(pdf_dir="/kaggle/input/pdddffs/allpdfs", output_file="pdf_corpus.csv"):
    print("Extracting from PDFs...")
    
    if not os.path.exists(pdf_dir):
        print(f"PDF directory {pdf_dir} not found, skipping...")
        return 0
    
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0
    
    # Limit to 10 PDFs max to prevent memory issues
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith(".pdf")][:10]
    
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_corpus = []
        
        try:
            # Try pdfminer for searchable PDFs first
            text = extract_text(pdf_path)
            if not text.strip():  # If empty, use OCR
                text = extract_text_from_pdf(pdf_path, batch_size=5)  # Reduced batch size
            
            if not text:
                continue
            
            # Split into paragraphs
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            
            for para in paragraphs:
                if filter_text_length(para):
                    top_lang, lang_probs, _ = detect_language_unicode(para)
                    if top_lang in LANGUAGES:  # Only include target languages
                        pdf_corpus.append({"text": para, "language": top_lang})
            
            if pdf_corpus:
                pdf_df = pd.DataFrame(pdf_corpus)
                mode = 'a' if all_samples > 0 else 'w'
                header = all_samples == 0
                pdf_df.to_csv(output_file, 
                             mode=mode, 
                             header=header, 
                             index=False, 
                             escapechar='\\', 
                             encoding='utf-8-sig')
                
                all_samples += len(pdf_df)
                print(f"Extracted {len(pdf_df)} samples from {pdf_file}")
                
                del pdf_df, pdf_corpus
                gc.collect()
            
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
        
        text = None
        paragraphs = None
        gc.collect()
    
    print(f"Saved total of {all_samples} samples from PDFs to {output_file}")
    return all_samples

def build_corpus(output_file="indic_corpus.csv"):
    print("Building the corpus...")
    
    # Process each source individually and merge at the end
    dataset_count = load_from_datasets("temp_dataset_corpus.csv")
    gc.collect()  # Force garbage collection
    
    scraped_count = scrape_from_web("temp_scraped_corpus.csv")
    gc.collect()  # Force garbage collection
    
    pdf_count = extract_from_pdfs(output_file="temp_pdf_corpus.csv")
    gc.collect()  # Force garbage collection
    
    if dataset_count + scraped_count + pdf_count > 0:
        print("Merging and balancing corpus...")
        
        open(output_file, 'w', encoding='utf-8-sig').close()
        
        # Process each language separately
        for lang in LANGUAGES:
            lang_samples = []
            
            for source_file in ["temp_dataset_corpus.csv", "temp_scraped_corpus.csv", "temp_pdf_corpus.csv"]:
                if os.path.exists(source_file) and os.path.getsize(source_file) > 0:
                    # Read in chunks
                    chunk_size = 10000
                    for chunk in pd.read_csv(source_file, 
                                            chunksize=chunk_size, 
                                            encoding='utf-8-sig', 
                                            escapechar='\\'):
                        # Filter for current language
                        lang_chunk = chunk[chunk['language'] == lang]
                        if not lang_chunk.empty:
                            lang_samples.append(lang_chunk)
                            
                            # If we have enough samples, stop reading
                            total_samples = sum(len(df) for df in lang_samples)
                            if total_samples >= TARGET_PER_LANG:
                                break
                    
                    gc.collect()
            
            if lang_samples:
                combined_lang = pd.concat(lang_samples)
                available_samples = len(combined_lang)
                
                if available_samples > TARGET_PER_LANG:
                    combined_lang = combined_lang.sample(n=TARGET_PER_LANG, random_state=42)
                
                mode = 'a' if os.path.getsize(output_file) > 0 else 'w'
                header = mode == 'w'
                combined_lang.to_csv(output_file, 
                                    mode=mode, 
                                    header=header, 
                                    index=False, 
                                    escapechar='\\', 
                                    encoding='utf-8-sig')
                
                print(f"Added {len(combined_lang)} {lang} samples to final corpus")
                
                # Clear memory
                del combined_lang, lang_samples
                gc.collect()
            else:
                print(f"No samples found for {lang}")
        
        total_lines = 0
        with open(output_file, 'r', encoding='utf-8-sig') as f:
            for _ in f:
                total_lines += 1
        
        total_lines -= 1  # Subtract header line
        print(f"Final corpus saved to {output_file} with {total_lines} samples")
        
        # Clean up temporary files
        for temp_file in ["temp_dataset_corpus.csv", "temp_scraped_corpus.csv", "temp_pdf_corpus.csv"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)
    else:
        print("No data collected!")

if __name__ == "__main__":
    build_corpus()

In [None]:
df = pd.read_csv("indic_corpus.csv", encoding='utf-8-sig', escapechar='\\')

In [None]:
!apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-mar tesseract-ocr-ben tesseract-ocr-tam tesseract-ocr-kan tesseract-ocr-eng tesseract-ocr-urd tesseract-ocr-chi-sim

In [None]:
!pip install pdf2image pytesseract opencv-python pillow clean-text langdetect datasets requests beautifulsoup4

In [None]:
!pip install pdfminer pdf2image pytesseract langdetect

In [None]:
!pip install pdfminer.six==20231228


In [None]:
!pip install cleantext

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datasets import load_dataset
try:
    from pdfminer.high_level import extract_text
except ImportError:
    def extract_text(pdf_path):
        print(f"Using fallback extraction for {pdf_path} due to pdfminer import issue")
        return ""
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
from cleantext import clean
import gc
from langdetect import detect_langs
import concurrent.futures
import time

LANGUAGES = [
    "Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada",
    "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"
]
TARGET_PER_LANG = 250000  # Reduced for faster runtime
MIN_WORDS = 50

def filter_text_length(text):
    if not isinstance(text, str):
        return False
    return len(text.split()) >= MIN_WORDS

def preprocess_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    kernel = np.ones((1, 1), np.uint8)
    return Image.fromarray(cv2.dilate(binary, kernel, iterations=1))

def extract_text_from_pdf(pdf_path, batch_size=5):
    try:
        print(f"Processing PDF: {pdf_path}")
        start_time = time.time()
        images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=10)  # Reduced DPI and pages
        text = ""
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i + batch_size]
            for image in batch_images:
                processed_image = preprocess_image(image)
                text += pytesseract.image_to_string(
                    processed_image,
                    lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi',
                    config="--psm 6 --oem 1"
                ).replace('\0', '')[:100000] + "\n"  # Clean NULL, truncate
            del batch_images
            gc.collect()
        print(f"Finished {pdf_path} in {time.time() - start_time:.2f}s")
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def detect_language_unicode(text):
    lang_counts = {
        'Hindi': sum(1 for char in text if '\u0900' <= char <= '\u097F'),
        'Marathi': sum(1 for char in text if '\u0900' <= char <= '\u097F'),
        'Sindhi': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),
        'Gujarati': sum(1 for char in text if '\u0A80' <= char <= '\u0AFF'),
        'Bengali': sum(1 for char in text if '\u0980' <= char <= '\u09FF'),
        'Tamil': sum(1 for char in text if '\u0B80' <= char <= '\u0BFF'),
        'Kannada': sum(1 for char in text if '\u0C80' <= char <= '\u0CFF'),
        'Telugu': sum(1 for char in text if '\u0C00' <= char <= '\u0C7F'),
        'Malayalam': sum(1 for char in text if '\u0D00' <= char <= '\u0D7F'),
        'Punjabi': sum(1 for char in text if '\u0A00' <= char <= '\u0A7F'),
        'Odia': sum(1 for char in text if '\u0B00' <= char <= '\u0B7F'),
        'Assamese': sum(1 for char in text if '\u0980' <= char <= '\u09FF'),
        'Urdu': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),
        'Chinese': sum(1 for char in text if '\u4E00' <= char <= '\u9FFF'),
        'English': sum(1 for char in text if '\u0041' <= char <= '\u007A' or '\u0061' <= char <= '\u007A')
    }
    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None, {}, {}
    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    top_lang = max(lang_probs, key=lang_probs.get)
    if top_lang in LANGUAGES:
        return top_lang, lang_probs, {top_lang: text}
    return None, lang_probs, {}

def save_df_in_chunks(df, output_file, chunk_size=5000):
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i + chunk_size]
        mode = 'w' if i == 0 else 'a'
        header = i == 0
        chunk.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
        del chunk
        gc.collect()

def load_from_datasets(output_file="dataset_corpus.csv"):
    print("Loading from public datasets...")
    oscar_langs = {
        "hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali",
        "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam",
        "pa": "Punjabi", "or": "Odia", "as": "Assamese"
    }
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0
    
    for code, lang in oscar_langs.items():
        try:
            print(f"Loading OSCAR for {lang}...")
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            chunk_size = 5000
            num_chunks = min(TARGET_PER_LANG // chunk_size, len(dataset) // chunk_size)
            total_lang_samples = 0
            
            for i in range(num_chunks):
                chunk_data = dataset[i * chunk_size:(i + 1) * chunk_size]
                chunk_df = pd.DataFrame(chunk_data)
                chunk_df['text'] = chunk_df['text'].apply(lambda x: x.replace('\0', '')[:100000] if isinstance(x, str) else x)
                filtered_df = chunk_df[chunk_df['text'].apply(filter_text_length)]
                if not filtered_df.empty:
                    filtered_df = filtered_df[['text']].assign(language=lang)
                    mode = 'a' if all_samples > 0 else 'w'
                    header = all_samples == 0
                    filtered_df.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
                    total_lang_samples += len(filtered_df)
                    all_samples += len(filtered_df)
                del chunk_data, chunk_df, filtered_df
                gc.collect()
                if total_lang_samples >= TARGET_PER_LANG:
                    break
            print(f"Loaded {total_lang_samples} {lang} samples from OSCAR")
        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")
    print(f"Saved total of {all_samples} samples from datasets to {output_file}")
    return all_samples

def scrape_url(url, lang, max_samples):
    headers = {"User-Agent": "Mozilla/5.0"}
    lang_texts = []
    try:
        response = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        for p in paragraphs:
            if len(lang_texts) >= max_samples:
                break
            text = clean(p.get_text(), no_line_breaks=True, no_urls=True, no_emails=True).replace('\0', '')[:100000]
            if filter_text_length(text):
                lang_texts.append({"text": text, "language": lang})
        print(f"Scraped {url}: {len(lang_texts)} samples")
    except Exception as e:
        print(f"Error scraping {url}: {e}")
    return lang_texts

def scrape_from_web(output_file="scraped_corpus.csv"):
    print("Scraping from web...")
    sites = {
        "Hindi": ["https://hindi.bbc.com", "https://www.bhaskar.com"],
        "Marathi": ["https://lokmat.com", "https://maharashtratimes.com"],
        "Sindhi": ["https://awamiawaz.pk"],
        "Gujarati": ["https://divyabhaskar.co.in", "https://sandesh.com"],
        "Bengali": ["https://anandabazar.com", "https://eisamay.com"],
        "Tamil": ["https://dinamalar.com", "https://dailythanthi.com"],
        "Kannada": ["https://prajavani.net", "https://vijaykarnataka.com"],
        "Telugu": ["https://eenadu.net", "https://sakshi.com"],
        "Malayalam": ["https://mathrubhumi.com", "https://www.manoramaonline.com"],
        "Punjabi": ["https://punjabitribuneonline.com", "https://www.ajitjalandhar.com"],
        "Odia": ["https://sambad.in", "https://dharitri.com"],
        "Assamese": ["https://asomiyapratidin.in", "https://pratidintime.com"]
    }
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        for lang, urls in sites.items():
            max_samples_per_url = TARGET_PER_LANG // len(urls)
            future_to_url = {executor.submit(scrape_url, url, lang, max_samples_per_url): url for url in urls}
            lang_texts = []
            for future in concurrent.futures.as_completed(future_to_url):
                lang_texts.extend(future.result())
            if lang_texts:
                lang_df = pd.DataFrame(lang_texts)
                mode = 'a' if all_samples > 0 else 'w'
                header = all_samples == 0
                lang_df.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
                all_samples += len(lang_df)
                print(f"Scraped {lang}: {len(lang_df)} samples")
                del lang_df, lang_texts
                gc.collect()
    print(f"Saved total of {all_samples} samples from web to {output_file}")
    return all_samples

def extract_from_pdfs(pdf_dir="/kaggle/input/pdddffs/allpdfs", output_file="pdf_corpus.csv"):
    print("Extracting from PDFs...")
    if not os.path.exists(pdf_dir):
        print(f"PDF directory {pdf_dir} not found, skipping...")
        return 0
    open(output_file, 'w', encoding='utf-8-sig').close()
    all_samples = 0
    pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith(".pdf")][:5]  # Limit to 5 PDFs
    
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_dir, pdf_file)
        pdf_corpus = []
        try:
            # Modified to handle pdfminer error more gracefully
            try:
                text = extract_text(pdf_path)
            except Exception as e:
                print(f"Primary PDF extraction failed: {e}, using backup method")
                text = ""
                
            if not text.strip():
                text = extract_text_from_pdf(pdf_path)
            if not text:
                continue
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            for para in paragraphs:
                if filter_text_length(para):
                    top_lang, _, _ = detect_language_unicode(para)
                    if top_lang in LANGUAGES:
                        pdf_corpus.append({"text": para, "language": top_lang})
            if pdf_corpus:
                pdf_df = pd.DataFrame(pdf_corpus)
                mode = 'a' if all_samples > 0 else 'w'
                header = all_samples == 0
                pdf_df.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
                all_samples += len(pdf_df)
                print(f"Extracted {len(pdf_df)} samples from {pdf_file}")
                del pdf_df, pdf_corpus
                gc.collect()
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
    print(f"Saved total of {all_samples} samples from PDFs to {output_file}")
    return all_samples

def build_corpus(output_file="indic_corpus.csv"):
    print("Building the corpus...")
    dataset_count = load_from_datasets("temp_dataset_corpus.csv")
    gc.collect()
    scraped_count = scrape_from_web("temp_scraped_corpus.csv")
    gc.collect()
    pdf_count = extract_from_pdfs(output_file="temp_pdf_corpus.csv")
    gc.collect()
    
    if dataset_count + scraped_count + pdf_count > 0:
        print("Merging and balancing corpus...")
        open(output_file, 'w', encoding='utf-8-sig').close()
        for lang in LANGUAGES:
            lang_samples = []
            for source_file in ["temp_dataset_corpus.csv", "temp_scraped_corpus.csv", "temp_pdf_corpus.csv"]:
                if os.path.exists(source_file) and os.path.getsize(source_file) > 0:
                    for chunk in pd.read_csv(source_file, chunksize=5000, encoding='utf-8-sig', escapechar='\\'):
                        lang_chunk = chunk[chunk['language'] == lang]
                        if not lang_chunk.empty:
                            lang_samples.append(lang_chunk)
                            if sum(len(df) for df in lang_samples) >= TARGET_PER_LANG:
                                break
                    gc.collect()
            if lang_samples:
                combined_lang = pd.concat(lang_samples)
                if len(combined_lang) > TARGET_PER_LANG:
                    combined_lang = combined_lang.sample(n=TARGET_PER_LANG, random_state=42)
                mode = 'a' if os.path.getsize(output_file) > 0 else 'w'
                header = mode == 'w'
                combined_lang.to_csv(output_file, mode=mode, header=header, index=False, escapechar='\\', encoding='utf-8-sig')
                print(f"Added {len(combined_lang)} {lang} samples to final corpus")
                del combined_lang, lang_samples
                gc.collect()
        total_lines = sum(1 for _ in open(output_file, 'r', encoding='utf-8-sig')) - 1  # Subtract header
        print(f"Final corpus saved to {output_file} with {total_lines} samples")
        for temp_file in ["temp_dataset_corpus.csv", "temp_scraped_corpus.csv", "temp_pdf_corpus.csv"]:
            if os.path.exists(temp_file):
                os.remove(temp_file)
    else:
        print("No data collected!")

if __name__ == "__main__":
    build_corpus()

In [None]:
import gzip
import pandas as pd

# Decompress and clean NULL bytes
with gzip.open('/kaggle/working/dataset_corpus.csv.gz', 'rb') as f_in:
    with open('/kaggle/working/dataset_corpus_clean.csv', 'wb') as f_out:
        # Read raw bytes, replace NULL bytes
        data = f_in.read().replace(b'\0', b'')
        f_out.write(data)

# Load cleaned CSV
df = pd.read_csv('/kaggle/working/dataset_corpus_clean.csv', engine='python', escapechar='\\')
print(f"OSCAR samples: {len(df)}")
scraped_df = scrape_from_web()
pdf_df = extract_from_pdfs()

In [None]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
import gc
import time
from multiprocessing import Pool, cpu_count
from datasets import load_dataset
import parquet
import gzip

# Initialize Spark
spark = SparkSession.builder \
    .appName("IndicCorpus") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

LANGUAGES = [
    "Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada",
    "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"
]
TARGET_PER_LANG = 416667  # ~5M total / 12
MIN_WORDS = 50

def filter_text_length(text):
    if not isinstance(text, str):
        return False
    return len(text.split()) >= MIN_WORDS

def preprocess_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    kernel = np.ones((1, 1), np.uint8)
    return Image.fromarray(cv2.dilate(binary, kernel, iterations=1))

def extract_text_from_pdf(pdf_path):
    try:
        print(f"Processing PDF: {pdf_path}")
        start_time = time.time()
        text = extract_text(pdf_path)
        if not text.strip():
            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=50)
            text = ""
            for image in images:
                processed_image = preprocess_image(image)
                text += pytesseract.image_to_string(
                    processed_image,
                    lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi',
                    config="--psm 6 --oem 1"
                ).replace('\0', '')[:100000] + "\n"
                del processed_image
                gc.collect()
        print(f"Finished {pdf_path} in {time.time() - start_time:.2f}s")
        return text.strip(), os.path.basename(pdf_path).split('_')[0]  # Assume filename like "Odia_1.pdf"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return "", ""

def detect_language_unicode(text):
    lang_counts = {
        'Hindi': sum(1 for char in text if '\u0900' <= char <= '\u097F'),
        'Marathi': sum(1 for char in text if '\u0900' <= char <= '\u097F'),
        'Sindhi': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),
        'Gujarati': sum(1 for char in text if '\u0A80' <= char <= '\u0AFF'),
        'Bengali': sum(1 for char in text if '\u0980' <= char <= '\u09FF'),
        'Tamil': sum(1 for char in text if '\u0B80' <= char <= '\u0BFF'),
        'Kannada': sum(1 for char in text if '\u0C80' <= char <= '\u0CFF'),
        'Telugu': sum(1 for char in text if '\u0C00' <= char <= '\u0C7F'),
        'Malayalam': sum(1 for char in text if '\u0D00' <= char <= '\u0D7F'),
        'Punjabi': sum(1 for char in text if '\u0A00' <= char <= '\u0A7F'),
        'Odia': sum(1 for char in text if '\u0B00' <= char <= '\u0B7F'),
        'Assamese': sum(1 for char in text if '\u0980' <= char <= '\u09FF'),
        'Urdu': sum(1 for char in text if '\u0600' <= char <= '\u06FF'),
        'Chinese': sum(1 for char in text if '\u4E00' <= char <= '\u9FFF'),
        'English': sum(1 for char in text if '\u0041' <= char <= '\u007A' or '\u0061' <= char <= '\u007A')
    }
    total_chars = sum(lang_counts.values())
    if total_chars == 0:
        return None
    lang_probs = {lang: count / total_chars for lang, count in lang_counts.items()}
    top_lang = max(lang_probs, key=lang_probs.get)
    return top_lang if top_lang in LANGUAGES else None

def process_pdf(pdf_path):
    text, hinted_lang = extract_text_from_pdf(pdf_path)
    if not text:
        return []
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    corpus = []
    for para in paragraphs:
        if filter_text_length(para):
            detected_lang = detect_language_unicode(para)
            lang = hinted_lang if hinted_lang in LANGUAGES else detected_lang
            if lang in LANGUAGES:
                corpus.append({"text": para, "language": lang})
    return corpus

def load_oscar():
    print("Loading OSCAR data...")
    oscar_langs = {
        "hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali",
        "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam",
        "pa": "Punjabi", "or": "Odia", "as": "Assamese"
    }
    oscar_data = []
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            for i in range(min(TARGET_PER_LANG * 2, len(dataset))):  # Oversample, filter later
                text = dataset[i]['text'].replace('\0', '')[:100000]
                if filter_text_length(text):
                    oscar_data.append({"text": text, "language": lang})
        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")
    df = pd.DataFrame(oscar_data)
    df.to_parquet("/kaggle/working/temp_oscar.parquet", compression='gzip')
    print(f"Saved {len(df)} OSCAR samples")
    return len(df)

def load_samanantar():
    print("Loading Samanantar data...")
    samanantar_langs = {
        "hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali",
        "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam",
        "pa": "Punjabi", "or": "Odia", "as": "Assamese"
    }
    samanantar_data = []
    try:
        dataset = load_dataset("togethercomputer/samanantar", split="train", trust_remote_code=True)
        for row in dataset:
            for lang_code, lang_name in samanantar_langs.items():
                text = row.get(lang_code, "").replace('\0', '')[:100000]
                if text and filter_text_length(text):
                    samanantar_data.append({"text": text, "language": lang_name})
    except Exception as e:
        print(f"Error loading Samanantar: {e}")
    df = pd.DataFrame(samanantar_data)
    df.to_parquet("/kaggle/working/temp_samanantar.parquet", compression='gzip')
    print(f"Saved {len(df)} Samanantar samples")
    return len(df)

def process_pdfs_parallel(pdf_dir="/kaggle/input/meta-folder"):
    print("Processing PDFs in parallel...")
    pdf_files = []
    for lang_folder in os.listdir(pdf_dir):
        lang_path = os.path.join(pdf_dir, lang_folder)
        if os.path.isdir(lang_path):
            for pdf_file in os.listdir(lang_path):
                if pdf_file.endswith(".pdf"):
                    pdf_files.append(os.path.join(lang_path, pdf_file))
    
    with Pool(cpu_count()) as pool:
        results = pool.map(process_pdf, pdf_files)
    
    pdf_corpus = [item for sublist in results for item in sublist]
    df = pd.DataFrame(pdf_corpus)
    df.to_parquet("/kaggle/working/temp_pdf.parquet", compression='gzip')
    print(f"Saved {len(df)} PDF samples")
    return len(df)

def build_corpus(pdf_dir="/kaggle/input/meta-folder"):
    print("Building corpus...")
    for f in os.listdir("/kaggle/working"):
        if f.endswith((".parquet", ".gz")):
            os.remove(os.path.join("/kaggle/working", f))
    
    pdf_count = process_pdfs_parallel(pdf_dir)
    oscar_count = load_oscar()
    samanantar_count = load_samanantar()
    
    spark_df = None
    for source_file in ["/kaggle/working/temp_pdf.parquet", "/kaggle/working/temp_oscar.parquet", "/kaggle/working/temp_samanantar.parquet"]:
        if os.path.exists(source_file):
            temp_df = spark.read.parquet(source_file)
            spark_df = temp_df if spark_df is None else spark_df.union(temp_df)
    
    if spark_df:
        spark_df = spark_df.dropDuplicates(["text"])
        balanced_df = spark_df.groupBy("language").agg(F.count("*").alias("count")) \
            .filter(F.col("language").isin(LANGUAGES)) \
            .join(spark_df, "language", "inner") \
            .orderBy(F.rand()) \
            .limit(TARGET_PER_LANG) \
            .select("text", "language")
        
        balanced_df.write.parquet("/kaggle/working/indic_corpus.parquet", mode="overwrite", compression="gzip")
        total_samples = balanced_df.count()
        print(f"Saved {total_samples} samples to /kaggle/working/indic_corpus.parquet")
        
        dist = balanced_df.groupBy("language").count().collect()
        for row in dist:
            print(f"{row['language']}: {row['count']} samples")
        if total_samples < TARGET_PER_LANG * len(LANGUAGES):
            print(f"Warning: Only {total_samples} samples collected, below 5M target!")
    
    for f in ["temp_pdf.parquet", "temp_oscar.parquet", "temp_samanantar.parquet"]:
        if os.path.exists(f"/kaggle/working/{f}"):
            os.remove(f"/kaggle/working/{f}")

if __name__ == "__main__":
    build_corpus()

In [1]:
!pip install pdf2image pytesseract opencv-python pillow pdfminer.six datasets pyarrow pyspark fasttext
!apt-get install -y tesseract-ocr tesseract-ocr-hin tesseract-ocr-mar tesseract-ocr-ben tesseract-ocr-tam tesseract-ocr-kan tesseract-ocr-tel tesseract-ocr-mal tesseract-ocr-pan tesseract-ocr-ori tesseract-ocr-asm tesseract-ocr-eng tesseract-ocr-urd tesseract-ocr-chi-sim

Collecting pdfminer.six
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250327
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-eng is already the newest version (1:4.00~git30-7274cfa-1.1).
tesseract-ocr-eng set to manually installed.
The following NEW packages will be installed:
  tesseract-ocr-asm tesseract-ocr-ben tesseract-ocr-chi-sim tesseract-ocr-hin tesseract-ocr-kan
  tesseract-ocr-mal tesseract-ocr-mar tesseract-ocr-ori tesseract-ocr-pan tesseract-ocr-tam
  tesseract-ocr-tel tesseract-ocr-urd
0 upgraded, 12 newly installed, 0 to remove and 129 not upgrad

In [None]:
# corpus_construction.py
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import cv2
import numpy as np
import torch
import os
import gc
from multiprocessing import Pool, cpu_count
from datasets import load_dataset
import fasttext

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize Spark
spark = SparkSession.builder.appName("IndicCorpus").config("spark.executor.memory", "8g").getOrCreate()

LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada", "Telugu", "Malayalam", "Punjabi", "Odia", "Assamese"]
TARGET_PER_LANG = 500000  # ~6M total
MIN_WORDS = 50
BATCH_SIZE = 10  # Process 10 images at a time on GPU

# FastText path
FASTTEXT_PATH = "/kaggle/input/fasttextefficienttextclassification/lid.176.bin"
ft_model = fasttext.load_model(FASTTEXT_PATH)

def filter_text_length(text):
    return isinstance(text, str) and len(text.split()) >= MIN_WORDS

def preprocess_image_cpu(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return Image.fromarray(cv2.dilate(binary, np.ones((1, 1), np.uint8), iterations=1))

def preprocess_image_gpu(images):
    try:
        # Resize images to fixed size (1024x1024) and convert to numpy
        resized_images = [image.resize((1024, 1024), Image.Resampling.LANCZOS) for image in images]
        image_arrays = [np.array(img) for img in resized_images]
        
        # Process in batches to avoid memory overload
        processed_images = []
        for i in range(0, len(image_arrays), BATCH_SIZE):
            batch = image_arrays[i:i + BATCH_SIZE]
            tensor_images = torch.tensor(np.stack(batch), dtype=torch.uint8).to(device)
            
            # Grayscale on GPU
            gray_images = 0.299 * tensor_images[:, :, :, 0] + 0.587 * tensor_images[:, :, :, 1] + 0.114 * tensor_images[:, :, :, 2]
            
            # Thresholding on GPU
            binary_images = (gray_images > 150).type(torch.uint8) * 255
            
            # Dilation on GPU
            kernel = torch.ones((1, 1, 1, 1), dtype=torch.uint8).to(device)
            dilated_images = torch.nn.functional.conv2d(binary_images.unsqueeze(1), kernel, padding=0).squeeze(1)
            
            # Back to CPU
            processed_images.extend([Image.fromarray(img.cpu().numpy()) for img in dilated_images])
        
        return processed_images
    except Exception as e:
        print(f"GPU preprocessing failed: {e}. Falling back to CPU.")
        return [preprocess_image_cpu(img) for img in images]

def extract_text_from_pdf(pdf_path):
    try:
        print(f"Processing PDF: {pdf_path}")
        text = extract_text(pdf_path)
        if not text.strip():
            images = convert_from_path(pdf_path, dpi=150, first_page=1, last_page=50)
            processed_images = preprocess_image_gpu(images) if torch.cuda.is_available() else [preprocess_image_cpu(img) for img in images]
            text = "".join([pytesseract.image_to_string(img, lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi', config="--psm 6 --oem 1").replace('\0', '')[:100000] + "\n" for img in processed_images])
            del processed_images, images
            torch.cuda.empty_cache()  # Clear GPU memory
            gc.collect()
        hinted_lang = os.path.basename(os.path.dirname(pdf_path))
        return text.strip(), hinted_lang if hinted_lang in LANGUAGES else None
    except Exception as e:
        print(f"Error: {pdf_path}: {e}")
        return "", None

def detect_language(text, hinted_lang=None):
    if hinted_lang in LANGUAGES:
        return hinted_lang
    sindhi_chars = sum(1 for char in text if '\u0600' <= char <= '\u06FF')
    total_chars = len(text)
    if total_chars > 0 and sindhi_chars / total_chars > 0.5:
        return "Sindhi"
    text_no_newlines = text.replace('\n', ' ')
    lang = ft_model.predict(text_no_newlines)[0][0].replace('__label__', '')
    return {'hi': 'Hindi', 'mr': 'Marathi', 'gu': 'Gujarati', 'bn': 'Bengali', 'ta': 'Tamil', 'kn': 'Kannada', 'te': 'Telugu', 'ml': 'Malayalam', 'pa': 'Punjabi', 'or': 'Odia', 'as': 'Assamese'}.get(lang, None)

def process_pdf(pdf_path):
    text, hinted_lang = extract_text_from_pdf(pdf_path)
    if not text:
        return []
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    return [{"text": para, "language": detect_language(para, hinted_lang)} for para in paragraphs if filter_text_length(para)]

def process_pdfs_parallel(pdf_dir="/kaggle/input/indic-data-corpuss3/pdfs data corpus"):
    pdf_files = [os.path.join(lang_path, f) for lang_folder in os.listdir(pdf_dir) if os.path.isdir(lang_path := os.path.join(pdf_dir, lang_folder)) for f in os.listdir(lang_path) if f.endswith(".pdf")]
    with Pool(cpu_count()) as pool:
        results = pool.map(process_pdf, pdf_files)
    df = pd.DataFrame([item for sublist in results for item in sublist])
    df.to_parquet("/kaggle/working/temp_pdf.parquet", compression='gzip')
    return len(df)

def load_oscar():
    oscar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    oscar_data = []
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train", trust_remote_code=True)
            oscar_data.extend({"text": item['text'].replace('\0', '')[:100000], "language": lang} for item in dataset if filter_text_length(item['text']))
        except Exception as e:
            print(f"Error OSCAR {lang}: {e}")
    df = pd.DataFrame(oscar_data).sample(min(3000000, len(oscar_data)))
    df.to_parquet("/kaggle/working/temp_oscar.parquet", compression='gzip')
    return len(df)

def load_samanantar():
    samanantar_langs = {"hi": "Hindi", "mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "ta": "Tamil", "kn": "Kannada", "te": "Telugu", "ml": "Malayalam", "pa": "Punjabi", "or": "Odia", "as": "Assamese"}
    dataset = load_dataset("togethercomputer/samanantar", split="train", trust_remote_code=True)
    samanantar_data = [{"text": row[lang_code].replace('\0', '')[:100000], "language": lang_name} for row in dataset for lang_code, lang_name in samanantar_langs.items() if row.get(lang_code) and filter_text_length(row[lang_code])]
    df = pd.DataFrame(samanantar_data).sample(min(2000000, len(samanantar_data)))
    df.to_parquet("/kaggle/working/temp_samanantar.parquet", compression='gzip')
    return len(df)

def build_corpus(pdf_dir="/kaggle/input/indic-data-corpuss3/pdfs data corpus", output_file="/kaggle/working/indic_corpus.parquet"):
    temp_files = ["/kaggle/working/temp_pdf.parquet", "/kaggle/working/temp_oscar.parquet", "/kaggle/working/temp_samanantar.parquet"]
    for f in temp_files:
        if os.path.exists(f):
            os.remove(f)
    
    pdf_count = process_pdfs_parallel(pdf_dir)
    oscar_count = load_oscar()
    samanantar_count = load_samanantar()
    
    if os.path.exists(output_file):
        existing_df = spark.read.parquet(output_file)
        new_df = spark.read.parquet(*temp_files)
        combined_df = existing_df.union(new_df).dropDuplicates(["text"])
    else:
        combined_df = spark.read.parquet(*temp_files).dropDuplicates(["text"])
    
    balanced_df = combined_df.groupBy("language").agg(F.count("*").alias("count")).filter(F.col("language").isin(LANGUAGES)).join(combined_df, "language").orderBy(F.rand()).limit(TARGET_PER_LANG).select("text", "language")
    balanced_df.write.parquet(output_file, mode="overwrite", compression="gzip")
    total_samples = balanced_df.count()
    print(f"Saved {total_samples} samples to {output_file}")
    dist = balanced_df.groupBy("language").count().collect()
    for row in dist:
        print(f"{row['language']}: {row['count']}")
    
    for f in temp_files:
        if os.path.exists(f):
            os.remove(f)
    
    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Output size: {size_mb:.2f} MB")
    if size_mb > 19000:
        print("Warning: File exceeds 19GB, may not fit Kaggle limit!")

if __name__ == "__main__":
    build_corpus()

Using device: cuda
Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/Sindhi/Sindhi POetry.pdfProcessing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/Sindhi/unset0000unse_i8n3.pdfProcessing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/Sindhi/choondsindhikaha0000unse.pdfProcessing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/Sindhi/pgsl.31053.bestsindhisabham0000hola.pdf



Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/Sindhi/choondsindhikaha0000bhag.pdf
Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/odia/janhamamu_oriya_1980_october_chandamama.pdf
Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/odia/67dddf0db8712.pdf
Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/odia/Chha-Mana-Atha-Guntha_FakirMohanSenapati_www.OdiaBooks.com.pdf
Processing PDF: /kaggle/input/indic-data-corpuss3/pdfs data corpus/odia/Raskel.pdf
Processing PDF: /kaggle/input/indic-data-corpuss

Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
  File "<ipython-input-1-41a949784af2>", line 101, in process_pdf
    text, hinted_lang = extract_text_from_pdf(pdf_path)
Process ForkPoolWorker-2:
  File "<ipython-input-1-41a949784af2>", line 79, in extract_text_from_pdf
    text = "".join([pytesseract.image_to_string(img, lang='hin+mar+ben+tam+kan+tel+mal+pan+ori+asm+eng+urd+chi', config="--psm 6 --oem 1").replace('\0', '')[:100000] + "\n" for img in processed_images])
  File "<ipython-input-1-41a949784af2>", line 79, in <listcomp>
    text = "".jo

In [3]:
!apt-get update && apt-get install -y poppler-utils tesseract-ocr

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                                              
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]                             
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]                
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]                           
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]                                
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [69.9 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,239 kB]
Get:11 https://ppa.launchpadcontent.net/graphics-drivers/