In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install pandas requests beautifulsoup4 scrapy datasets pdfminer.six clean-text

Collecting scrapy
  Downloading Scrapy-2.12.0-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting Twisted>=21.7.0 (from scrapy)
  Downloading twisted-24.11.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.3.1-py3

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import scrapy
from scrapy.crawler import CrawlerProcess
from datasets import load_dataset
from pdfminer.high_level import extract_text
import os
import re
from cleantext import clean

# Languages and target sample size per language
LANGUAGES = ["Hindi", "Marathi", "Sindhi", "Gujarati", "Bengali", "Tamil", "Kannada"]
TARGET_PER_LANG = 750000  # Aiming for ~5.25 million total
MIN_WORDS = 50

# Function to filter text by word count
def filter_text_length(text):
    if not isinstance(text, str):  # Handle non-string inputs
        return False
    words = text.split()
    return len(words) >= MIN_WORDS

# Function 1: Load from Public Datasets
def load_from_datasets(output_file="dataset_corpus.csv"):
    print("Loading from public datasets...")
    corpus = []
    
    # Try IndicCorp (corrected name or fallback)
    try:
        # Note: IndicCorp might need manual download from AI4Bharat; using OSCAR as a fallback
        dataset = load_dataset("oscar", "unshuffled_deduplicated_hi", split="train")  # Hindi example
        df = pd.DataFrame(dataset)
        filtered_df = df[df['text'].apply(filter_text_length)].sample(n=min(TARGET_PER_LANG, len(df)), random_state=42)
        filtered_df = filtered_df[['text']].assign(language="Hindi")
        corpus.append(filtered_df)
        print("Loaded Hindi from OSCAR")
    except Exception as e:
        print(f"Error loading OSCAR for Hindi: {e}")
    
    # Add other languages from OSCAR (example for Tamil)
    try:
        dataset = load_dataset("oscar", "unshuffled_deduplicated_ta", split="train")
        df = pd.DataFrame(dataset)
        filtered_df = df[df['text'].apply(filter_text_length)].sample(n=min(TARGET_PER_LANG, len(df)), random_state=42)
        filtered_df = filtered_df[['text']].assign(language="Tamil")
        corpus.append(filtered_df)
        print("Loaded Tamil from OSCAR")
    except Exception as e:
        print(f"Error loading OSCAR for Tamil: {e}")

    # Manually add mappings for others if OSCAR works (Marathi: mr, Gujarati: gu, Bengali: bn, Kannada: kn)
    oscar_langs = {"mr": "Marathi", "gu": "Gujarati", "bn": "Bengali", "kn": "Kannada"}
    for code, lang in oscar_langs.items():
        try:
            dataset = load_dataset("oscar", f"unshuffled_deduplicated_{code}", split="train")
            df = pd.DataFrame(dataset)
            filtered_df = df[df['text'].apply(filter_text_length)].sample(n=min(TARGET_PER_LANG, len(df)), random_state=42)
            filtered_df = filtered_df[['text']].assign(language=lang)
            corpus.append(filtered_df)
            print(f"Loaded {lang} from OSCAR")
        except Exception as e:
            print(f"Error loading OSCAR for {lang}: {e}")

    # Sindhi might not be in OSCAR easily; skip or source elsewhere
    
    # Combine and save
    if corpus:
        combined_df = pd.concat(corpus).drop_duplicates(subset="text")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved {len(combined_df)} samples from datasets to {output_file}")
        return combined_df
    else:
        print("No datasets loaded successfully.")
        return pd.DataFrame(columns=["text", "language"])  # Return empty DataFrame

# Function 2: Web Scraping
def scrape_from_web(output_file="scraped_corpus.csv"):
    print("Scraping from web...")
    sites = {
        "Hindi": ["https://hindi.bbc.com", "https://www.bhaskar.com"],
        "Marathi": ["https://lokmat.com", "https://maharashtratimes.com"],
        "Sindhi": ["https://awamiawaz.pk"],
        "Gujarati": ["https://divyabhaskar.co.in", "https://sandesh.com"],
        "Bengali": ["https://anandabazar.com", "https://eisamay.com"],
        "Tamil": ["https://dinamalar.com", "https://dailythanthi.com"],
        "Kannada": ["https://prajavani.net", "https://vijaykarnataka.com"]
    }
    
    corpus = []
    headers = {"User-Agent": "Mozilla/5.0"}
    
    for lang, urls in sites.items():
        lang_texts = []
        for url in urls:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                soup = BeautifulSoup(response.content, "html.parser")
                paragraphs = soup.find_all("p")
                for p in paragraphs:
                    text = clean(p.get_text(), no_line_breaks=True, no_urls=True, no_emails=True)
                    if filter_text_length(text):
                        lang_texts.append({"text": text, "language": lang})
            except Exception as e:
                print(f"Error scraping {url}: {e}")
        
        lang_df = pd.DataFrame(lang_texts).sample(n=min(TARGET_PER_LANG, len(lang_texts)), random_state=42)
        corpus.append(lang_df)
    
    if corpus:
        combined_df = pd.concat(corpus).drop_duplicates(subset="text")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved {len(combined_df)} samples from web to {output_file}")
        return combined_df
    return pd.DataFrame(columns=["text", "language"])

# Function 3: OCR from PDFs
def extract_from_pdfs(pdf_dir="pdfs", output_file="pdf_corpus.csv"):
    print("Extracting from PDFs...")
    corpus = []
    
    for lang in LANGUAGES:
        lang_texts = []
        lang_dir = os.path.join(pdf_dir, lang.lower())
        if not os.path.exists(lang_dir):
            print(f"No PDFs found for {lang}, skipping...")
            continue
        
        for pdf_file in os.listdir(lang_dir):
            if pdf_file.endswith(".pdf"):
                try:
                    text = extract_text(os.path.join(lang_dir, pdf_file))
                    paragraphs = text.split("\n\n")
                    for para in paragraphs:
                        cleaned_text = clean(para, no_line_breaks=True, no_urls=True, no_emails=True)
                        if filter_text_length(cleaned_text):
                            lang_texts.append({"text": cleaned_text, "language": lang})
                except Exception as e:
                    print(f"Error processing {pdf_file}: {e}")
        
        lang_df = pd.DataFrame(lang_texts).sample(n=min(TARGET_PER_LANG, len(lang_texts)), random_state=42)
        corpus.append(lang_df)
    
    if corpus:
        combined_df = pd.concat(corpus).drop_duplicates(subset="text")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved {len(combined_df)} samples from PDFs to {output_file}")
        return combined_df
    return pd.DataFrame(columns=["text", "language"])

# Main function to build the corpus
def build_corpus(output_file="indic_corpus.csv"):
    print("Building the corpus...")
    
    dataset_df = load_from_datasets()
    scraped_df = scrape_from_web()
    pdf_df = extract_from_pdfs()
    
    all_dfs = [df for df in [dataset_df, scraped_df, pdf_df] if not df.empty]
    if not all_dfs:
        print("No data collected!")
        return
    
    combined_df = pd.concat(all_dfs).drop_duplicates(subset="text")
    
    balanced_corpus = []
    for lang in LANGUAGES:
        lang_df = combined_df[combined_df['language'] == lang]
        sampled_df = lang_df.sample(n=min(TARGET_PER_LANG, len(lang_df)), random_state=42)
        balanced_corpus.append(sampled_df)
    
    final_df = pd.concat(balanced_corpus).sample(frac=1, random_state=42)
    
    if len(final_df) < 5000000:
        print(f"Warning: Only {len(final_df)} samples collected, below 5 million!")
    else:
        print(f"Success: Collected {len(final_df)} samples!")
    
    final_df.to_csv(output_file, index=False)
    print(f"Corpus saved to {output_file}")

if __name__ == "__main__":
    build_corpus()

Building the corpus...
Loading from public datasets...


README.md:   0%|          | 0.00/303k [00:00<?, ?B/s]

oscar.py:   0%|          | 0.00/14.8k [00:00<?, ?B/s]

The repository for oscar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/oscar.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/410 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/407M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/409M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/406M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/407M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/380M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1909387 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Loaded Hindi from OSCAR


Downloading data:   0%|          | 0.00/246 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/342M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/343M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/285M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/833101 [00:00<?, ? examples/s]

Error loading OSCAR for Tamil: Cannot take a larger sample than population when 'replace=False'


Downloading data:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/300M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/212556 [00:00<?, ? examples/s]

Error loading OSCAR for Marathi: Cannot take a larger sample than population when 'replace=False'


Downloading data:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/169834 [00:00<?, ? examples/s]

Error loading OSCAR for Gujarati: Cannot take a larger sample than population when 'replace=False'


Downloading data:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/390M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/390M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/391M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/86.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1114481 [00:00<?, ? examples/s]

Loaded Bengali from OSCAR


Downloading data:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/251064 [00:00<?, ? examples/s]

Error loading OSCAR for Kannada: Cannot take a larger sample than population when 'replace=False'


Error: need to escape, but no escapechar set