In [1]:
# Bootstrap pip and upgrade it to the latest release (run this cell only if pip is missing or outdated).
# This cell does NOT install the libraries imported below (pandas, numpy, tqdm, indicnlp, gensim);
# those must already be present in the environment.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which may not be the
# kernel's interpreter — confirm, or consider the `%pip` magic instead.
!python -m ensurepip --upgrade
!python -m pip install --upgrade pip


[0mLooking in links: /tmp/tmpyc1fp0i3
[0m[33mDEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0mInstalling collected packages: pip
  Attempting uninstall: pip
[0m    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
[0mSuccessfully installed pip-25.1.1


In [4]:
# Import libraries
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
import logging
from indicnlp.tokenize import indic_tokenize
from gensim.models.fasttext import load_facebook_model
from typing import List, Dict
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from pandas/gensim — confirm that is intended.
warnings.filterwarnings('ignore')

# Configure logging: INFO level on the root logger, each record formatted as
# "<timestamp> - <level> - <message>". Because the root logger is configured, INFO records
# emitted by imported libraries use the same format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by CorpusVerifierCSV.run_verification.
logger = logging.getLogger(__name__)

class CorpusVerifierCSV:
    """Filter a Tamil-Telugu parallel CSV corpus by FastText vocabulary coverage.

    Every sentence is tokenized with IndicNLP's trivial tokenizer and scored by the
    fraction of its tokens that appear in the vocabulary of the matching FastText
    model. Rows with a missing side, or with coverage below ``min_vocab_coverage``
    on either side, are written to ``problematic_pairs.csv``; all other rows are
    written to ``cleaned_corpus.csv``.
    """

    # Fixed column order for problematic_pairs.csv, so the file has the same header
    # whichever kind of problem is found first and even when no row is flagged.
    PROBLEM_COLUMNS = ['index', 'Tamil', 'Telugu',
                       'tamil_vocab_score', 'telugu_vocab_score', 'reason']

    def __init__(self, input_csv: str, output_dir: str = None, batch_size: int = 5000,
                 tamil_col: str = "Tamil", telugu_col: str = "Telugu",
                 min_vocab_coverage: float = 0.75,
                 tamil_model_path: str = "./models/indicnlp.v1.ta.bin",
                 telugu_model_path: str = "./models/indicnlp.v1.te.bin"):
        """
        Initialize the CorpusVerifierCSV class.

        Creates the output directory and loads both FastText models, so construction
        performs disk I/O.

        Args:
            input_csv (str): Path to the input CSV file.
            output_dir (str, optional): Directory to save output files. Defaults to
                './Finance_Data/FastText' joined onto the input CSV's directory.
            batch_size (int): Number of rows to process per chunk.
            tamil_col (str): Name of the column containing Tamil text.
            telugu_col (str): Name of the column containing Telugu text.
            min_vocab_coverage (float): Minimum fraction of in-vocabulary tokens a
                sentence needs; rows scoring lower on either side are flagged.
            tamil_model_path (str): Path to the Tamil FastText ``.bin`` model.
            telugu_model_path (str): Path to the Telugu FastText ``.bin`` model.
        """
        self.input_csv = input_csv
        self.batch_size = batch_size
        self.tamil_col = tamil_col
        self.telugu_col = telugu_col
        self.min_vocab_coverage = min_vocab_coverage
        # NOTE(review): when input_csv already lives in ./Finance_Data the default
        # resolves to ./Finance_Data/./Finance_Data/FastText (a nested folder).
        # Kept as-is so existing outputs stay where they are; pass output_dir to override.
        self.output_dir = output_dir or os.path.join(os.path.dirname(input_csv), "./Finance_Data/FastText")
        os.makedirs(self.output_dir, exist_ok=True)

        # Load the FastText models; only their vocabularies are used for scoring.
        self.tamil_embeddings = load_facebook_model(tamil_model_path)
        self.telugu_embeddings = load_facebook_model(telugu_model_path)
        # Sets make the per-token membership test constant-time.
        self.tamil_vocab = set(self.tamil_embeddings.wv.index_to_key)
        self.telugu_vocab = set(self.telugu_embeddings.wv.index_to_key)

        # Unicode block matchers for each script (not used by the checks in this class).
        self.tamil_script_pattern = re.compile(r'[\u0B80-\u0BFF]')
        self.telugu_script_pattern = re.compile(r'[\u0C00-\u0C7F]')

    def _ensure_utf8(self, text: str) -> str:
        """Return ``text`` as a string that round-trips through UTF-8.

        Missing values (NaN/None) become ``""``. Non-string cells, such as numbers
        parsed by pandas, are converted with ``str()`` instead of raising. Characters
        that cannot be encoded (e.g. lone surrogates) are replaced with ``?``.
        """
        if pd.isna(text):
            return ""
        return str(text).encode('utf-8', errors='replace').decode('utf-8', errors='replace')

    def _check_vocabulary_baseline(self, text: str, lang: str) -> float:
        """Return the fraction of tokens in ``text`` found in the model vocabulary.

        Args:
            text (str): Sentence to score.
            lang (str): ``'ta'`` selects the Tamil vocabulary; any other value
                selects the Telugu vocabulary.

        Returns:
            float: In-vocabulary tokens divided by total tokens, or 0.0 when the
            text is blank or yields no tokens.
        """
        text = self._ensure_utf8(text)
        if not text.strip():
            return 0.0

        tokens = indic_tokenize.trivial_tokenize(text, lang=lang)
        if not tokens:
            return 0.0

        vocab = self.tamil_vocab if lang == 'ta' else self.telugu_vocab
        in_vocab_count = sum(1 for token in tokens if token in vocab)
        return in_vocab_count / len(tokens)

    def _verify_chunk(self, df_chunk: pd.DataFrame) -> List[Dict]:
        """Return one record per problematic row of ``df_chunk``.

        A row is problematic when either text column is missing (reason
        ``missing_values``) or when either side scores below
        ``self.min_vocab_coverage``. Each record carries the row's index label so
        the caller can drop it from the clean output.
        """
        problems = []
        for idx, row in df_chunk.iterrows():
            tamil_text = row[self.tamil_col]
            telugu_text = row[self.telugu_col]

            if pd.isna(tamil_text) or pd.isna(telugu_text):
                problems.append({'index': idx, 'Tamil': tamil_text, 'Telugu': telugu_text, 'reason': 'missing_values'})
                continue

            tamil_text = self._ensure_utf8(tamil_text)
            telugu_text = self._ensure_utf8(telugu_text)

            tamil_vocab_score = self._check_vocabulary_baseline(tamil_text, 'ta')
            telugu_vocab_score = self._check_vocabulary_baseline(telugu_text, 'te')

            reasons = []
            if tamil_vocab_score < self.min_vocab_coverage:
                reasons.append('tamil_low_vocab_coverage')
            if telugu_vocab_score < self.min_vocab_coverage:
                reasons.append('telugu_low_vocab_coverage')

            if reasons:
                problems.append({
                    'index': idx,
                    'Tamil': tamil_text,
                    'Telugu': telugu_text,
                    'tamil_vocab_score': tamil_vocab_score,
                    'telugu_vocab_score': telugu_vocab_score,
                    'reason': ','.join(reasons)
                })
        return problems

    def run_verification(self):
        """Verify the whole corpus and write the clean and problematic CSV files.

        The input is parsed once, in chunks of ``batch_size`` rows. Flagged rows are
        removed from each chunk as it is processed, so the file is not read a second
        time. Writes ``cleaned_corpus.csv`` and ``problematic_pairs.csv`` into
        ``self.output_dir``.
        """
        logger.info(f"Cleaning corpus: {self.input_csv}")
        all_problems = []
        clean_chunks = []
        total_rows = 0

        # The context manager closes the underlying file handle even if a chunk fails.
        with pd.read_csv(self.input_csv, chunksize=self.batch_size, encoding='utf-8') as reader:
            for df_chunk in tqdm(reader, desc="Processing"):
                total_rows += len(df_chunk)
                problems = self._verify_chunk(df_chunk)
                all_problems.extend(problems)
                # Chunk index labels continue across chunks, so they identify rows globally.
                flagged = {p['index'] for p in problems}
                clean_chunks.append(df_chunk[~df_chunk.index.isin(flagged)])

        if clean_chunks:
            clean_df = pd.concat(clean_chunks)
        else:
            # No data rows were yielded: keep the header so the output is still a valid CSV.
            clean_df = pd.read_csv(self.input_csv, encoding='utf-8', nrows=0)

        clean_path = os.path.join(self.output_dir, 'cleaned_corpus.csv')
        problem_path = os.path.join(self.output_dir, 'problematic_pairs.csv')
        clean_df.to_csv(clean_path, index=False, encoding='utf-8')
        pd.DataFrame(all_problems, columns=self.PROBLEM_COLUMNS).to_csv(problem_path, index=False, encoding='utf-8')

        logger.info(f"Total pairs: {total_rows}, Clean pairs: {len(clean_df)}, Problematic: {len(all_problems)}")
        print(f"Cleaned corpus saved to: {clean_path}")
        print(f"Problematic pairs saved to: {problem_path}")

# Driver: build the verifier for the finance corpus and run it end to end.
# The captured cell output shows this guard is satisfied when the cell executes in the notebook.
if __name__ == "__main__":
    # Update this path to your local CSV file
    input_csv_path = "./Finance_Data/Finance_Data(TAM).csv"
    # Constructing the verifier creates the output directory and loads both FastText models.
    verifier = CorpusVerifierCSV(input_csv=input_csv_path)
    # Writes cleaned_corpus.csv and problematic_pairs.csv into the verifier's output directory.
    verifier.run_verification()

2025-05-05 14:57:42,869 - INFO - loading 1449338 words for fastText model from ./models/indicnlp.v1.ta.bin
2025-05-05 14:57:49,711 - INFO - FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2025-05-05T14:57:49.711957', 'gensim': '4.3.3', 'python': '3.12.0 | packaged by Anaconda, Inc. | (main, Oct  2 2023, 17:29:18) [GCC 11.2.0]', 'platform': 'Linux-6.8.0-50-generic-x86_64-with-glibc2.35', 'event': 'created'}
2025-05-05 14:57:49,712 - INFO - Updating model with new vocabulary
2025-05-05 14:57:53,834 - INFO - FastText lifecycle event {'msg': 'added 1449338 new unique words (100.00% of original 1449338) and increased the count of 0 pre-existing words (0.00% of original 1449338)', 'datetime': '2025-05-05T14:57:53.834363', 'gensim': '4.3.3', 'python': '3.12.0 | packaged by Anaconda, Inc. | (main, Oct  2 2023, 17:29:18) [GCC 11.2.0]', 'platform': 'Linux-6.8.0-50-generic-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}
2025-05-05 14:57:57,911

Cleaned corpus saved to: ./Finance_Data/./Finance_Data/FastText/cleaned_corpus.csv
Problematic pairs saved to: ./Finance_Data/./Finance_Data/FastText/problematic_pairs.csv
