In [1]:
!pip install pandas numpy tqdm langid transformers torch scikit-learn

    PyYAML (>=5.1.*)
            ~~~~~~^

[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: pip install --upgrade pip


In [4]:

import pandas as pd
import numpy as np
import re
import os
import torch
from tqdm.notebook import tqdm  # Use notebook-specific tqdm for Jupyter
import logging
from typing import Dict, List, Tuple
from collections import Counter
import langid
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings(action='ignore')

# Logging setup: every record at INFO or above goes both to a log file in the
# working directory and to the notebook's output stream.
logging.basicConfig(
    handlers=[
        logging.FileHandler(filename="corpus_verification.log"),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(name=__name__)

class CorpusVerifierCSV:
    """Screen a Tamil-Telugu parallel-corpus CSV and split it into clean and problematic pairs.

    Every row is checked for missing values, character-length ratio, detected
    language (via ``langid``), the share of characters that fall in the
    expected Unicode script block and, optionally, LaBSE sentence-embedding
    similarity. Rows failing at least one check are written to
    ``problematic_pairs.csv``; all other rows are written to
    ``cleaned_pairs.csv`` plus one single-column CSV per language. A plain-text
    summary is written to ``verification_report.txt``.
    """

    # Dimensionality of the LaBSE sentence embedding; also the size of the
    # zero vector returned whenever no embedding can be computed.
    EMBEDDING_DIM = 768

    # Column order of the problematic-pairs report.
    PROBLEM_COLUMNS = ['index', 'tamil', 'telugu', 'char_ratio', 'lang_ta', 'lang_te',
                       'purity_ta', 'purity_te', 'semantic_sim', 'reason']

    def __init__(self, input_csv: str, output_dir: str = None,
                 use_transformer_models: bool = True, batch_size: int = 10000,
                 tamil_col: str = "Tamil", telugu_col: str = "Telugu",
                 max_length_ratio: float = 2.5, min_script_purity: float = 0.8,
                 min_semantic_similarity: float = 0.7):
        """Configure paths and thresholds and, if requested, load the LaBSE model.

        Args:
            input_csv: Path of the CSV holding the parallel corpus.
            output_dir: Directory for all generated files. Defaults to a
                ``corpus_verification_output`` folder next to ``input_csv``.
            use_transformer_models: Whether to run the LaBSE similarity check.
                Switched off automatically if the model cannot be loaded.
            batch_size: Number of CSV rows read per chunk.
            tamil_col: Name of the column containing Tamil text.
            telugu_col: Name of the column containing Telugu text.
            max_length_ratio: A pair is flagged when longer/shorter character
                length exceeds this value.
            min_script_purity: A text is flagged when the fraction of its
                characters in the expected script is below this value.
            min_semantic_similarity: A pair is flagged when its LaBSE cosine
                similarity is below this value.
        """
        self.input_csv = input_csv
        self.batch_size = batch_size
        self.use_transformer_models = use_transformer_models
        self.tamil_col = tamil_col
        self.telugu_col = telugu_col
        self.max_length_ratio = max_length_ratio
        self.min_script_purity = min_script_purity
        self.min_semantic_similarity = min_semantic_similarity

        # Create output dir next to input file if none is given
        if output_dir is None:
            base_dir = os.path.dirname(input_csv)
            self.output_dir = os.path.join(base_dir, "corpus_verification_output")
        else:
            self.output_dir = output_dir

        os.makedirs(self.output_dir, exist_ok=True)

        self.verification_path = os.path.join(self.output_dir, 'verification_report.txt')
        self.problematic_pairs_path = os.path.join(self.output_dir, 'problematic_pairs.csv')

        if self.use_transformer_models:
            try:
                logger.info("Loading LaBSE model...")
                # Imported lazily so the class stays usable without transformers
                # when the similarity check is disabled.
                from transformers import BertModel, BertTokenizer
                self.tokenizer = BertTokenizer.from_pretrained('setu4993/LaBSE')
                self.model = BertModel.from_pretrained('setu4993/LaBSE')
                self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                self.model = self.model.to(self.device)
                if self.device.type == 'cuda':
                    logger.info(f"LaBSE model loaded on {self.device} (GPU: {torch.cuda.get_device_name(0)})")
                    torch.cuda.empty_cache()  # Clear GPU memory
                else:
                    # Querying the CUDA device name on a CPU-only host raises,
                    # which used to disable a model that had loaded fine.
                    logger.info(f"LaBSE model loaded on {self.device}")
            except Exception as e:
                logger.error(f"Error loading LaBSE model: {e}. Disabling transformer models.")
                self.use_transformer_models = False

        # Define script patterns for Tamil and Telugu
        self.tamil_script_pattern = re.compile(r'[\u0B80-\u0BFF]')  # Tamil Unicode range
        self.telugu_script_pattern = re.compile(r'[\u0C00-\u0C7F]')  # Telugu Unicode range

    def _ensure_utf8(self, text: str) -> str:
        """Return ``text`` as a string that round-trips through UTF-8.

        Missing values (anything ``pd.isna`` reports as missing) become ``""``.
        Non-string values are converted with ``str()``. Characters that cannot
        be encoded as UTF-8 (e.g. lone surrogates) are replaced with ``?``
        instead of discarding the whole text.
        """
        if pd.isna(text):
            return ""
        try:
            if not isinstance(text, str):
                text = str(text)
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
        except Exception as e:
            logger.warning(f"Failed to decode text: {e}. Returning empty string.")
            return ""

    def _calculate_character_ratio(self, tamil_text: str, telugu_text: str) -> float:
        """Return longer/shorter character-length ratio, or 0.0 if either text is empty."""
        tamil_len = len(self._ensure_utf8(tamil_text))
        telugu_len = len(self._ensure_utf8(telugu_text))
        if tamil_len == 0 or telugu_len == 0:
            return 0.0
        return max(tamil_len, telugu_len) / min(tamil_len, telugu_len)

    def _detect_language(self, text: str) -> str:
        """Return the langid language code for ``text``, or ``"unknown"``.

        ``"unknown"`` is returned for missing or whitespace-only input and when
        classification raises.
        """
        try:
            # Normalising first makes missing and non-string values safe to strip.
            text = self._ensure_utf8(text)
            if not text.strip():
                return "unknown"
            lang, _ = langid.classify(text)
            return lang
        except Exception as e:
            logger.error(f"Language detection failed: {e}")
            return "unknown"

    def _script_purity(self, text: str, script_pattern) -> float:
        """Return the fraction of characters in ``text`` matched by ``script_pattern``.

        The denominator is the full text length, so spaces, digits and
        punctuation all lower the score. Missing or whitespace-only input
        yields 0.0.
        """
        text = self._ensure_utf8(text)
        if not text.strip():
            return 0.0
        script_chars = len(re.findall(script_pattern, text))
        purity = script_chars / len(text)
        logger.debug(f"Script purity: {purity} for text: {text[:50]}...")
        return purity

    def _get_sentence_embedding(self, text: str) -> np.ndarray:
        """Return the L2-normalised LaBSE pooled embedding of ``text``.

        A zero vector of length ``EMBEDDING_DIM`` is returned when transformer
        models are disabled, when ``text`` is missing, or when embedding fails,
        so every return value has the same shape.
        """
        if not self.use_transformer_models:
            return np.zeros(self.EMBEDDING_DIM)
        try:
            if pd.isna(text):
                return np.zeros(self.EMBEDDING_DIM)
            text = self._ensure_utf8(text)
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
            inputs = {key: val.to(self.device) for key, val in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            embedding = outputs.pooler_output.cpu().numpy()[0]
            norm = np.linalg.norm(embedding)
            if norm == 0:
                # Avoid dividing by zero; a zero vector scores 0 similarity downstream.
                return np.zeros(self.EMBEDDING_DIM)
            return embedding / norm
        except Exception as e:
            logger.error(f"Error getting sentence embedding: {e}. Returning zero vector.")
            return np.zeros(self.EMBEDDING_DIM)

    def _calculate_semantic_similarity(self, tamil_text: str, telugu_text: str) -> float:
        """Return the cosine similarity of the two texts' LaBSE embeddings.

        Returns 0.0 when transformer models are disabled or on any error.
        """
        if not self.use_transformer_models:
            return 0.0
        try:
            tamil_embedding = self._get_sentence_embedding(tamil_text)
            telugu_embedding = self._get_sentence_embedding(telugu_text)
            similarity = cosine_similarity([tamil_embedding], [telugu_embedding])[0][0]
            return float(similarity)
        except Exception as e:
            logger.error(f"Error calculating semantic similarity: {e}")
            return 0.0

    @staticmethod
    def _problem_record(idx, tamil, telugu, reason: str, char_ratio: float = 0.0,
                        lang_ta: str = 'unknown', lang_te: str = 'unknown',
                        purity_ta: float = 0.0, purity_te: float = 0.0,
                        semantic_sim: float = 0.0) -> Dict:
        """Build one row of the problematic-pairs report (keys follow PROBLEM_COLUMNS)."""
        return {
            'index': idx,
            'tamil': tamil,
            'telugu': telugu,
            'char_ratio': char_ratio,
            'lang_ta': lang_ta,
            'lang_te': lang_te,
            'purity_ta': purity_ta,
            'purity_te': purity_te,
            'semantic_sim': semantic_sim,
            'reason': reason,
        }

    def _verify_chunk(self, df_chunk: pd.DataFrame) -> List[Dict]:
        """Run all checks on every row of ``df_chunk``.

        Returns:
            One record (see ``_problem_record``) per row that failed at least
            one check. ``reason`` is a comma-separated list of failed checks,
            or ``missing_values`` when either text is missing.
        """
        problems = []
        for idx, row in df_chunk.iterrows():
            raw_tamil = row[self.tamil_col]
            raw_telugu = row[self.telugu_col]
            if pd.isna(raw_tamil) or pd.isna(raw_telugu):
                problems.append(self._problem_record(idx, raw_tamil, raw_telugu, 'missing_values'))
                continue

            tamil_text = self._ensure_utf8(raw_tamil)
            telugu_text = self._ensure_utf8(raw_telugu)
            ratio = self._calculate_character_ratio(tamil_text, telugu_text)
            lang_ta = self._detect_language(tamil_text)
            lang_te = self._detect_language(telugu_text)
            purity_ta = self._script_purity(tamil_text, self.tamil_script_pattern)
            purity_te = self._script_purity(telugu_text, self.telugu_script_pattern)
            # Returns 0.0 by itself when transformer models are disabled.
            similarity = self._calculate_semantic_similarity(tamil_text, telugu_text)

            reasons = []
            if ratio > self.max_length_ratio:
                reasons.append('length_ratio')
            if lang_ta != 'ta':
                reasons.append('wrong_tamil_lang')
            if lang_te != 'te':
                reasons.append('wrong_telugu_lang')
            if purity_ta < self.min_script_purity:
                reasons.append('low_tamil_script_purity')
            if purity_te < self.min_script_purity:
                reasons.append('low_telugu_script_purity')
            if self.use_transformer_models and similarity < self.min_semantic_similarity:
                reasons.append('low_semantic_similarity')

            if reasons:
                problems.append(self._problem_record(
                    idx, tamil_text, telugu_text, ','.join(reasons),
                    char_ratio=ratio, lang_ta=lang_ta, lang_te=lang_te,
                    purity_ta=purity_ta, purity_te=purity_te, semantic_sim=similarity,
                ))

        return problems

    def _validate_output(self, df: pd.DataFrame, file_path: str,
                         expect_tamil: bool = True, expect_telugu: bool = True):
        """Warn if a written file contains no characters of an expected script.

        Args:
            df: The frame that was written to ``file_path``. Nothing is checked
                when it is empty, because an empty file cannot contain either script.
            file_path: Path of the UTF-8 file to inspect.
            expect_tamil: Whether Tamil characters should be present.
            expect_telugu: Whether Telugu characters should be present.
        """
        if df is None or df.empty:
            return

        found_tamil = not expect_tamil
        found_telugu = not expect_telugu
        # Scan line by line and stop at the first evidence of each script
        # rather than loading the whole file into memory.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not found_tamil and self.tamil_script_pattern.search(line):
                    found_tamil = True
                if not found_telugu and self.telugu_script_pattern.search(line):
                    found_telugu = True
                if found_tamil and found_telugu:
                    return

        if not (found_tamil and found_telugu):
            logger.warning(f"Output file {file_path} may have lost Tamil or Telugu scripts.")

    def run_verification(self):
        """Verify the whole CSV and write the report, problematic and cleaned files.

        Raises:
            ValueError: If the Tamil or Telugu column is missing from the CSV.
        """
        logger.info(f"Starting verification using CSV file: {self.input_csv}")

        # Read CSV file in chunks - explicitly specify encoding
        all_problems = []
        chunk_num = 0
        total_rows = 0

        for df_chunk in tqdm(pd.read_csv(self.input_csv, chunksize=self.batch_size, encoding='utf-8'),
                             desc="Processing chunks"):
            chunk_num += 1
            total_rows += len(df_chunk)

            # Check if required columns exist
            if self.tamil_col not in df_chunk.columns or self.telugu_col not in df_chunk.columns:
                logger.error(f"Required columns '{self.tamil_col}' and/or '{self.telugu_col}' not found in CSV")
                raise ValueError(f"Required columns not found. Available columns: {df_chunk.columns.tolist()}")

            # Process this chunk
            problems = self._verify_chunk(df_chunk)
            all_problems.extend(problems)

            logger.info(f"Processed chunk {chunk_num} with {len(df_chunk)} rows. "
                        f"Found {len(problems)} problematic pairs in this chunk.")

        logger.info(f"Completed verification. Total rows processed: {total_rows}")
        logger.info(f"Total problematic pairs found: {len(all_problems)}")

        # One code path for both cases: with no problems this writes a
        # header-only CSV with the same columns.
        problem_df = pd.DataFrame(all_problems, columns=self.PROBLEM_COLUMNS)
        problem_df.to_csv(self.problematic_pairs_path, index=False, encoding='utf-8')
        self._validate_output(problem_df, self.problematic_pairs_path)
        if all_problems:
            logger.info(f"Saved problematic pairs to: {self.problematic_pairs_path}")
        else:
            logger.info("No problematic pairs found.")

        # Read complete dataset with UTF-8 encoding
        logger.info("Reading full dataset to create cleaned output...")
        full_df = pd.read_csv(self.input_csv, encoding='utf-8')

        # Row labels recorded during the chunked pass are matched against the
        # index of the full read to drop the flagged rows.
        problem_indices = set(p['index'] for p in all_problems)
        clean_df = full_df[~full_df.index.isin(problem_indices)].copy()

        # Save cleaned files with UTF-8 encoding
        cleaned_csv_path = os.path.join(self.output_dir, 'cleaned_pairs.csv')
        cleaned_tamil_path = os.path.join(self.output_dir, 'cleaned_tamil.csv')
        cleaned_telugu_path = os.path.join(self.output_dir, 'cleaned_telugu.csv')

        clean_df.to_csv(cleaned_csv_path, index=False, encoding='utf-8')
        self._validate_output(clean_df, cleaned_csv_path)
        # Single-language files are only expected to contain their own script.
        clean_df[[self.tamil_col]].to_csv(cleaned_tamil_path, index=False, encoding='utf-8')
        self._validate_output(clean_df[[self.tamil_col]], cleaned_tamil_path, expect_telugu=False)
        clean_df[[self.telugu_col]].to_csv(cleaned_telugu_path, index=False, encoding='utf-8')
        self._validate_output(clean_df[[self.telugu_col]], cleaned_telugu_path, expect_tamil=False)

        # Counts for summary; the clean count comes from the frame actually written.
        total_pairs = len(full_df)
        problematic_pairs = len(all_problems)
        clean_pairs = len(clean_df)

        # Write verification summary
        with open(self.verification_path, 'w', encoding='utf-8') as f:
            f.write(f"Total pairs: {total_pairs}\n")
            f.write(f"Problematic pairs: {problematic_pairs}\n")
            f.write(f"Clean pairs: {clean_pairs}\n")
            f.write(f"\nProblematic pairs file: {self.problematic_pairs_path}\n")
            f.write(f"Cleaned full pairs file: {cleaned_csv_path}\n")
            f.write(f"Cleaned Tamil file: {cleaned_tamil_path}\n")
            f.write(f"Cleaned Telugu file: {cleaned_telugu_path}\n")

            if all_problems:
                # A pair with several failed checks counts once per reason.
                reason_counts = Counter(
                    reason for p in all_problems for reason in p['reason'].split(',')
                )

                f.write("\nBreakdown of problematic pairs by reason:\n")
                for reason, count in reason_counts.most_common():
                    f.write(f"  - {reason}: {count}\n")

        logger.info("Verification complete.")
        logger.info(f"Total pairs: {total_pairs}")
        logger.info(f"Problematic pairs: {problematic_pairs}")
        logger.info(f"Clean pairs: {clean_pairs}")
        logger.info(f"Saved verification report to: {self.verification_path}")
        logger.info(f"Saved cleaned pairs to: {cleaned_csv_path}")

        print(f"\nAll output files are in: {self.output_dir}")
        print(f"\nVerification Summary:")
        print(f"Total pairs processed: {total_pairs}")
        print(f"Problematic pairs removed: {problematic_pairs}")
        print(f"Clean pairs retained: {clean_pairs}")
        print("\nOutput files:")
        print(f" - Cleaned pairs: {cleaned_csv_path}")
        print(f" - Cleaned Tamil: {cleaned_tamil_path}")
        print(f" - Cleaned Telugu: {cleaned_telugu_path}")
        print(f" - Problematic pairs: {self.problematic_pairs_path}")
        print(f" - Verification report: {self.verification_path}")

# Usage example:
if __name__ == "__main__":
    # Source corpus; change this to wherever the CSV lives (local or server).
    input_csv = "./Finance_Data/Finance_Data.csv"

    # Run configuration gathered in one place before constructing the verifier.
    verifier_settings = {
        "input_csv": input_csv,
        "output_dir": "./Finance_Data/LaBSE",  # destination for all generated files
        "use_transformer_models": True,         # enable the LaBSE similarity check
        "batch_size": 10000,                    # rows read per CSV chunk
        "tamil_col": "Tamil",
        "telugu_col": "Telugu",
    }

    verifier = CorpusVerifierCSV(**verifier_settings)
    verifier.run_verification()

2025-05-05 14:37:19,145 - INFO - Loading LaBSE model...
2025-05-05 14:37:21,910 - INFO - LaBSE model loaded on cuda (GPU: NVIDIA RTX A6000)
2025-05-05 14:37:21,946 - INFO - Starting verification using CSV file: ./Finance_Data/Finance_Data.csv


Processing chunks: 0it [00:00, ?it/s]

2025-05-05 14:39:30,537 - INFO - Processed chunk 1 with 10000 rows. Found 3648 problematic pairs in this chunk.
2025-05-05 14:41:02,994 - INFO - Processed chunk 2 with 7721 rows. Found 3386 problematic pairs in this chunk.
2025-05-05 14:41:02,995 - INFO - Completed verification. Total rows processed: 17721
2025-05-05 14:41:02,996 - INFO - Total problematic pairs found: 7034
2025-05-05 14:41:03,079 - INFO - Saved problematic pairs to: ./Finance_Data/LaBSE/problematic_pairs.csv
2025-05-05 14:41:03,080 - INFO - Reading full dataset to create cleaned output...
2025-05-05 14:41:03,294 - INFO - Verification complete.
2025-05-05 14:41:03,294 - INFO - Total pairs: 17721
2025-05-05 14:41:03,294 - INFO - Problematic pairs: 7034
2025-05-05 14:41:03,295 - INFO - Clean pairs: 10687
2025-05-05 14:41:03,295 - INFO - Saved verification report to: ./Finance_Data/LaBSE/verification_report.txt
2025-05-05 14:41:03,295 - INFO - Saved cleaned pairs to: ./Finance_Data/LaBSE/cleaned_pairs.csv



All output files are in: ./Finance_Data/LaBSE

Verification Summary:
Total pairs processed: 17721
Problematic pairs removed: 7034
Clean pairs retained: 10687

Output files:
 - Cleaned pairs: ./Finance_Data/LaBSE/cleaned_pairs.csv
 - Cleaned Tamil: ./Finance_Data/LaBSE/cleaned_tamil.csv
 - Cleaned Telugu: ./Finance_Data/LaBSE/cleaned_telugu.csv
 - Problematic pairs: ./Finance_Data/LaBSE/problematic_pairs.csv
 - Verification report: ./Finance_Data/LaBSE/verification_report.txt
