# Quoco

## Storing structured data

In [2]:
import os
import re
import json
import pandas as pd
from typing import Dict, List, Optional, Tuple
import google.generativeai as genai
from dotenv import load_dotenv
import psycopg2
from psycopg2.extras import execute_values
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio
from functools import partial
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load environment variables from a .env file (python-dotenv) into the
# process environment so they can be read later via os.environ / os.getenv.
load_dotenv()

True

In [None]:
# Pattern matching utilities for BoQ data extraction
class BoQPatternMatcher:
    """Pattern matching utilities for BoQ data extraction.

    The class holds no instance state; every method is a class or static
    method.  It covers four concerns:

    * recognising which DataFrame column holds which BoQ field
      (``match_column`` / ``identify_columns``),
    * validating item codes (``is_valid_item_code``),
    * normalising unit-of-measurement strings (``normalize_unit``),
    * pulling a number out of a free-form cell (``extract_numeric``).
    """

    # Regex fragments per field type.  Each fragment is searched
    # case-insensitively anywhere inside the normalised column name.
    COLUMN_PATTERNS = {
        'item_code': [
            r'item[\s_-]*no',
            r'item[\s_-]*code',
            r's[\s_-]*no',
            r'sl[\s_-]*no',
            r'serial[\s_-]*no',
            r'code',
            r'no\.',
        ],
        'description': [
            r'description',
            r'item[\s_-]*description',
            r'work[\s_-]*description',
            r'particulars',
            r'scope[\s_-]*of[\s_-]*work',
            r'details',
        ],
        'quantity': [
            r'qty',
            r'quantity',
            r'qnty',
            r'quan\.',
        ],
        'unit': [
            r'unit',
            r'uom',
            r'unit[\s_-]*of[\s_-]*measurement',
            r'measure',
        ],
        'rate': [
            r'rate',
            r'unit[\s_-]*rate',
            r'price',
            r'rate[\s_-]*\([\s_-]*rs',
        ],
        'supply_rate': [
            r'supply[\s_-]*rate',
            r'material[\s_-]*rate',
            r'supply[\s_-]*unit[\s_-]*rate',
        ],
        'labour_rate': [
            r'labour[\s_-]*rate',
            r'labor[\s_-]*rate',
            r'labour[\s_-]*unit[\s_-]*rate',
            r'labor[\s_-]*unit[\s_-]*rate',
        ],
        'amount': [
            r'amount',
            r'total',
            r'value',
            r'amount[\s_-]*\([\s_-]*rs',
        ]
    }

    # Item code patterns (e.g., 1.1, 2.3.4, A1, etc.): an optional single
    # upper-case letter, digits, then any number of ".digits" groups.
    ITEM_CODE_PATTERN = re.compile(r'^[A-Z]?[0-9]+(\.[0-9]+)*$')

    # Lower-cased unit spellings -> canonical unit label.  Kept at class
    # level so the table is built once rather than on every call.
    UNIT_MAP = {
        'sqm': 'Sqm', 'sq.m': 'Sqm', 'sq m': 'Sqm', 'square meter': 'Sqm',
        'cum': 'Cum', 'cu.m': 'Cum', 'cu m': 'Cum', 'cubic meter': 'Cum',
        'mtr': 'Mtr', 'm': 'Mtr', 'meter': 'Mtr', 'metre': 'Mtr',
        'kg': 'Kg', 'kilogram': 'Kg',
        'nos': 'Nos', 'no': 'Nos', 'number': 'Nos', 'each': 'Each',
        'litre': 'Ltr', 'ltr': 'Ltr', 'l': 'Ltr',
        'ton': 'Ton', 'tonne': 'Ton', 'mt': 'Ton',
        'rm': 'Rmt', 'rmt': 'Rmt', 'running meter': 'Rmt',
    }

    @staticmethod
    def normalize_column_name(col_name: str) -> str:
        """Normalize column name for matching.

        Returns the lower-cased, whitespace-stripped string form of
        ``col_name``, or ``""`` when ``col_name`` is missing (NaN/None).
        """
        if pd.isna(col_name):
            return ""
        return str(col_name).lower().strip()

    @classmethod
    def _match_length(cls, col_name: str, field_type: str) -> int:
        """Return the longest pattern match of ``field_type`` in ``col_name``.

        The result is the character length of the longest regex match among
        the field's patterns, or ``-1`` when no pattern matches (or the
        field type is unknown).  A longer match means a more specific
        pattern recognised the header.
        """
        normalized = cls.normalize_column_name(col_name)
        longest = -1
        for pattern in cls.COLUMN_PATTERNS.get(field_type, []):
            found = re.search(pattern, normalized, re.IGNORECASE)
            if found:
                longest = max(longest, found.end() - found.start())
        return longest

    @classmethod
    def match_column(cls, col_name: str, field_type: str) -> bool:
        """Check if column name matches a field type.

        Returns ``True`` when any pattern registered for ``field_type``
        is found in the normalised column name; unknown field types never
        match.
        """
        normalized = cls.normalize_column_name(col_name)
        patterns = cls.COLUMN_PATTERNS.get(field_type, [])
        return any(
            re.search(pattern, normalized, re.IGNORECASE)
            for pattern in patterns
        )

    @classmethod
    def identify_columns(cls, df: pd.DataFrame) -> Dict[str, Optional[str]]:
        """Identify column mappings in the DataFrame.

        Returns a dict keyed by field type whose value is the matching
        column label from ``df.columns`` (the original label, not its
        string form), or ``None`` when no column was recognised.

        Columns are visited left to right and each field is filled at most
        once, so the first column that claims a field keeps it.  A column
        is given to the still-unassigned field with the *longest* pattern
        match, so a compound header such as "Supply Rate" goes to
        ``supply_rate`` instead of being swallowed by the generic ``rate``
        pattern, and "Unit Rate" goes to ``rate`` rather than ``unit``.
        Ties fall back to the declaration order of the fields.
        """
        columns = {
            'item_code': None,
            'description': None,
            'quantity': None,
            'unit': None,
            'rate': None,
            'supply_rate': None,
            'labour_rate': None,
            'amount': None
        }

        for col in df.columns:
            col_str = str(col)
            best_field = None
            best_length = -1
            for field_type, assigned in columns.items():
                if assigned is not None:
                    continue
                length = cls._match_length(col_str, field_type)
                # Strict ">" keeps the earlier field on equal-length matches.
                if length > best_length:
                    best_field, best_length = field_type, length
            if best_field is not None:
                columns[best_field] = col

        return columns

    @classmethod
    def is_valid_item_code(cls, value) -> bool:
        """Check if value looks like a valid item code.

        Missing values are never valid.  Anything else is converted to a
        stripped string and tested against ``ITEM_CODE_PATTERN``.
        """
        if pd.isna(value):
            return False
        value_str = str(value).strip()
        return bool(cls.ITEM_CODE_PATTERN.match(value_str))

    @classmethod
    def normalize_unit(cls, unit: str) -> str:
        """Normalize unit of measurement.

        Returns ``"Each"`` for missing, blank, or purely numeric input.
        Known spellings (case-insensitive, with or without trailing
        periods, e.g. "Nos." or "Sq.m.") are mapped through ``UNIT_MAP``;
        anything else is returned as its stripped string form.
        """
        if pd.isna(unit):
            return "Each"

        # Convert to string first to handle integers/floats
        unit_str = str(unit).strip()

        # A blank cell carries no unit information; treat it as missing.
        if not unit_str:
            return "Each"

        # If it's just a number, return "Each"
        if unit_str.replace('.', '').replace('-', '').isdigit():
            return "Each"

        unit_lower = unit_str.lower()
        if unit_lower in cls.UNIT_MAP:
            return cls.UNIT_MAP[unit_lower]

        # Abbreviations are often written with a trailing period.
        without_dots = unit_lower.rstrip('.').strip()
        return cls.UNIT_MAP.get(without_dots, unit_str)

    @classmethod
    def extract_numeric(cls, value) -> float:
        """Extract numeric value from string.

        Missing values yield ``0.0``; ints and floats are returned as
        floats.  Otherwise commas are removed from the string form and the
        first signed decimal number found is returned, or ``0.0`` when the
        text contains no number.
        """
        if pd.isna(value):
            return 0.0

        if isinstance(value, (int, float)):
            return float(value)

        # Remove commas (thousands separators) and extract the first number
        value_str = str(value).replace(',', '')
        match = re.search(r'[-+]?[0-9]*\.?[0-9]+', value_str)

        # The pattern only yields ASCII digits with an optional sign and
        # decimal point, which float() always accepts.
        return float(match.group()) if match else 0.0
