# Location Update Test Suite

This notebook contains test functions for the helpers.py module.

In [2]:
import os
import json
import requests
from rasa_sdk import Tracker
from pathlib import Path
from typing import Dict, List, Any
from icecream import ic
from constants import (
    LOOKUP_FILE_PATH,
    DEFAULT_CSV_PATH,
    COUNTER_FILE,
    LOCATION_FOLDER_PATH,
    CUT_OFF_FUZZY_MATCH_LOCATION,
    USE_QR_CODE,
    DIC_LOCATION_WORDS,
    DIC_LOCATION_MAPPING
)
from helpers import load_categories_from_lookup
from icecream import ic
LIST_OF_CATEGORIES = load_categories_from_lookup()


class LocationValidator:
    def __init__(self, 
                 tracker = Tracker, 
                 json_path=LOCATION_FOLDER_PATH):
        language_code = 'en'
        try:
            if tracker:
                language_temp = tracker.get_slot("language_code")
                if language_temp:
                    language_code = language_temp
        except:
            language_code = 'en'
            
        json_path = f"{json_path}{language_code}_cleaned.json"
        if USE_QR_CODE:
            with open(json_path, "r") as file:
                self.locations = self._normalize_locations(json.load(file))
            self.max_words = self._calculate_max_words()

    def _normalize_locations(self, locations):
        """Normalize all names in the locations data to lowercase."""
        i = 0
        j = 0
        k = 0
        new_locations = dict()
        for province in locations:
            if type(province) == dict:
                province["name"] = province["name"].title()
                for district in province.get("districts", []):
                    if type(district) == dict:
                        district["name"] = district["name"].lower()
                    for municipality in district.get("municipalities", []):
                        if type(municipality) == dict:
                            municipality["name"] = municipality["name"].lower()
        return locations

    def _calculate_max_words(self):
        """Calculate the maximum number of words in any municipality name."""
        max_words = 0
        for province in self.locations:
            for district in province.get("districts", []):
                for municipality in district.get("municipalities", []):
                    words = len(municipality["name"].split())
                    max_words = max(max_words, words)
        return max_words

    def _get_common_suffixes(self):
        """Return list of common suffixes to remove."""
        return [
            "province", "district", "municipality", 
            "rural municipality", "metropolitan", 
            "sub-metropolitan", "submetropolitan",
            "metropolitan city", "rural mun", "mun", 
            "महानगरपालिका" , "प्रदेश", "जिल्ला", "गाउँपालिका" , "नगरपालिका" 
            
        ]

    def _preprocess(self, text):
        """Normalize user input to lowercase and remove common suffixes."""
        if not text:
            return None
        
        text = text.lower().strip()
        for suffix in self._get_common_suffixes():
            text = text.replace(suffix, "").strip()
        return text

    def _generate_possible_names(self, text):
        """Generate possible location names from input text."""
        if not text:
            return []
        
        words = text.split()
        possible_names = []
        for i in range(len(words)):
            for j in range(i + 1, min(i + self.max_words + 1, len(words) + 1)):
                possible_names.append(" ".join(words[i:j]))
        return possible_names

    def _find_best_match(self, input_value, options, score_cutoff=CUT_OFF_FUZZY_MATCH_LOCATION):
        """Find the best match using fuzzy matching."""
        if not input_value or not options:
            return None
        
        input_value = self._preprocess(input_value)
        match = process.extractOne(input_value, options, score_cutoff= CUT_OFF_FUZZY_MATCH_LOCATION)
        if match:
            print(f"######## LocationValidator: Score: {match[1]}")
        return match[0] if match else None

    def _get_province_data(self, province_name):
        """Get province data by name."""
        return next(
            (p for p in self.locations
             if p["name"] == province_name),
            None
        )

    def _get_district_data(self, province_data, district_name):
        """Get district data by name within a province."""
        return next(
            (d for d in province_data.get("districts", []) 
             if d["name"] == district_name),
            None
        )
        
    def _get_municipality_names(self, district_data: dict) -> list:
        """
        Extract and process municipality names from district data.
        
        Args:
            district_data (dict): Dictionary containing municipality list
            
        Returns:
            list: Processed municipality names with common suffixes removed
        """
        language_code = self.tracker.get_slot("language_code")
        language_code = "en" if not language_code else language_code
        remove_words = DIC_LOCATION_WORDS["municipality"][language_code]
        if not district_data or "municipalities" not in district_data:  
            return []
        
        
        municipality_names = []
        for mun in district_data.get("municipalities", []):
            if not mun or "name" not in mun:
                continue
            
            name = mun["name"].lower()
            # Remove each word and clean up extra spaces
            for word in remove_words:
                name = name.replace(word, "")
            name = name.strip()
            
            if len(name) > 2:  # Only add non-empty names
                municipality_names.append(name)
            
        return municipality_names

    def _match_with_qr_data(self, possible_names, qr_province, qr_district):
        """Try to match location using QR-provided data."""
        print(f"######## LocationValidator: QR")
        province_list = self.locations
        province_names = [p["name"] for p in province_list]
        
        # Match province from QR data
        matched_province = self._find_best_match(qr_province, province_names) if qr_province else None
        if not matched_province:
            return None, None, None
            
        # Get province data and match district
        province_data = self._get_province_data(matched_province)
        district_names = [d["name"] for d in province_data.get("districts", [])]
        matched_district = self._find_best_match(qr_district, district_names) if qr_district else None
        
        if not matched_district:
            return matched_province, None, None
            
        # Get district data and match municipality
        district_data = self._get_district_data(province_data, matched_district)
        municipality_names = self._get_municipality_names(district_data)
        
        # Try to match municipality from possible names
        for possible_name in possible_names:
            if municipality_names:
                print(f"######## LocationValidator: Municipality names: {municipality_names}")
            else:
                print(f"######## LocationValidator: No municipality names")
            matched_municipality = self._find_best_match(possible_name, municipality_names)
            if matched_municipality:
                return matched_province, matched_district, matched_municipality
                
        return matched_province, matched_district, None

    def _match_from_string(self, possible_names):
        """Try to match location from possible names without QR data."""
        print(f"######## LocationValidator: String")
        for province in self.locations:
            for district in province.get("districts", []):
                municipality_names = self._get_municipality_names(district)
                print(f"######## LocationValidator: Municipality names: {municipality_names}")
                
                # Try municipality match first
                for possible_name in possible_names:
                    matched_municipality = self._find_best_match(possible_name, municipality_names)
                    if matched_municipality:
                        return province["name"], district["name"], matched_municipality
        
        # If no municipality match, try district match
        for province in self.locations:
            district_names = [d["name"] for d in province.get("districts", [])]
            for possible_name in possible_names:
                matched_district = self._find_best_match(possible_name, district_names)
                if matched_district:
                    return province["name"], matched_district, None
        
        # Finally, try province match
        province_names = [p["name"] for p in self.locations]
        for possible_name in possible_names:
            matched_province = self._find_best_match(possible_name, province_names)
            if matched_province:
                return matched_province, None, None
                
        return None, None, None

    def _format_result(self, province, district, municipality):
        """Format the validation result with appropriate error messages."""
        if not province:
            return {"error": "Could not determine province."}
        if not district:
            return {
                "province": province,
                "error": f"Could not determine district in {province}."
            }
        if not municipality:
            return {
                "province": province,
                "district": district,
                "error": f"Could not determine municipality in {district}."
            }
        return {
            "province": province,
            "district": district,
            "municipality": municipality
        }

    def _validate_location(self, location_string, qr_province=None, qr_district=None):
        """Validate location from a single string input and QR defaults."""
        # Preprocess input and generate possible names
        processed_text = self._preprocess(location_string)
        print(f"######## LocationValidator: Preprocessed text: {processed_text}")
        possible_names = self._generate_possible_names(processed_text)
        
        
        #check if QR code is provided
        if qr_province and qr_district:
            # Try matching with QR data first
            province, district, municipality = self._match_with_qr_data(
                possible_names, qr_province, qr_district
            )
        
        else:
            province, district, municipality = self._match_from_string(possible_names)

        
        # Format and return the result
        result = self._format_result(province, district, municipality)
        print(f"######## LocationValidator: Result: {result}")
        return result

    def _check_province(self, input_text):
        """Check if the province name is valid."""
        # Finally, try province match
        possible_names = self._generate_possible_names(input_text)
        province_names = [p["name"] for p in self.locations]
        for possible_name in possible_names:
            matched_province = self._find_best_match(possible_name, province_names)
            if matched_province:
                return matched_province
        return None
    
    def _check_district(self, input_text, province_name):
        """Check if the district name is valid."""
        # Finally, try province match
        possible_names = self._generate_possible_names(input_text)
        district_list_temp = [i for i in self.locations if i["name"] == province_name]
        district_list = district_list_temp[0].get("districts", [])
        district_names = [d["name"] for d in district_list]
        for possible_name in possible_names:
            matched_district = self._find_best_match(possible_name, district_names)
            if matched_district:
                return matched_district
        return None

In [21]:
validator = LocationValidator()
validator.locations


[{'id': 1,
  'name': 'Koshi Province',
  'area_sq_km': '25906',
  'website': 'https://koshi.gov.np/',
  'headquarter': 'Biratnagar',
  'districts': [{'id': 1,
    'province_id': 1,
    'name': 'bhojpur',
    'area_sq_km': '1507',
    'website': 'https://www.ddcbhojpur.gov.np',
    'headquarter': 'Bhojpur',
    'municipalities': [{'id': 1,
      'district_id': 1,
      'category_id': 3,
      'name': 'shadanand municipality',
      'area_sq_km': '241.15',
      'website': 'http://www.shadanandamun.gov.np/',
      'wards': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]},
     {'id': 2,
      'district_id': 1,
      'category_id': 3,
      'name': 'bhojpur municipality',
      'area_sq_km': '159.51',
      'website': 'http://www.bhojpurmun.gov.np/',
      'wards': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]},
     {'id': 3,
      'district_id': 1,
      'category_id': 4,
      'name': 'hatuwagadhi rural municipality',
      'area_sq_km': '142.61',
      'website': 'http://www.hatuwagadhimun.

In [3]:


def clean_locations(input_file_path, output_file_path):
    """
    Clean and standardize the location data structure.
    Ensures districts and municipalities are always lists with consistent structure.
    
    Args:
        input_file_path (str): Path to the input JSON file
        output_file_path (str): Path to save the cleaned JSON file
    """
    try:
        ic(input_file_path)
        # Read the input file
        with open(input_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        cleaned_data = []
        
        # Process each province
        for province in data:
            cleaned_province = {
                "id": province.get("id"),
                "name": province.get("name").title(),
                "area_sq_km": province.get("area_sq_km", ""),
                "website": province.get("website", ""),
                "headquarter": province.get("headquarter", ""),
                "districts": []
            }
            if "np" in input_file_path:
                if "." in cleaned_province["name"]:
                    print(f"LocationValidator: Old Nepali Province: {province['name']}")
                    cleaned_province["name"] = DIC_LOCATION_MAPPING[province["name"]]["new_nepali"]
                    print(f"######## LocationValidator: New Nepali Province: {cleaned_province['name']}")
                if cleaned_province["name"] in DIC_LOCATION_MAPPING.keys():
                    print(f"province name was already the new nepali name {cleaned_province['name']}")
            # Handle districts whether they're in a dict or list
            districts = []
            if isinstance(province.get("districts"), dict):
                districts = list(province["districts"].values())
            elif isinstance(province.get("districts"), list):
                districts = province["districts"]
            
            # Process each district
            for district in districts:
                if not district:  # Skip if district is None or empty
                    continue
                    
                cleaned_district = {
                    "id": district.get("id"),
                    "province_id": district.get("province_id"),
                    "name": district.get("name").title(),
                    "area_sq_km": district.get("area_sq_km", ""),
                    "website": district.get("website", ""),
                    "headquarter": district.get("headquarter", ""),
                    "municipalities": []
                }
                
                # Handle municipalities whether they're in a dict or list
                municipalities = []
                if isinstance(district.get("municipalities"), dict):
                    municipalities = list(district["municipalities"].values())
                elif isinstance(district.get("municipalities"), list):
                    municipalities = district["municipalities"]
                
                # Process each municipality
                for municipality in municipalities:
                    if not municipality:  # Skip if municipality is None or empty
                        continue
                        
                    cleaned_municipality = {
                        "id": municipality.get("id"),
                        "district_id": municipality.get("district_id"),
                        "category_id": municipality.get("category_id"),
                        "name": municipality.get("name").title(),
                        "area_sq_km": municipality.get("area_sq_km", ""),
                        "website": municipality.get("website", ""),
                        "wards": municipality.get("wards", [])
                    }
                    cleaned_district["municipalities"].append(cleaned_municipality)
                
                cleaned_province["districts"].append(cleaned_district)
            
            cleaned_data.append(cleaned_province)
        
        # Save the cleaned data
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
            
        print(f"Successfully cleaned and saved location data to {output_file_path}")
        return cleaned_data
        
    except Exception as e:
        print(f"Error cleaning locations: {str(e)}")
        raise

# Function to validate the cleaned structure
def validate_cleaned_structure(data):
    """
    Validate that the cleaned data structure is correct.
    
    Args:
        data (list): The cleaned location data
    """
    try:
        assert isinstance(data, list), "Root data must be a list"
        
        for province in data:
            assert isinstance(province, dict), "Province must be a dictionary"
            assert isinstance(province["districts"], list), f"Districts in province {province['name']} must be a list"
            
            for district in province["districts"]:
                assert isinstance(district, dict), f"District in province {province['name']} must be a dictionary"
                assert isinstance(district["municipalities"], list), f"Municipalities in district {district['name']} must be a list"
                
                for municipality in district["municipalities"]:
                    assert isinstance(municipality, dict), f"Municipality in district {district['name']} must be a dictionary"
                    assert isinstance(municipality["wards"], list), f"Wards in municipality {municipality['name']} must be a list"
        
        print("Data structure validation passed!")
        return True
        
    except AssertionError as e:
        print(f"Validation error: {str(e)}")
        return False

# DIC_LOCATION_MAPPING = {
#     "प्रदेश न. १": {
#         "new_nepali": "कोशी प्रदेश",
#         "english": "Koshi Province"
#     },
#     "प्रदेश न. २": {
#         "new_nepali": "मधेश प्रदेश",
#         "english": "Madhesh Province"
#     },
#     "प्रदेश न. ३": {
#         "new_nepali": "बागमती प्रदेश",
#         "english": "Bagmati Province"
#     },
#     "प्रदेश न. ४": {
#         "new_nepali": "गण्डकी प्रदेश",
#         "english": "Gandaki Province"
#     },
#     "प्रदेश न. ५": {
#         "new_nepali": "लुम्बिनी प्रदेश",
#         "english": "Lumbini Province"
#     },
#     "प्रदेश न. ६": {
#         "new_nepali": "कर्णाली प्रदेश",
#         "english": "Karnali Province"
#     },
#     "प्रदेश न. ७": {
#         "new_nepali": "सुदूरपश्चिम प्रदेश",
#         "english": "Sudurpashchim Province"
#     }
# }

# Example usage
if __name__ == "__main__":
    input_path_en = "/home/ubuntu/nepal_chatbot/resources/location_dataset/en.json"
    output_path_en = "/home/ubuntu/nepal_chatbot/resources/location_dataset/en_cleaned.json"
    input_path_ne = "/home/ubuntu/nepal_chatbot/resources/location_dataset/np.json"
    output_path_ne = "/home/ubuntu/nepal_chatbot/resources/location_dataset/ne_cleaned.json"
    
    # Clean the data
    cleaned_data = clean_locations(input_path_en, output_path_en)
    cleaned_data_ne = clean_locations(input_path_ne, output_path_ne)
    
    # Validate the cleaned structure
    if cleaned_data:
        validate_cleaned_structure(cleaned_data)

ic| input_file_path: '/home/ubuntu/nepal_chatbot/resources/location_dataset/en.json'
ic| input_file_path: '/home/ubuntu/nepal_chatbot/resources/location_dataset/np.json'


Successfully cleaned and saved location data to /home/ubuntu/nepal_chatbot/resources/location_dataset/en_cleaned.json
LocationValidator: Old Nepali Province: प्रदेश न. १
######## LocationValidator: New Nepali Province: कोशी
LocationValidator: Old Nepali Province: प्रदेश न. २
######## LocationValidator: New Nepali Province: मधेश
Successfully cleaned and saved location data to /home/ubuntu/nepal_chatbot/resources/location_dataset/ne_cleaned.json
Data structure validation passed!


In [5]:
str_gr = '🔍 **Grievance ID:** {grievance_id}'
k = 'grievance_id'
str_format = str_gr.format(k='GR202503272449A9')
print(str_format)

KeyError: 'grievance_id'