# Eval: Data Quality

### Imports and API Setup

In [47]:
import os
import json
import cohere
import langchain_cohere
import pandas as pd

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
# initialize Cohere client
co = cohere.Client(api_key=os.getenv("COHERE_API_KEY"), log_warning_experimental_features=False)

### Data Prep

In [10]:
SAMPLES = [
    "business_name:Jessica Jensen,  address:Gladstone Ave, Toronto ON  ca , currency:cad emails: jessica@jensennagle.com",
    "business_name:Elizabeth Inwood,  address:14 Cadeau Terrace,Unit 20 London ON N6K 4X5 CA , currency:cad emails: eainwood@hotmail.ca",
    "business_name:Nadezhda Androncik,  address:120 Homewood Ave, Toronto ON M4y2j3 CA , currency:cad emails: ",
]

In [48]:
# prompts
PREAMBLE = \
"""
You are tasked with cleaning and processing customer data, primarily from users in Canada (mainly Ontario) and the US, specifically in the food or beverage industry. Follow these steps:

1. **Postal Code:** Extract and validate the postal code, which may be in US ZIP code or Canadian postal code format. Return an empty string if the postal code is invalid or missing.
2. **Street Address:** Extract and clean the street address without any unneccasary commas.
3. **Province/State:** Extract the province or state, ensuring it matches ISO 3166-2 codes. Return an empty string if it doesn’t match.
4. **City:** Extract the city name.
5. **Business Information:** Extract the business name and business identification number. Ensure the business ID follows the government’s registration format (digits only, no symbols like #). Return an empty string if not found.
6. **Domain Extraction:** Extract a list of domains related to the user from their emails, removing any public email domains (example: if username@gmail.com only extract, gmail.com). If no non-public domains remain, return an empty string.
7. **Country:** Ensure the country value matches ISO 3166-1 alpha-2 codes, or return an empty string.
8. **Phone Numbers:** Drop any ten-digit numbers that resemble phone numbers to avoid confusing them with the business ID.
9. **Data Consistency:** Ensure all extracted information is accurate and consistent with the input data.

Return the cleaned and processed data in the specified JSON format.
"""

PROMPT = \
"""
"I have customer data that needs to be cleaned and processed. The data is primarily from users in Canada (mostly Ontario) and the US, specifically those in the food or beverage industry. 

## Instructions
Here's what I need:
1. Extract and validate the postal code, which may be in US ZIP code format (example: 12345 or 12345-1234) or Canadian postal code format (example: M1W G8L). If the postal code is invalid or missing, return null.
2. Extract and clean the street address.
3. Extract the province or state (example: for provinces, ON or Ontario and for US states, AL or Alabama), ensuring it matches ISO 3166-2 codes. Return an empty string if it doesn’t match.
4. Extract the city name.
5. Extract the business name and business identification number, ensuring it follows the government’s registration format (digits only, without symbols like #). If not found, set it as an empty string.
6. Extract a list of domains related to the user from their emails. Remove any public email domains like gmail.com, hotmail.com, etc. If no non-public domains are left, return an empty string.
7. Ensure the country value matches ISO 3166-1 alpha-2 codes or return an empty string (example: CA for Canada).
8. Drop any ten-digit numbers that resemble phone numbers to avoid confusing them with the business ID.
9. Ensure all extracted information is accurate and consistent with the input data.

## Customer data
{sample}

"""

JSON_PROMPT = \
"""
## Json structure
Please provide the cleaned and processed data in the following JSON format:
{
  "business_name": "business name",
  "business_identification_number": "business identification number",
  "street_address": "clean street address in format of street number, street name, extra like unit or suite number",
  "province_state": "province or state from ISO 3166-2 or empty string",
  "city": "<city>",
  "postal_code": "postal code or zip code",
  "domains_related": [domain1.com, domain2.com, ...],
  "country": "country in ISO 3166-1 alpha-2 or empty string",
  "flag": "certain, likely, cannot extract"
}
"""

RESPONSE_FORMAT = {
    "type": "json_object",
    "scheme": {
        "type": "object",
        "required": [
            "business_name",
            "business_identification_number",
            "street_address",
            "province_state",
            "city",
            "postal_code",
            "domains_related",
            "country",
            "flag",
        ],
        "properties": {
            "business_name": { "type": "string" },
            "business_identification_number": { "type": "integer" },
            "street_address": { "type": "string" },
            "province_state": { "type": "string" },
            "city": { "type": "string" },
            "postal_code": { "type": "string" },
            "domains_related": { "type": "array" },
            "country": { "type": "string" },
            "flag": { "type": "string" },
        }
    }
}


### Testing with Samples

In [49]:
# generate on samples
for sample in SAMPLES:
    print(f"Sample: {sample}")
    print("JSON:")
    params = {
        "message": PROMPT.format(sample=sample) + JSON_PROMPT,
        "model": "command-r",
        "preamble": PREAMBLE,
        "temperature": 0.0,
        "response_format": RESPONSE_FORMAT,
    }
    resp = co.chat(
        **params
    ).text
    print(resp)
    print("\n")

Sample: business_name:Jessica Jensen,  address:Gladstone Ave, Toronto ON  ca , currency:cad emails: jessica@jensennagle.com
JSON:
{
  "business_name": "Jessica Jensen",
  "business_identification_number": "",
  "street_address": "Gladstone Ave",
  "province_state": "ON",
  "city": "Toronto",
  "postal_code": "M1W G8L",
  "domains_related": ["jensennagle.com"],
  "country": "CA",
  "flag": "certain"
}


Sample: business_name:Elizabeth Inwood,  address:14 Cadeau Terrace,Unit 20 London ON N6K 4X5 CA , currency:cad emails: eainwood@hotmail.ca
JSON:
{
  "business_name": "Elizabeth Inwood",
  "business_identification_number": "",
  "street_address": "14 Cadeau Terrace Unit 20",
  "province_state": "ON",
  "city": "London",
  "postal_code": "N6K 4X5",
  "domains_related": ["hotmail.ca"],
  "country": "CA",
  "flag": "certain"
}


Sample: business_name:Nadezhda Androncik,  address:120 Homewood Ave, Toronto ON M4y2j3 CA , currency:cad emails: 
JSON:
{
  "business_name": "Nadezhda Androncik",
  

## Evaluator Setup

In [52]:
class Evaluator(object):
    def __init__(self):
        self.sample_record = {
            "business_name": str,
            "business_identification_number": str,
            "street_address": str,
            "province_state": str,
            "city": str,
            "postal_code": str,
            "domains_related": list,
            "country": str,
            "flag": str
        }

    def _match_record(self, key: str, sample: dict, golden_sample: dict) -> int:
        # print(sample[key], golden_sample[key])
        # print(key)
        return 1 if sample[key] == golden_sample[key] else 0
    
    def _generate_samples(self, prompts) -> list[dict]:
        samples = []
        for prompt in prompts:
            params = {
                "message": PROMPT.format(sample=prompt) + JSON_PROMPT,
                "model": "command-r",
                "preamble": PREAMBLE,
                "temperature": 0.0,
                "response_format": RESPONSE_FORMAT,
            }
            resp = co.chat(
                **params
            ).text
            resp = json.loads(resp)
            samples.append(resp)
        return samples
    
    def run_eval(self, prompts: list, golden_samples: list[dict]):
        eval_dict = {
            key: [] for key in self.sample_record.keys()
        }
        eval_dict["all_keys_ext"] = []

        samples = self._generate_samples(prompts)

        for sample, golden_sample in zip(samples, golden_samples):
            for key in self.sample_record:
                eval_dict[key].append(self._match_record(key, sample, golden_sample))
            all_keys_ext_check = 1 if self.sample_record.keys() == sample.keys() else 0
            eval_dict["all_keys_ext"].append(all_keys_ext_check)
        
        df = pd.DataFrame(eval_dict, columns=eval_dict.keys())
        df["total_score"] = df.sum(axis=1, numeric_only=True) / len(eval_dict.keys())

        return df


In [53]:
prompts = [
    "business_name:Jessica Jensen,  address:Gladstone Ave, Toronto ON  ca , currency:cad emails: jessica@jensennagle.com",
    "business_name:Elizabeth Inwood,  address:14 Cadeau Terrace,Unit 20 London ON N6K 4X5 CA , currency:cad emails: eainwood@hotmail.ca",
    "business_name:Nadezhda Androncik,  address:120 Homewood Ave, Toronto ON M4y2j3 CA , currency:cad emails: ",
]
golden_samples = [
    {
        "business_name": "Jessica Jensen",
        "business_identification_number": "",
        "street_address": "Gladstone Ave",
        "province_state": "ON",
        "city": "Toronto",
        "postal_code": "M1W G8L",
        "domains_related": ["jensennagle.com"],
        "country": "CA",
        "flag": "certain"
    },
    {
        "business_name": "Elizabeth Inwood",
        "business_identification_number": "",
        "street_address": "14 Cadeau Terrace Unit 20",
        "province_state": "ON",
        "city": "London",
        "postal_code": "N6K 4X5",
        "domains_related": ["hotmail.ca"],
        "country": "CA",
        "flag": "certain"
    },
    {
    "business_name": "Nadezhda Androncik",
    "business_identification_number": "",
    "street_address": "120 Homewood Ave",
    "province_state": "ON",
    "city": "Toronto",
    "postal_code": "M4Y 2J3",
    "domains_related": [],
    "country": "CA",
    "flag": "certain"
    },
]

eval = Evaluator()
df = eval.run_eval(prompts, golden_samples)

### Eval Results Table

In [54]:
df

Unnamed: 0,business_name,business_identification_number,street_address,province_state,city,postal_code,domains_related,country,flag,all_keys_ext,total_score
0,1,1,1,1,1,1,1,1,1,1,1.0
1,1,1,1,1,1,1,1,1,1,1,1.0
2,1,1,1,1,1,1,1,1,1,1,1.0
