In [1]:
import sys
import os
import json
import pandas as pd

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

# Now you can import the IntentClassifier from src/infer_intent.py
from src.infer_location import LocationFinder
from tqdm import tqdm
import pandas as pd

In [2]:
cls = LocationFinder()

Downloading ONNX model...
ONNX model downloaded.


#### NER validate the generated data with pre and post modifiers

In [3]:
tqdm.pandas()

In [4]:
ner_val_generated_data = pd.read_csv("../data/named_entity_val_generated_data.csv")
ner_val_generated_data['city'] = ner_val_generated_data['city'].fillna('')
ner_val_generated_data['state'] = ner_val_generated_data['state'].fillna('')

city_state_preds = ner_val_generated_data['queries'].progress_apply(lambda query: cls.find_location(query))
ner_val_generated_data['city_pred'] = city_state_preds.apply(lambda cs: cs.get('city', '')).fillna('')
ner_val_generated_data['state_pred'] = city_state_preds.apply(lambda cs: cs.get('state', '')).fillna('')

100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 77.26it/s]


In [5]:
## Prepare the same file in JSON format for FX ML inference
queries_array = ner_val_generated_data['queries'].values

# Create a dictionary structure
data = {
    "queries": queries_array.tolist()  
}

# Define the output JSON file path
output_file_path = "../data/named_entity_val_generated_data.json"

## Uncomment to write it else its almost static dataset
# Write to a JSON file
with open(output_file_path, "w") as json_file:
    json.dump(data, json_file, indent=2)


In [6]:
ner_val_generated_data

Unnamed: 0,queries,city,state,city_pred,state_pred
0,train stations near detroit,detroit,,detroit,
1,voter registration in wa,,wa,,wa
2,"weather report for los angeles, california",los angeles,california,los angeles,california
3,city hall in boston,boston,,boston,
4,"bike rentals in baltimore, maryland",baltimore,maryland,baltimore,maryland
...,...,...,...,...,...
995,tax offices in las vegas,las vegas,,las vegas,
996,"homes for rent in baltimore, maryland",baltimore,maryland,baltimore,maryland
997,"top universities in san diego, california",san diego,california,san diego,california
998,"top universities in tampa, florida",tampa,florida,tampa,florida


#### Partial city & state matches

In [7]:
partial_match_rate = float(((ner_val_generated_data['city'] == ner_val_generated_data['city_pred']) | 
(ner_val_generated_data['state'] == ner_val_generated_data['state_pred'])).value_counts(normalize=True)[True])

print(f"partial NER (City / state) accuracy = {partial_match_rate}")

partial NER (City / state) accuracy = 0.984


#### Full city & state matches (using Python)

In [8]:
full_match_rate = float(((ner_val_generated_data['city'] == ner_val_generated_data['city_pred']) & 
(ner_val_generated_data['state'] == ner_val_generated_data['state_pred'])).value_counts(normalize=True)[True])

print(f"NER accuracy = {full_match_rate}")

NER accuracy = 0.892


In [9]:
# NER accuracy = 0.9

In [10]:
ner_val_generated_data.loc[
~((ner_val_generated_data['city'] == ner_val_generated_data['city_pred']) & 
(ner_val_generated_data['state'] == ner_val_generated_data['state_pred']))
]

Unnamed: 0,queries,city,state,city_pred,state_pred
10,minnesota high school rankings,,minnesota,,
17,georgia high school rankings,,georgia,,
19,"pool cleaning services in st. louis, missouri",st. louis,missouri,st . louis,missouri
21,ca dmv locations,,ca,,
26,portland public library hours,portland,,,
...,...,...,...,...,...
965,texas high school rankings,,texas,,
968,new york housing prices,,new york,new york,
969,mi dmv locations,,mi,d,
975,mental health services in new york,,new york,new,


#### Full city & state matches (using Fx ML js)

In [11]:
# ML_output_NER_VAL_DATA.json
fx_ml_ner_val = pd.read_json("../data/ML_output_NER_VAL_DATA.json")
fx_ml_ner_val = fx_ml_ner_val.rename(columns={'query': 'queries', 'city': 'city_pred', 'state': 'state_pred', 'intent': 'intent_pred'})
fx_ml_ner_val['city_pred'] = fx_ml_ner_val['city_pred'].fillna('')
fx_ml_ner_val['state_pred'] = fx_ml_ner_val['state_pred'].fillna('')
len(fx_ml_ner_val)

1000

In [12]:
fx_ml_ner_val

Unnamed: 0,queries,intent_pred,city_pred,state_pred
0,train stations near detroit,yelp_intent,detroit,
1,voter registration in wa,yelp_intent,,wa
2,"weather report for los angeles, california",weather_intent,los angeles,california
3,city hall in boston,yelp_intent,boston,
4,"bike rentals in baltimore, maryland",yelp_intent,baltimore,maryland
...,...,...,...,...
995,tax offices in las vegas,yelp_intent,las vegas,
996,"homes for rent in baltimore, maryland",yelp_intent,baltimore,maryland
997,"top universities in san diego, california",yelp_intent,san diego,california
998,"top universities in tampa, florida",yelp_intent,tampa,florida


In [13]:
ner_fx_results = pd.merge(ner_val_generated_data[['queries', 'city', 'state']], fx_ml_ner_val, on='queries', how='left')
ner_fx_results

Unnamed: 0,queries,city,state,intent_pred,city_pred,state_pred
0,train stations near detroit,detroit,,yelp_intent,detroit,
1,voter registration in wa,,wa,yelp_intent,,wa
2,"weather report for los angeles, california",los angeles,california,weather_intent,los angeles,california
3,city hall in boston,boston,,yelp_intent,boston,
4,"bike rentals in baltimore, maryland",baltimore,maryland,yelp_intent,baltimore,maryland
...,...,...,...,...,...,...
1053,tax offices in las vegas,las vegas,,yelp_intent,las vegas,
1054,"homes for rent in baltimore, maryland",baltimore,maryland,yelp_intent,baltimore,maryland
1055,"top universities in san diego, california",san diego,california,yelp_intent,san diego,california
1056,"top universities in tampa, florida",tampa,florida,yelp_intent,tampa,florida


In [14]:
fx_ml_full_match_rate = float(((ner_fx_results['city'] == ner_fx_results['city_pred']) & 
(ner_fx_results['state'] == ner_fx_results['state_pred'])).value_counts(normalize=True)[True])

print(f"Fx ML NER accuracy = {fx_ml_full_match_rate}")

Fx ML NER accuracy = 0.9073724007561437


In [None]:
# Fx ML NER accuracy = 0.9073724007561437

In [15]:
ner_fx_results.loc[
~((ner_fx_results['city'] == ner_fx_results['city_pred']) & 
(ner_fx_results['state'] == ner_fx_results['state_pred']))
]

Unnamed: 0,queries,city,state,intent_pred,city_pred,state_pred
10,minnesota high school rankings,,minnesota,yelp_intent,,
13,tx dmv locations,,tx,yelp_intent,d,
19,"pool cleaning services in st. louis, missouri",st. louis,missouri,yelp_intent,st . louis,missouri
21,ca dmv locations,,ca,yelp_intent,d,
22,ca dmv locations,,ca,yelp_intent,d,
...,...,...,...,...,...,...
1020,philadelphia marathon registration,philadelphia,,yelp_intent,,
1023,texas high school rankings,,texas,yelp_intent,,
1026,new york housing prices,,new york,yelp_intent,york,new
1045,current weather in new york,new york,,weather_intent,new,york
