Purpose of the notebook:

Evaluate the current NER approach, which uses existing models supported by the Transformers.js library,
and see where it fails.
Working from the hypothesis that a classifier-based approach might be better, we then prepare and label the data
(https://www.microsoft.com/en-us/download/details.aspx?id=58227)

In [None]:
## imports

from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from pprint import pprint

#### Some examples of where the NER based approach is failing

In [None]:
# Zero-shot classifier that scores every query against the candidate intent descriptions.
# Alternative checkpoint that was also tried:
# classifier = pipeline("zero-shot-classification", model='typeform/distilbert-base-uncased-mnli', device='mps')
classifier = pipeline(
    "zero-shot-classification",
    model='typeform/mobilebert-uncased-mnli',
    device='cpu',
)

# Sample search queries to classify.
texts = [
    "what is democracy",
    "restaurants in oakville",
    "buy iphone",
    "bank login",
    "temperature in San Jose",
    "wood floor buckling repair",
    "wood floor cost estimator",
    "panera bread menu price",
    "how much is hbo now subscription",
    "how much is a golden retriever puppy",
    "how much is nebraska's sales tax",
    "how much is donald trump jr worth",
    "how much is a liposuction",
    "does mushroom cause food allergy",
]

# Earlier wording of the intent descriptions, kept for reference:
# intent_labels_lkp = {
#     "yelp_intent": "find a local service for home repair, cost estimation, or maintenance",
#     "information_intent": "search for general knowledge not related to weather, services, or products",
#     "weather_intent": "check weather conditions like forecast, temperature, radar, storms, or pollen",
#     "purchase_intent": "make an online purchase",
#     "navigation_intent": "navigate to a specific website"
# }
#
# Another variant that passed bare labels instead of a lookup:
# intent_labels = [
#     "web navigational intent",
#     "informational intent for learners",
#     "local business or services search intent",
#     "product purchase action intent",
#     "related to weather conditions or forecast or radar"
# ]

# Intent key -> natural-language description handed to the classifier as a candidate label.
intent_labels_lkp = dict(
    yelp_intent="search for local service, food, home repair, maintenance, cost estimation excluding weather intents",
    # yelp_intent="to discover, connect and transact with local businesses",
    information_intent="search for general knowledge what some concept is and not related to weather, services, or products",
    weather_intent="check weather conditions like forecast, temperature, radar, storms, or pollen",
    purchase_intent="make an online purchase",
    navigation_intent="navigate to a specific website",
)

# Reverse lookup: description -> intent key.
intent_desc_lkp = dict(zip(intent_labels_lkp.values(), intent_labels_lkp.keys()))

# Candidate labels, in the same order as the lookup above.
intent_labels = list(intent_labels_lkp.values())

# Score every sample query against every candidate label.
result = classifier(texts, candidate_labels=intent_labels)
# pprint(result)


In [None]:
# result_df = pd.DataFrame(result)
def prepare_df_from_reesult(result):
    """Turn zero-shot classification output into one row per query.

    Args:
        result: iterable of dicts, each with a 'sequence' entry plus parallel
            'labels' and 'scores' lists.

    Returns:
        A DataFrame with a 'sequence' column and one score column per intent.
        Columns are named by intent key: each label is translated through the
        module-level `intent_desc_lkp` lookup.
    """
    rows = [
        {
            'sequence': entry['sequence'],
            **{
                intent_desc_lkp[description]: probability
                for description, probability in zip(entry['labels'], entry['scores'])
            },
        }
        for entry in result
    ]
    return pd.DataFrame(rows)

updated_result_df = prepare_df_from_reesult(result)

In [None]:
updated_result_df

Some of the above results are a bit unclear. For example, `does mushroom cause food allergy` is more of an information intent than a yelp intent.
There were many other cases which showed that NER alone may not be suitable for this problem. We need to solve the intent classification problem for this use case.

#### Marco data

This dataset can be downloaded from https://www.microsoft.com/en-us/download/details.aspx?id=58227

In [None]:
# Read the whole TSV and split it into raw rows.
with open("../data/full_marco_sessions_ann_split.train.tsv", "r") as f:
    marco_texts = f.read().split('\n')

# Every tab-separated field counts as a query, except fields containing
# "marco-gen-train" and fields shorter than 3 characters. Queries are
# lower-cased and de-duplicated through the set.
marco_text_queries = {
    field.lower()
    for row in marco_texts
    for field in row.split("\t")
    if "marco-gen-train" not in field and len(field) >= 3
}

marco_text_queries_list = list(marco_text_queries)

In [None]:
len(marco_text_queries_list)

In [None]:
## some example queries

marco_text_queries_list[:50]

In [None]:
marco_df = pd.DataFrame({"sequence": marco_text_queries_list})

In [None]:
def labeling_stats(df):
    """Print how much of `df` has been labeled so far.

    If `df` has no 'target' column yet, one is added in place (filled with
    None) so that every row counts as unlabeled.

    Args:
        df: DataFrame of queries; its 'target' column holds the intent label,
            with missing values marking rows that are still unlabeled.
    """
    # Check the frame that was passed in, not the notebook-level `marco_df`.
    if 'target' not in df.columns:
        df['target'] = None

    unlabeled = df['target'].isna()
    print(f"Size of the dataset = {len(df)}")
    print(f"Number of examples to be labeled = {unlabeled.sum()}")
    print(f"Number of examples labeled = {(~unlabeled).sum()}")
    print("Labels distributed as \n", df['target'].value_counts())


## Prints labeling stats
labeling_stats(marco_df)

#### Find potential ngram mappings for targets

In [None]:
from collections import Counter
from itertools import islice

def extract_ngrams(query, n):
    """Return the word n-grams of `query` as space-joined strings.

    The query is split on whitespace; a query with fewer than `n` words
    yields an empty list.
    """
    tokens = query.split()
    # Zipping n copies of the token list, each shifted by one more position,
    # lines up every run of n consecutive words.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return [' '.join(window) for window in windows]


def count_ngrams(queries_list, n):
    """Count how often each word n-gram occurs across all queries.

    Returns a `Counter` mapping n-gram string -> number of occurrences.
    """
    ngram_counter = Counter()
    for query in queries_list:
        ngram_counter.update(extract_ngrams(query, n))
    return ngram_counter



In [None]:
def search_queries_by_words(search_text, to_be_labelled_sequence_list):
    """Lazily yield each query that contains `search_text` as a substring.

    Queries are yielded in their original order; nothing is evaluated until
    the generator is iterated.
    """
    yield from (
        candidate
        for candidate in to_be_labelled_sequence_list
        if search_text in candidate
    )

In [None]:
# Preview up to 100 queries that contain " recipe", numbered from 1.
recipe_matches = search_queries_by_words(" recipe", marco_text_queries_list)
cnt = 0
for query in recipe_matches:
    if cnt >= 100:  # Stop once 100 matches have been printed
        break
    cnt += 1
    print(cnt, query)

In [None]:

# Substring -> intent label rules used to bootstrap labels for the queries.
#
# How the rules behave when this dict is passed to `apply_target_mapping`:
#   * a rule fires when its key occurs anywhere in the query (plain substring
#     test), so leading/trailing spaces inside a key are significant;
#   * rules are tried in the order written here and a query keeps the label of
#     the first rule that matches it, so a more specific key only has an effect
#     if it appears before any broader key that also matches.
#
# NOTE(review): several keys are repeated (e.g. 'restaurant', 'can you',
# 'ways to'). In a dict literal a repeated key keeps its first position and
# takes the last value given; the repeats here all carry the same value.
target_mapping = {
    'how do': 'information_intent',
    'weather in': 'weather_intent',
    'the weather': 'weather_intent',
    'hurricane': 'information_intent',
    'tornado': 'weather_intent',
    'current temperature': 'weather_intent',
    'current weather': 'weather_intent',
    'weather forecast in': 'weather_intent',
    'temperature in': 'weather_intent',
    # 'how much': 'purchase_intent', 
    # 'cost to': 'purchase_intent',
    # 'where is': 'navigation_intent', 
    'routing number': 'navigation_intent',
    'sign in ': 'navigation_intent',
    'signin ': 'navigation_intent',
    'login ': 'navigation_intent',
    'phone number': 'yelp_intent', 
    'customer service': 'yelp_intent',
    'what are': 'information_intent',
    'what county is': 'information_intent',
    'what is a ': 'information_intent',
    # 'what is': 'information_intent',
    'what does': 'information_intent',
    'what do': 'information_intent',
    'definition of': 'information_intent',
    'meaning': 'information_intent',
    'symptoms': 'yelp_intent',
    'zip code': 'information_intent',
    'zipcode': 'information_intent',
    'postal code': 'information_intent',
    'postalcode': 'information_intent',
    'area code': 'information_intent',
    'areacode': 'information_intent',
    'definition': 'information_intent',
    'define': 'information_intent',
    'what is the difference between': 'information_intent',
    'what is the purpose of': 'information_intent',
    'what is the function of': 'information_intent',
    'how long does it take': 'information_intent',
    'what is the name of': 'information_intent',
    'what is the population of': 'information_intent',
    'what is an example of': 'information_intent',
    'which of the following': 'information_intent',
    'what is the purpose': 'information_intent',
    # 'what time zone is': 'information_intent',
    'what is the average': 'information_intent',
    'is in what county': 'information_intent',
    'calories in': 'information_intent',
    # 'how many calories in': 'information_intent',
    "causes of": 'information_intent',
    'visit': 'travel_intent',
    'travel to': 'travel_intent',
    'cruise': 'travel_intent',
    'tours': 'travel_intent',
    'mortgage rate': 'yelp_intent',
    'interest rate': 'yelp_intent',
    'price of': 'purchase_intent',
    'cost of living': 'information_intent',
    'does it cost': 'yelp_intent', 
    # 'what is the current': ?
    'what is the largest': 'information_intent',
    'what is the currency': 'information_intent',
    'how old do you': 'information_intent',
    'how long does a': 'information_intent',
    # 'what time is it': 'information_intent',
    'what time': 'information_intent',
    'you have to be': 'information_intent',
    'do you need to': 'information_intent',
    'what is considered a': 'information_intent',
    'dialing code': 'information_intent',
    'side effects': 'information_intent',
    'stock market': 'information_intent',
    'how many calories': 'information_intent',
    'average salary for': 'information_intent',
    'how many grams': 'information_intent',
    'what foods are': 'information_intent',
    'how many ounces': 'information_intent',
    'how many carbs': 'information_intent',
    'what year was': 'information_intent',
    'how old is': 'information_intent',
    'how much is': 'information_intent',
    'what type of': 'information_intent',
    'how do i': 'information_intent',
    'what kind of': 'information_intent',
    'who is the': 'information_intent',
    'where is the': 'information_intent',
    # 'different types of': 'information_intent',
    'types': 'information_intent',
    'what is': 'information_intent',
    'how do you': 'information_intent',
    'what was the': 'information_intent',
    'in the world': 'information_intent',
    'how long is': 'information_intent',
    'when was': 'information_intent',
    'when did': 'information_intent',
    'how far is': 'information_intent',
    'how tall is': 'information_intent',
    'what to do': 'information_intent',
    'how long': 'information_intent',
    'types of': 'information_intent',
    'who is': 'information_intent',
    'where is': 'information_intent',
    'what causes': 'information_intent',
    'stock price': 'information_intent',
    'difference between': 'information_intent',
    'social security': 'information_intent',
    'who was': 'information_intent',
    'net worth': 'information_intent',
    'cast of': 'information_intent',
    'how many': 'information_intent',
    'how does': 'information_intent',
    'how is': 'information_intent',
    'what did': 'information_intent',
    'good for': 'information_intent',
    'population of': 'information_intent',
    'can you': 'information_intent',
    'what can': 'information_intent',
    'how big': 'information_intent',
    'what size': 'information_intent',
    'average salary of': 'information_intent',
    'what year': 'information_intent',
    'part of': 'information_intent',
    'another word': 'information_intent',
    'who invented': 'information_intent',
    'what can you': 'information_intent',
    'how much money': 'information_intent',
    'what size': 'information_intent',
    'what state': 'information_intent',
    'what county': 'information_intent',
    'in the us': 'information_intent',
    'how old': 'information_intent',
    'icd code': 'information_intent',
    'what city': 'information_intent',
    'can you': 'information_intent',
    'can i': 'information_intent',
    'when is': 'information_intent',
    'how did': 'information_intent',
    'what can': 'information_intent',
    'what to': 'information_intent',
    'the same': 'information_intent',
    "cleaning ": 'yelp_intent',
    'restaurant': 'yelp_intent',
    'recommendation': 'yelp_intent',
    'repair': 'yelp_intent',
    'parking': 'yelp_intent',
    'oil change': 'yelp_intent',
    ' rental': 'yelp_intent',
    'auto ': 'yelp_intent',
    'dry clean': 'yelp_intent',
    'club': 'yelp_intent',
    'hotel': 'yelp_intent',
    'stores': 'yelp_intent',
    'shopping': 'yelp_intent',
    ' shop ': 'yelp_intent',
    ' shops ': 'yelp_intent',
    ' mall ': 'yelp_intent',
    'furniture': 'yelp_intent',
    'crafts': 'yelp_intent',
    'clothing': 'yelp_intent',
    'benefits of': 'yelp_intent',
    'average cost': 'yelp_intent',
    'cost to install': 'yelp_intent',
    'contact number': 'yelp_intent',
    'what airport': 'travel_intent',
    # 'flight': 'travel_intent',
    'cost for': 'yelp_intent',
    'do you': 'information_intent',
    'when does': 'information_intent',
    'do you': 'information_intent',
    'why is': 'information_intent',
    "what's the": 'information_intent',
    'what was': 'information_intent',
    'what language': 'information_intent',
    'should i': 'information_intent',
    'convert': 'information_intent',
    'medication': 'yelp_intent',
    'treatment': 'yelp_intent',
    'tv show': 'information_intent',
    'history': 'information_intent',
    'remedies': 'information_intent',
    'county is': 'information_intent',
    'synonym ': 'information_intent',
    'credit union': 'yelp_intent',
    'movie cast': 'information_intent',
    'average salary': 'information_intent',
    'example': 'information_intent',
    'blood pressure': 'information_intent',
    'credit card': 'navigation_intent',
    'time zone': 'information_intent',
    'time in': 'information_intent',
    'foods that': 'information_intent',
    'salary for': 'information_intent',
    "weather": 'weather_intent',
    "forecast": 'weather_intent',
    "windy": 'weather_intent',
    "humidity": 'weather_intent',
    "monsoon": 'weather_intent',
    "flooding": 'weather_intent',
    "rain in": 'weather_intent',
    "storms": 'weather_intent',
    "storm in": 'weather_intent',
    "forcast": 'weather_intent',
    "wether": 'weather_intent',
    "wather": 'weather_intent',
    "weahter": 'weather_intent',
    "weater": 'weather_intent',
    "weaher": 'weather_intent',
    " vindy ": 'weather_intent',
    " sunny ": 'weather_intent',
    " rain ": 'weather_intent',
    "windy": 'weather_intent',
    "cloudy": 'weather_intent',
    "storms": 'weather_intent',
    "air quality": 'weather_intent',
    "thunderstorm": 'weather_intent',
    "pollen": 'weather_intent',
    "snow": 'weather_intent',
    "blizzard": 'weather_intent',
    "radar": 'weather_intent',
    "tiempo": 'weather_intent',
    "clima": 'weather_intent',
    "doppler radar": 'weather_intent',
    "local radar": 'weather_intent',
    "local weather": 'weather_intent',
    # "map": 'weather_intent',
    "us weather radar": 'weather_intent',
    "weather radar near me": 'weather_intent',
    "radar near me": 'weather_intent',
    'salary': 'information_intent',
    'cost to build': 'yelp_intent',
    'icd ': 'information_intent',
    'how often': 'information_intent',
    'get rid of': 'information_intent',
    'university of': 'navigation_intent',
    'windows 10': 'navigation_intent',
    'causes for': 'information_intent',
    'calculat': 'information_intent',
    'which is ': 'information_intent',
    'where are ': 'information_intent',
    'kelvin': 'information_intent',
    'celsius': 'information_intent',
    'fahrenheit': 'information_intent',
    'when ': 'information_intent',
    'benefit of': 'yelp_intent',
    'most common': 'information_intent',
    'which ': 'information_intent',
    'refers ': 'information_intent',
    'where does ': 'information_intent',
    'synonym': 'information_intent', 
    'salaries': 'information_intent', 
    'function of': 'information_intent', 
    'cause of': 'information_intent', 
    'effects of': 'information_intent', 
    'used for': 'information_intent', 
    'what color is': 'information_intent', 
    'weight loss': 'yelp_intent', 
    'where do': 'information_intent', 
    'what foods': 'information_intent', 
    'used for': 'information_intent', 
    'why': 'information_intent', 
    'age of': 'information_intent', 
    'who wrote': 'information_intent', 
    'function of': 'information_intent', 
    "what's a": 'information_intent', 
    "how fast": 'information_intent', 
    'most popular': 'information_intent', 
    'where': 'information_intent', 
    'is used': 'information_intent', 
    'doctors': 'yelp_intent', 
    'who ': 'information_intent', 
    ' hours': 'yelp_intent',
    'schedule': 'information_intent', 
    'what age': 'information_intent',
    'cheap': 'yelp_intent',
    'most expensive': 'information_intent',
    'size of': 'information_intent',
    'what exactly': 'information_intent',
    'ways to ': 'information_intent',
    'disorder': 'information_intent',
    'disease': 'information_intent',
    'felony': 'information_intent',
    'movie': 'information_intent',
    'cost of': 'yelp_intent',
    'what were': 'information_intent',
    'degree': 'information_intent',
    'what day': 'information_intent',
    'ways to': 'information_intent',
    'influen': 'information_intent',
    'importan': 'information_intent',
    'school': 'information_intent',
    'train': 'information_intent',
    'dimension': 'information_intent',
    'what makes': 'information_intent',
    'what were': 'information_intent',
    'what food': 'information_intent',
    'normal range': 'information_intent',
    'ways to': 'information_intent',
    'requirements for': 'information_intent',
    'employment': 'information_intent',
    'support number': 'navigation_intent',
    'fax number': 'navigation_intent',
    'considered a': 'information_intent',
    'distance ': 'information_intent',
    'share price': 'information_intent',
    'stock': 'information_intent',
    'channel is': 'information_intent',
    'continent': 'information_intent',
    'what level': 'information_intent',
    'english to': 'translation_intent',
    'to english': 'translation_intent',
    'translat': 'translation_intent',
    'what currency': 'information_intent',
    'blood test': 'information_intent',
    'replacement cost': 'yelp_intent',
    'how tall': 'information_intent',
    'characteristics of': 'information_intent',
    'tracking number': 'navigation_intent',
    'to replace': 'yelp_intent',
    'pay for': 'information_intent',
    'calories': 'information_intent',
    'health': 'information_intent',
    'tax': 'information_intent',
    'deadline': 'information_intent',
    'insurance': 'information_intent',
    'cancel': 'navigation_intent',
    'address': 'navigation_intent',
    # NOTE(review): never reached through apply_target_mapping - the earlier
    # 'health' rule already claims every query that contains 'healthy'.
    'healthy': 'yelp_intent',
    'diet': 'information_intent',
    'lyrics': 'information_intent',
    'iphone': 'purchase_intent',
    'cell phone': 'purchase_intent',
    'android phone': 'purchase_intent',
    'android': 'information_intent',
    'protein': 'information_intent',
    'how to': 'information_intent',
    '401k': 'information_intent',
    ' ira ': 'information_intent',
    'population': 'information_intent',
    'president': 'information_intent',
    'whats': 'information_intent',
    "what's": 'information_intent',
    'benefits': 'information_intent',
    ' pain ': 'yelp_intent',
    'installation cost': 'yelp_intent',
    'in spanish': 'translation_intent',
    'to spanish': 'translation_intent',
    'in french': 'translation_intent',
    'to french': 'translation_intent',
    'in japanese': 'translation_intent',
    'to japanese': 'translation_intent',
    'in chinese': 'translation_intent',
    'to chinese': 'translation_intent',
    'side effect': 'information_intent',
    'cost to': 'yelp_intent',
    'cost per': 'information_intent',
    'disney world': 'navigation_intent',
    'surgery cost': 'yelp_intent',
    'album': 'information_intent',
    'genre': 'information_intent',
    'much water': 'information_intent',
    'job': 'navigation_intent',
    'netflix': 'information_intent',
    'nutrient': 'information_intent',
    'amazon': 'navigation_intent',
    'music': 'information_intent',
    'caffeine': 'information_intent',
    'adoption': 'yelp_intent',
    'dogs': 'yelp_intent',
    'cats': 'yelp_intent',
    'countries': 'information_intent',
    'number of': 'information_intent',
    'related to': 'information_intent',
    'foods with': 'information_intent',
    'restaurant': 'yelp_intent',
    'cusine': 'yelp_intent',
    'italian': 'yelp_intent',
    'mediterranean': 'yelp_intent',
    'vietnamese': 'yelp_intent',
    'recipe': 'yelp_intent',
    'vegan': 'yelp_intent',
    ' veg': 'yelp_intent',
    ' meat': 'yelp_intent',
    ' spice': 'yelp_intent',
    ' beer': 'yelp_intent',
    ' wine': 'yelp_intent',
    ' fresh ': 'yelp_intent',
    'fruit': 'yelp_intent',
    'restaurant': 'yelp_intent',
    'resort': 'travel_intent',
    'attraction': 'travel_intent',
    'installation': 'yelp_intent',
    'service': 'yelp_intent',
    
}

In [None]:
def apply_target_mapping(df, target_mapping):
    """Label rows of `df` in place using substring rules.

    Rules are applied in the mapping's iteration order. A row is labeled by
    the first rule whose n-gram occurs in its 'sequence' text; later rules
    skip any text that an earlier rule already claimed during this call.

    Args:
        df: DataFrame with a 'sequence' column; the 'target' column is written.
        target_mapping: dict of n-gram substring -> intent label.
    """
    already_mapped = set()
    for ngram, intent in target_mapping.items():

        def is_new_match(text):
            return ngram in text and text not in already_mapped

        hits = df['sequence'].apply(is_new_match)
        print(f'Number of matches found for "{ngram}"  = {hits.sum()}')
        # Reported before this rule's matches are added to the set.
        print(f'size of mapped_text_set = {len(already_mapped)}')
        df.loc[hits, 'target'] = intent
        already_mapped.update(df.loc[hits, 'sequence'].values.tolist())
        print()

In [None]:
# Split the dataset into rows that still need a label and rows that have one.
# Both halves are independent copies of the original frame.
unlabelled_mask = marco_df['target'].isna()
to_be_labelled = marco_df.loc[unlabelled_mask].copy()
labelled = marco_df.loc[~unlabelled_mask].copy()

In [None]:
# Load the hand-labelled queries and keep only the rows that have a target.
manual_labelled = pd.read_csv("../data/manual_labels.csv")
has_target = manual_labelled['target'].notna()
manual_labelled = manual_labelled.loc[has_target]

print(len(manual_labelled))
print(manual_labelled['target'].value_counts())

# Lookup: query text -> manually assigned target.
manual_labelled_lkp = manual_labelled.set_index('sequence')['target'].to_dict()
manual_labelled.head()

In [None]:
def apply_manual_mapping(df, manual_labelled_lkp):
    """Overwrite targets in place for queries that have a manual label.

    Every row whose 'sequence' is a key of `manual_labelled_lkp` gets the
    looked-up value as its 'target', replacing any target already present.

    Args:
        df: DataFrame with a 'sequence' column; the 'target' column is written.
        manual_labelled_lkp: dict of query text -> manually assigned target.
    """

    def has_manual_label(text):
        return text in manual_labelled_lkp

    manual_mask = df['sequence'].apply(has_manual_label)
    print(f'Number of matches found in manual labels = {manual_mask.sum()}')
    matched_sequences = df.loc[manual_mask, 'sequence']
    df.loc[manual_mask, 'target'] = matched_sequences.map(manual_labelled_lkp)
    print()

In [None]:

# One labelling iteration: report progress, inspect frequent n-grams and sample
# matches in the still-unlabelled queries, then apply the rules and move the
# newly labelled rows from `to_be_labelled` into `labelled`.
print(f"Number of examples labeled = {len(labelled)}")
print(f"Number of examples to be labeled = {len(to_be_labelled)}")
print(f"Label stats \n{labelled['target'].value_counts()}\n")

# Most common n-grams among the unlabelled queries (candidates for new rules)
n = 2  # Change this to any n (e.g., 1 for unigrams, 3 for trigrams)
to_be_labelled_sequence_list = to_be_labelled['sequence'].values.tolist()
ngram_counter = count_ngrams(to_be_labelled_sequence_list, n)
most_common_ngrams = ngram_counter.most_common(100)

# Display the most common n-grams
print(most_common_ngrams)

# Preview up to 100 unlabelled queries containing the search text
cnt = 0
for query in search_queries_by_words("italian", to_be_labelled_sequence_list):
    if cnt >= 100:  # Stop after 100 results
        break
    print(cnt + 1, query)
    cnt += 1

# Rule-based labels first, then manual labels (which overwrite rule-based ones)
apply_target_mapping(to_be_labelled, target_mapping)
apply_manual_mapping(to_be_labelled, manual_labelled_lkp)

newly_labelled_mask = to_be_labelled['target'].notna()
labelled = pd.concat([labelled, to_be_labelled.loc[newly_labelled_mask]], axis=0)
# Take an explicit copy: this frame is written to in place by the mapping
# functions the next time this cell is run.
to_be_labelled = to_be_labelled.loc[~newly_labelled_mask].copy()
print()


#### Optional: build a special list for manual labeling (skipped by default)

In [None]:
## Only needed when a special list for the manual labeling process is wanted;
## leave SKIP_MANUAL_LABEL_PREP set to True to skip this cell.

SKIP_MANUAL_LABEL_PREP = True
if not SKIP_MANUAL_LABEL_PREP:
    special_list = set()

    cnt = 0

    # Collect the queries containing "how much" from the list built in the
    # labelling cell, looking at no more than 10000 matches.
    for query in search_queries_by_words("how much", to_be_labelled_sequence_list):
        if cnt >= 10000:  # Stop after 10000 matches
            break
        # print(cnt + 1, query)
        cnt += 1
        special_list.add(query)

    # Export as a sorted list: a set has no stable order, so sorting keeps the
    # CSV reproducible and avoids handing an unordered collection to DataFrame.
    pd.DataFrame({'sequence': sorted(special_list)}).to_csv('special_list_manual_label.csv', index=False)

In [None]:
to_be_labelled

In [None]:
labelled['target'].value_counts()

In [None]:
# Stack labelled and unlabelled rows into one frame with a fresh 0..n-1 index.
combined = pd.concat([labelled, to_be_labelled], axis=0, ignore_index=True)
print(len(combined))
combined

In [None]:
labelled['target'].value_counts()

In [None]:
labelled.to_csv("../data/marco_train.csv", index=False)

In [None]:
import pandas as pd
from umap import UMAP
from sklearn.pipeline import make_pipeline 
from embetter.text import SentenceEncoder


SKIP_ENCODING = False
if SKIP_ENCODING:
    # Reuse the coordinates written by an earlier run.
    df = pd.read_csv("marco_ready.csv")
else:
    # Sentence encoder pipeline with UMAP at the end.
    text_emb_pipeline = make_pipeline(SentenceEncoder('all-MiniLM-L6-v2'), UMAP())

    # Load sentences
    X = combined['sequence'].values.tolist()

    # Calculate embeddings
    X_tfm = text_emb_pipeline.fit_transform(X)

    # Write to disk. Note! Text column must be named "text"
    # The file holds only text, x and y; the target is attached afterwards.
    df = pd.DataFrame({"text": X, "x": X_tfm[:, 0], "y": X_tfm[:, 1]})
    df.to_csv("marco_ready.csv", index=False)

# Attach the current labels by row index; unlabelled rows become 'unknown'.
df['target'] = combined['target'].fillna('unknown')

In [None]:
combined

In [None]:
df

In [None]:
import plotly.express as px

In [None]:
fig_2d = px.scatter(
    df, x='x', y='y',
    color=df['target'], labels={'color': 'target'},
    hover_name="text",
    opacity=0.3,
    title="marcos web search queries intents map"
)



In [None]:
fig_2d

In [None]:
fig_2d.write_html("../reports/web_search_intents.html")