Purpose of the notebook:

Evaluate the current NER approach, which uses existing models supported by the Transformers.js library,
and see where it fails.
Working from the hypothesis that a classifier-based approach might work better, we prepare and label the Marco data
(https://www.microsoft.com/en-us/download/details.aspx?id=58227)

with some improvements in labeling.

In [None]:
## imports

from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from pprint import pprint
import random

#### Some examples of where the NER based approach is failing

In [None]:
# Zero-shot classifier used to score each example query against the intent descriptions below.
classifier = pipeline(
    "zero-shot-classification",
    model='typeform/mobilebert-uncased-mnli',
    device='cpu',
)

# Example search queries to score.
texts = [
    "what is democracy",
    "restaurants in oakville",
    "buy iphone",
    "bank login",
    "temperature in San Jose",
    "wood floor buckling repair",
    "wood floor cost estimator",
    "panera bread menu price",
    "how much is hbo now subscription",
    "how much is a golden retriever puppy",
    "how much is nebraska's sales tax",
    "how much is donald trump jr worth",
    "how much is a liposuction",
    "does mushroom cause food allergy",
]

# Intent name -> natural-language description handed to the classifier as a candidate label.
intent_labels_lkp = {
    "yelp_intent": "search for local service, food, home repair, maintenance, cost estimation excluding weather intents",
    # Alternative wording that was tried for yelp_intent:
    # "yelp_intent": "to discover, connect and transact with local businesses",
    "information_intent": "search for general knowledge what some concept is and not related to weather, services, or products",
    "weather_intent": "check weather conditions like forecast, temperature, radar, storms, or pollen",
    "purchase_intent": "make an online purchase",
    "navigation_intent": "navigate to a specific website"
}

# Reverse lookup: description -> intent name, used to turn classifier output back into intent names.
intent_desc_lkp = {description: name for name, description in intent_labels_lkp.items()}

# Candidate labels, in a fixed order.
intent_labels = [
    intent_labels_lkp[name]
    for name in (
        "yelp_intent",
        "information_intent",
        "weather_intent",
        "purchase_intent",
        "navigation_intent",
    )
]

result = classifier(texts, candidate_labels=intent_labels)


In [None]:
# result_df = pd.DataFrame(result)
def prepare_df_from_reesult(result, label_lookup=None):
    """Turn zero-shot classification output into a DataFrame of per-intent scores.

    Args:
        result: iterable of dicts, each with a 'sequence' entry plus parallel
            'labels' and 'scores' lists.
        label_lookup: mapping from a label string (as found in 'labels') to the
            column name to use for its score. Defaults to the notebook-level
            ``intent_desc_lkp``.

    Returns:
        pandas.DataFrame with one row per input dict: a 'sequence' column and
        one score column per mapped label.

    Raises:
        KeyError: if a label in ``result`` is missing from ``label_lookup``.
    """
    if label_lookup is None:
        label_lookup = intent_desc_lkp

    rows = []
    for res in result:
        row = {'sequence': res['sequence']}
        for label, score in zip(res['labels'], res['scores']):
            row[label_lookup[label]] = score
        rows.append(row)

    return pd.DataFrame(rows)

# Flatten the classifier output into one row per query with one score column per intent.
updated_result_df = prepare_df_from_reesult(result)

In [None]:
# Display the per-intent scores for the example queries.
updated_result_df

Some of the above results are a bit unclear. For example, `does mushroom cause food allergy` is more of an information intent than a yelp intent.
There were many other cases showing that NER alone may not be suitable for this problem; for this use case we need to solve the intent classification problem.

#### Marco data

This dataset can be downloaded from https://www.microsoft.com/en-us/download/details.aspx?id=58227

In [None]:
# Collect the distinct, lower-cased queries from the Marco sessions file.
# Every line is split on tabs; fields containing "marco-gen-train" and fields
# shorter than 3 characters are dropped.
marco_text_queries = set()
# Decode as UTF-8 explicitly so the result does not depend on the platform's default encoding.
with open("../data/full_marco_sessions_ann_split.train.tsv", "r", encoding="utf-8") as f:
    marco_texts = f.read().split('\n')
    for text in marco_texts:
        for query in text.split("\t"):
            if "marco-gen-train" not in query and len(query) >= 3:
                marco_text_queries.add(query.lower())

# Sort so the list order is identical on every run; iterating a set of strings
# gives a different order from one kernel session to the next.
marco_text_queries_list = sorted(marco_text_queries)

In [None]:
# Number of distinct lower-cased queries collected from the Marco file.
len(marco_text_queries_list)

In [None]:
## some example queries (first 50 entries of the de-duplicated list)

marco_text_queries_list[:50]

In [None]:
# One row per distinct query; the text lives in the 'sequence' column.
marco_df = pd.DataFrame({"sequence": marco_text_queries_list})

In [None]:
def labeling_stats(df):
    """Print how much of ``df`` has been labeled so far.

    If ``df`` has no 'target' column, an empty one (all None) is added in place.
    Then prints the dataset size, the number of unlabeled and labeled rows, and
    the count of each label.

    Args:
        df: pandas.DataFrame to report on; mutated only when 'target' is missing.
    """
    # Check the frame that was passed in, not a notebook-level global.
    if 'target' not in df.columns:
        df['target'] = None

    unlabeled = df['target'].isna()
    print(f"Size of the dataset = {len(df)}")
    print(f"Number of examples to be labeled = {unlabeled.sum()}")
    print(f"Number of examples labeled = {(~unlabeled).sum()}")
    print("Labels distributed as \n", df['target'].value_counts())


## Print labeling stats for the Marco queries (adds an empty 'target' column to marco_df if it has none)
labeling_stats(marco_df)

#### Find potential ngram mappings for targets

In [None]:
from collections import Counter
from itertools import islice

# Generalize function to extract n-grams
def extract_ngrams(query, n):
    """Return every run of ``n`` consecutive whitespace-separated words in ``query``.

    Each n-gram is returned as a single space-joined string, in order of
    appearance. Queries with fewer than ``n`` words yield an empty list.
    """
    tokens = query.split()
    # n views of the token list, each shifted one word further; zip stops at the
    # shortest view, so only complete windows are produced.
    shifted_views = (tokens[offset:] for offset in range(n))
    return [' '.join(window) for window in zip(*shifted_views)]

# Flatten the n-grams into a list and count them
def count_ngrams(queries_list, n):
    """Count how often each n-gram occurs across all queries in ``queries_list``.

    Returns a ``collections.Counter`` keyed by the space-joined n-gram.
    """
    tally = Counter()
    for text in queries_list:
        tally.update(extract_ngrams(text, n))
    return tally


In [None]:
def search_queries_by_words(search_text, to_be_labelled_sequence_list):
    """Lazily yield each query that contains ``search_text`` as a substring.

    Queries are yielded in their original order; matching is case-sensitive.
    """
    yield from (
        candidate
        for candidate in to_be_labelled_sequence_list
        if search_text in candidate
    )

In [None]:
# Print (numbered from 1) at most the first 100 queries containing "24 hour".
cnt = 0
for cnt, query in enumerate(search_queries_by_words("24 hour", marco_text_queries_list), start=1):
    print(cnt, query)
    if cnt >= 100:  # Stop after 100 results
        break

In [None]:

# Keyword -> intent label used for rule-based labeling of the Marco queries.
# Keys are matched as plain substrings of a query (see search_queries_by_words),
# so leading/trailing spaces inside a key are significant.
# NOTE(review): several specific keys sit before a more general one with a
# different label (e.g. 'tom cruise' before 'cruise'), so the order presumably
# matters to whatever applies this mapping -- confirm before reordering.
# Each key appears exactly once; repeated keys in a dict literal silently
# overwrite the earlier entry.
target_mapping = {
    'how do': 'information_intent',
    'how to': 'information_intent',
    'weather in': 'weather_intent',
    'the weather': 'weather_intent',
    'hurricane': 'information_intent',
    # 'tornado': 'weather_intent',
    'current temperature': 'weather_intent',
    'current weather': 'weather_intent',
    'weather forecast in': 'weather_intent',
    'temperature in': 'weather_intent',
    # 'how much': 'purchase_intent',
    # 'cost to': 'purchase_intent',
    # 'where is': 'navigation_intent',
    'sign in ': 'navigation_intent',
    'signin ': 'navigation_intent',
    'login ': 'navigation_intent',
    'phone number': 'navigation_intent',
    'customer service': 'navigation_intent',
    'bank routing': 'navigation_intent',
    'phone banking': 'navigation_intent',
    'watch online': 'navigation_intent',
    'help desk': 'navigation_intent',
    'what are': 'information_intent',
    'what county is': 'information_intent',
    'what is a ': 'information_intent',
    # 'what is': 'information_intent',
    'what does': 'information_intent',
    'what do': 'information_intent',
    'definition of': 'information_intent',
    'meaning': 'information_intent',
    'symptoms': 'information_intent',
    'zip code': 'information_intent',
    'zipcode': 'information_intent',
    'postal code': 'information_intent',
    'postalcode': 'information_intent',
    'area code': 'information_intent',
    'areacode': 'information_intent',
    'definition': 'information_intent',
    'define': 'information_intent',
    'what is the difference between': 'information_intent',
    'what is the purpose of': 'information_intent',
    'what is the function of': 'information_intent',
    'how long does it take': 'information_intent',
    'what is the name of': 'information_intent',
    'what is the population of': 'information_intent',
    'what is an example of': 'information_intent',
    'which of the following': 'information_intent',
    'what is the purpose': 'information_intent',
    # 'what time zone is': 'information_intent',
    'what is the average': 'information_intent',
    'is in what county': 'information_intent',
    'calories in': 'information_intent',
    # 'how many calories in': 'information_intent',
    "causes of": 'information_intent',
    "tom cruise": 'information_intent',
    'visit': 'travel_intent',
    'travel to': 'travel_intent',
    'cruise': 'travel_intent',
    'tours': 'travel_intent',
    'mortgage rate': 'yelp_intent',
    'interest rate': 'yelp_intent',
    'price of': 'purchase_intent',
    'amazon price': 'purchase_intent',
    'cost of living': 'information_intent',
    'to eat': 'yelp_intent',
    'does it cost': 'yelp_intent',
    'dental': 'yelp_intent',
    'dentist': 'yelp_intent',
    # 'what is the current': ?
    'what is the largest': 'information_intent',
    'what is the currency': 'information_intent',
    'how old do you': 'information_intent',
    'how long does a': 'information_intent',
    # 'what time is it': 'information_intent',
    'what time': 'information_intent',
    'you have to be': 'information_intent',
    'do you need to': 'information_intent',
    'what is considered a': 'information_intent',
    'dialing code': 'information_intent',
    'side effects': 'information_intent',
    'stock market': 'information_intent',
    'how many calories': 'information_intent',
    'average salary for': 'information_intent',
    'how many grams': 'information_intent',
    'what foods are': 'information_intent',
    'how many ounces': 'information_intent',
    'how many carbs': 'information_intent',
    'what year was': 'information_intent',
    'how old is': 'information_intent',
    'how much is': 'information_intent',
    'what type of': 'information_intent',
    'how do i': 'information_intent',
    'what kind of': 'information_intent',
    'who is the': 'information_intent',
    'where is the': 'information_intent',
    # 'different types of': 'information_intent',
    'types': 'information_intent',
    'what is': 'information_intent',
    'how do you': 'information_intent',
    'what was the': 'information_intent',
    'in the world': 'information_intent',
    'how long is': 'information_intent',
    'when was': 'information_intent',
    'when did': 'information_intent',
    'how far is': 'information_intent',
    'how tall is': 'information_intent',
    'what to do': 'information_intent',
    'how long': 'information_intent',
    'types of': 'information_intent',
    'who is': 'information_intent',
    'where is': 'information_intent',
    'what causes': 'information_intent',
    'stock price': 'information_intent',
    'difference between': 'information_intent',
    'social security': 'information_intent',
    'who was': 'information_intent',
    'net worth': 'information_intent',
    'cast of': 'information_intent',
    'how many': 'information_intent',
    'how does': 'information_intent',
    'how is': 'information_intent',
    'what did': 'information_intent',
    'good for': 'information_intent',
    'population of': 'information_intent',
    'can you': 'information_intent',
    'what can': 'information_intent',
    'how big': 'information_intent',
    'what size': 'information_intent',
    'average salary of': 'information_intent',
    'what year': 'information_intent',
    'part of': 'information_intent',
    'another word': 'information_intent',
    'who invented': 'information_intent',
    'what can you': 'information_intent',
    'how much money': 'information_intent',
    'what state': 'information_intent',
    'what county': 'information_intent',
    'in the us': 'information_intent',
    'how old': 'information_intent',
    'icd code': 'information_intent',
    'what city': 'information_intent',
    'can i': 'information_intent',
    'when is': 'information_intent',
    'how did': 'information_intent',
    'what to': 'information_intent',
    'the same': 'information_intent',
    "cleaning ": 'yelp_intent',
    'restaurant': 'yelp_intent',
    'recommendation': 'yelp_intent',
    'repair': 'yelp_intent',
    'parking': 'yelp_intent',
    'oil change': 'yelp_intent',
    ' rental': 'yelp_intent',
    'auto ': 'yelp_intent',
    'dry clean': 'yelp_intent',
    'club': 'yelp_intent',
    'hotel': 'yelp_intent',
    'stores': 'yelp_intent',
    'shopping': 'yelp_intent',
    ' shop ': 'yelp_intent',
    ' shops ': 'yelp_intent',
    ' mall ': 'yelp_intent',
    'furniture': 'yelp_intent',
    'crafts': 'yelp_intent',
    'clothing': 'yelp_intent',
    # 'benefits of': 'yelp_intent',
    'average cost': 'yelp_intent',
    'cost to install': 'yelp_intent',
    'contact number': 'yelp_intent',
    'what airport': 'travel_intent',
    # 'flight': 'travel_intent',
    'cabins': 'travel_intent',
    'cost for': 'yelp_intent',
    'do you': 'information_intent',
    'when does': 'information_intent',
    'why is': 'information_intent',
    "what's the": 'information_intent',
    'what was': 'information_intent',
    'what language': 'information_intent',
    'should i': 'information_intent',
    'convert': 'information_intent',
    'medication': 'information_intent',
    'treatment': 'yelp_intent',
    'tv show': 'information_intent',
    'history': 'information_intent',
    'remedies': 'information_intent',
    'county is': 'information_intent',
    'synonym ': 'information_intent',
    'credit union number': 'yelp_intent',
    'credit union phone number': 'navigation_intent',
    'credit union hours': 'navigation_intent',
    'movie cast': 'information_intent',
    'average salary': 'information_intent',
    'example': 'information_intent',
    'blood pressure': 'information_intent',
    'credit card': 'navigation_intent',
    'time zone': 'information_intent',
    'time in': 'information_intent',
    'foods that': 'information_intent',
    'salary for': 'information_intent',
    "weather": 'weather_intent',
    "weather forecast": 'weather_intent',
    "windy": 'weather_intent',
    "humidity": 'weather_intent',
    "monsoon": 'weather_intent',
    "flooding": 'weather_intent',
    "rain in": 'weather_intent',
    "storms": 'weather_intent',
    "storm in": 'weather_intent',
    "forcast": 'weather_intent',
    "wether": 'weather_intent',
    "wather": 'weather_intent',
    "weahter": 'weather_intent',
    "weater": 'weather_intent',
    "weaher": 'weather_intent',
    " vindy ": 'weather_intent',
    " sunny ": 'weather_intent',
    " rain ": 'weather_intent',
    "cloudy": 'weather_intent',
    "air quality": 'weather_intent',
    "thunderstorm": 'weather_intent',
    "pollen": 'weather_intent',
    "snow": 'weather_intent',
    "blizzard": 'weather_intent',
    "radar": 'weather_intent',
    "tiempo": 'weather_intent',
    "clima": 'weather_intent',
    "doppler radar": 'weather_intent',
    "local radar": 'weather_intent',
    "local weather": 'weather_intent',
    # "map": 'weather_intent',
    "us weather radar": 'weather_intent',
    "weather radar near me": 'weather_intent',
    "radar near me": 'weather_intent',
    'salary': 'information_intent',
    'cost to build': 'yelp_intent',
    'icd ': 'information_intent',
    'how often': 'information_intent',
    'get rid of': 'information_intent',
    'university of': 'navigation_intent',
    'windows 10': 'navigation_intent',
    'causes for': 'information_intent',
    'calculat': 'information_intent',
    'which is ': 'information_intent',
    'where are ': 'information_intent',
    'kelvin': 'information_intent',
    'celsius': 'information_intent',
    'fahrenheit': 'information_intent',
    'when ': 'information_intent',
    'benefit of': 'yelp_intent',
    'most common': 'information_intent',
    'which ': 'information_intent',
    'refers ': 'information_intent',
    'where does ': 'information_intent',
    'synonym': 'information_intent',
    'salaries': 'information_intent',
    'function of': 'information_intent',
    'cause of': 'information_intent',
    'effects of': 'information_intent',
    'used for': 'information_intent',
    'what color is': 'information_intent',
    'weight loss': 'yelp_intent',
    'where do': 'information_intent',
    'what foods': 'information_intent',
    'why': 'information_intent',
    'age of': 'information_intent',
    'who wrote': 'information_intent',
    "what's a": 'information_intent',
    "how fast": 'information_intent',
    'most popular': 'information_intent',
    'where': 'information_intent',
    'is used': 'information_intent',
    'doctors': 'yelp_intent',
    'who ': 'information_intent',
    ' hours': 'navigation_intent',
    'schedule': 'information_intent',
    'what age': 'information_intent',
    'cheap': 'yelp_intent',
    'most expensive': 'information_intent',
    'size of': 'information_intent',
    'what exactly': 'information_intent',
    'ways to ': 'information_intent',
    'disorder': 'information_intent',
    'disease': 'information_intent',
    'felony': 'information_intent',
    'movie': 'information_intent',
    # 'cost of': 'yelp_intent',
    'what were': 'information_intent',
    'degree': 'information_intent',
    'what day': 'information_intent',
    'ways to': 'information_intent',
    'influen': 'information_intent',
    'importan': 'information_intent',
    'school': 'information_intent',
    'train': 'information_intent',
    'dimension': 'information_intent',
    'what makes': 'information_intent',
    'what food': 'information_intent',
    'normal range': 'information_intent',
    'requirements for': 'information_intent',
    'employment': 'information_intent',
    'support number': 'navigation_intent',
    ' support ': 'navigation_intent',
    'appointment': 'navigation_intent',
    'calculator': 'navigation_intent',
    ' application': 'navigation_intent',
    ' license': 'navigation_intent',
    'craigslist': 'navigation_intent',
    'fedex': 'navigation_intent',
    'forex': 'navigation_intent',
    ' ups ': 'navigation_intent',
    ' usps ': 'navigation_intent',
    'dhl': 'navigation_intent',
    'fax number': 'navigation_intent',
    'considered a': 'information_intent',
    'distance ': 'information_intent',
    'share price': 'information_intent',
    'stock': 'information_intent',
    'channel is': 'information_intent',
    'continent': 'information_intent',
    'what level': 'information_intent',
    'english to': 'translation_intent',
    'to english': 'translation_intent',
    'translat': 'translation_intent',
    'what currency': 'information_intent',
    'blood test': 'information_intent',
    'replacement cost': 'yelp_intent',
    'how tall': 'information_intent',
    'characteristics of': 'information_intent',
    'tracking number': 'navigation_intent',
    'tracking': 'navigation_intent',
    'to replace': 'yelp_intent',
    'pay for': 'information_intent',
    'calories': 'information_intent',
    'health': 'information_intent',
    'tax': 'information_intent',
    'deadline': 'information_intent',
    'insurance': 'information_intent',
    'cancel': 'navigation_intent',
    'address': 'navigation_intent',
    'healthy': 'yelp_intent',
    'diet': 'information_intent',
    'lyrics': 'information_intent',
    'cell phone': 'purchase_intent',
    'discount': 'purchase_intent',
    'coupon': 'purchase_intent',
    'promo code': 'purchase_intent',
    ' deal': 'purchase_intent',
    'where to buy': 'purchase_intent',
    ' buy': 'purchase_intent',
    'purchase': 'purchase_intent',
    'blackfriday': 'purchase_intent',
    'cybermonday': 'purchase_intent',
    'amazon prime': 'purchase_intent',
    'clearance': 'purchase_intent',
    'on sale': 'purchase_intent',
    'refurbished': 'purchase_intent',
    'warranty': 'purchase_intent',
    'compare price': 'purchase_intent',
    'cashback': 'purchase_intent',
    'in stock': 'purchase_intent',
    'lowest price': 'purchase_intent',
    'free shipping': 'purchase_intent',
    'android': 'information_intent',
    'protein': 'information_intent',
    '401k': 'information_intent',
    ' ira ': 'information_intent',
    'population': 'information_intent',
    'president': 'information_intent',
    'whats': 'information_intent',
    "what's": 'information_intent',
    'benefits': 'information_intent',
    ' pain ': 'yelp_intent',
    'installation cost': 'yelp_intent',
    'in spanish': 'translation_intent',
    'to spanish': 'translation_intent',
    'in french': 'translation_intent',
    'to french': 'translation_intent',
    'in japanese': 'translation_intent',
    'to japanese': 'translation_intent',
    'in chinese': 'translation_intent',
    'to chinese': 'translation_intent',
    'side effect': 'information_intent',
    'cost to live': 'information_intent',
    'cost to': 'yelp_intent',
    'cost per': 'information_intent',
    'disney world': 'navigation_intent',
    'surgery cost': 'yelp_intent',
    'album': 'information_intent',
    'genre': 'information_intent',
    'much water': 'information_intent',
    'job': 'navigation_intent',
    'netflix': 'information_intent',
    'nutrient': 'information_intent',
    'amazon stock': 'information_intent',
    'music': 'information_intent',
    'caffeine': 'information_intent',
    'adoption': 'yelp_intent',
    'dogs': 'yelp_intent',
    'cats': 'yelp_intent',
    'countries': 'information_intent',
    'number of': 'information_intent',
    'related to': 'information_intent',
    'foods with': 'information_intent',
    'cusine': 'yelp_intent',
    'italian': 'yelp_intent',
    'mediterranean': 'yelp_intent',
    'vietnamese': 'yelp_intent',
    'recipe': 'yelp_intent',
    'vegan': 'yelp_intent',
    ' vegeta': 'yelp_intent',
    ' meat': 'yelp_intent',
    ' spice': 'yelp_intent',
    ' beer': 'yelp_intent',
    ' wine': 'yelp_intent',
    ' fresh ': 'yelp_intent',
    'fruit': 'yelp_intent',
    'resort': 'travel_intent',
    'attraction': 'travel_intent',
    'installation': 'yelp_intent',
    'service': 'yelp_intent',
    'routing number': 'navigation_intent',
    'amazon': 'navigation_intent',
}

In [None]:
# For each navigation keyword, print how many Marco queries contain it and pool the matches.
print("key", "#examples")
navigation_queries_set = set()
for key, val in target_mapping.items():
    if val != 'navigation_intent':
        continue
    cnt = 0
    for cnt, query in enumerate(search_queries_by_words(key, marco_text_queries_list), start=1):
        navigation_queries_set.add(query)

    print(key, cnt)


In [None]:
# For each purchase keyword, print how many Marco queries contain it and pool the matches.
print("key", "#examples")
purchase_queries_set = set()
for key, val in target_mapping.items():
    if val != 'purchase_intent':
        continue
    cnt = 0
    for cnt, query in enumerate(search_queries_by_words(key, marco_text_queries_list), start=1):
        purchase_queries_set.add(query)

    print(key, cnt)


In [None]:
# Inspect every query that matched at least one purchase keyword.
purchase_queries_set

In [None]:
# For each yelp keyword, print how many Marco queries contain it and pool the matches.
print("key", "#examples")
yelp_queries_set = set()
for key, val in target_mapping.items():
    if val != 'yelp_intent':
        continue
    cnt = 0
    for cnt, query in enumerate(search_queries_by_words(key, marco_text_queries_list), start=1):
        yelp_queries_set.add(query)

    print(key, cnt)


In [None]:
# Bigram frequencies over the queries that matched a yelp keyword.
yelp_queries = list(yelp_queries_set)
# Not the last expression of the cell, so this slice is evaluated but not displayed.
yelp_queries[:5]

yelp_ngram_counter = count_ngrams(yelp_queries, 2)
yelp_most_common_ngrams = yelp_ngram_counter.most_common(100)

# Display the 100 most common yelp bigrams
print(yelp_most_common_ngrams)

In [None]:
# For each weather keyword, print how many Marco queries contain it and pool the matches.
print("key", "#examples")
weather_queries_set = set()
for key, val in target_mapping.items():
    if val != 'weather_intent':
        continue
    cnt = 0
    for cnt, query in enumerate(search_queries_by_words(key, marco_text_queries_list), start=1):
        weather_queries_set.add(query)

    print(key, cnt)


In [None]:
# Bigram frequencies over the queries that matched a weather keyword.
weather_queries = list(weather_queries_set)
# Not the last expression of the cell, so this slice is evaluated but not displayed.
weather_queries[:5]

weather_ngram_counter = count_ngrams(weather_queries, 2)
weather_most_common_ngrams = weather_ngram_counter.most_common(100)

# Display the 100 most common weather bigrams
print(weather_most_common_ngrams)

In [None]:
# (pattern, relative weight) pairs for synthetic weather queries.
# "{}" is a placeholder filled in later; weights are normalised at the end of this cell.
weather_templates = [
    # Original patterns
    ("The weather in {}", 0.539),
    ("What is the weather in {}", 0.499),
    ("What's the weather in {}", 0.046),
    ("Weather forecast in {}", 0.039),
    ("What is the temperature in {}", 0.033),
    ("The weather forecast for {}", 0.034),
    ("Current weather in {}", 0.023),
    ("Average weather in {}", 0.022),
    ("What is the weather forecast for {}", 0.014),
    ("Weather in {} in {}", 0.011),
    ("How is the weather in {}", 0.006),
    ("What is the climate of {}", 0.009),
    ("Is the weather forecast for {}", 0.005),
    ("Rain in {}", 0.002),
    ("What is the weather like in {}", 0.009),
    ("What is the climate in {}", 0.001),
    ("The weather today in {}", 0.001),
    ("What's the weather forecast for {}", 0.002),
    ("What is the best weather in {}", 0.001),
    ("Is the weather today in {}", 0.001),
    ("Current temperature in {}", 0.001),
    ("Storms in {}", 0.0007),
    ("Humidity in {}", 0.003),
    ("Windy in {}", 0.0005),
    ("Snow in {}", 0.009),
    ("Weather radar in {}", 0.005),
    ("The temperature in {}", 0.005),
    ("Weather like in {}", 0.006),
    ("What's the temperature in {}", 0.001),
    ("Is the weather like in {}", 0.006),

    # Short keyword-style patterns (10% of the matching original weight)
    ("weather {}", 0.10 * 0.539),
    ("{} weather", 0.10 * 0.539),
    ("temperature {}", 0.10 * 0.033),
    ("{} temperature", 0.10 * 0.033),
]

# Common misspellings of "weather", "temperature" and "forecast".
weather_misspellings = ["weathr", "wether", "weater", "wather", "weahter", "weaher", "waether"]
temperature_misspellings = [
    "temprature", "temperture", "tempreture", "tempratuer",
    "tempratue", "tempertuer", "tempretuer", "temprture",
]
forecast_misspellings = [
    "forcast", "forcst", "forescast", "forecats",
    "forcaste", "forecst", "forecase", "foercast",
]

# Typo variants carry 20% of the weight of the correctly spelled pattern they mirror.
extended_typo_variants = []

# Misspellings for "weather": five patterns per misspelling
for typo in weather_misspellings:
    extended_typo_variants += [
        (f"The {typo} in {{}}", 0.20 * 0.539),
        (f"What is the {typo} in {{}}", 0.20 * 0.499),
        (f"What's the {typo} in {{}}", 0.20 * 0.046),
        (f"{typo.capitalize()} forecast in {{}}", 0.20 * 0.039),
        (f"What is the {typo} like in {{}}", 0.20 * 0.009),
    ]

# Misspellings for "temperature": one pattern per misspelling
for typo in temperature_misspellings:
    extended_typo_variants.append((f"What is the {typo} in {{}}", 0.20 * 0.033))

# Misspellings for "forecast": two patterns per misspelling
for typo in forecast_misspellings:
    extended_typo_variants += [
        (f"{typo.capitalize()} in {{}}", 0.20 * 0.039),
        (f"What is the {typo} for {{}}", 0.20 * 0.034),
    ]

# Combine original templates and the expanded typo variants
weather_templates_extended = weather_templates + extended_typo_variants

# Normalise the weights so they sum to 1.
weather_templates_df = pd.DataFrame(weather_templates_extended, columns=['pattern', 'weight'])
total_weight = weather_templates_df['weight'].sum()
weather_templates_df['weight'] = weather_templates_df['weight'] / total_weight
weather_templates_df

In [None]:
# Show the first 50 templates with their normalised weights.
weather_templates_df.head(50)

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Scrape media-market (DMA) names from Wikipedia and flatten them into a list of
# place names in `dma_data`.
url = "https://en.m.wikipedia.org/wiki/List_of_television_stations_in_North_America_by_media_market"
# A timeout keeps the cell from hanging indefinitely if the server does not answer.
response = requests.get(url, timeout=30)

# Always rebind dma_data so later cells never pick up a stale value from an earlier run.
dma_data = []

if response.status_code != 200:
    # Fail here with a clear message instead of leaving dma_data unusable for later cells.
    raise RuntimeError(f"Could not fetch the DMA list: HTTP {response.status_code} from {url}")

soup = BeautifulSoup(response.content, 'html.parser')
dma_heading = soup.find('h4', string='DMAs')
if dma_heading is None:
    raise RuntimeError("No 'DMAs' heading found on the page; its layout may have changed")
dma_list = dma_heading.find_next('ul')

if dma_list:
    for li in dma_list.find_all('li'):
        market_name = li.get_text(strip=True)

        # Split by dash (-) or en-dash (–) to handle cases like "Dallas-Fort Worth"
        split_names = re.split(r'–|-', market_name)

        for name in split_names:
            # Remove the (#NUM) part using regex
            name = re.sub(r'\s*\(#\d+\)', '', name).strip()

            # Check if there's a city in parentheses and split them
            match = re.match(r'(.+?)\s*\((.+?)\)', name)
            if match:
                dma_data.append(match.group(1).strip())  # Add the main city
                dma_data.append(match.group(2).strip())  # Add the city in parentheses
            elif name:
                # Skip empty fragments left behind by a leading/trailing dash.
                dma_data.append(name)



In [None]:
# Number of place names extracted from the DMA list.
len(dma_data)

In [None]:
# Print every extracted place name for a manual sanity check.
print(dma_data)

In [None]:
from collections import Counter

# Month names passed to generate_queries_with_case, which uses one to fill the
# second placeholder of "{} in {}" patterns.
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# Generate unique random queries from weighted templates, lower-casing a share of them
def generate_queries_with_case(df, cities, months, num_queries=10, lower_case_prob=0.3):
    """Sample ``num_queries`` distinct queries from weighted templates.

    Args:
        df: DataFrame with a 'pattern' column (templates containing ``{}``
            placeholders) and a 'weight' column (sampling weights).
        cities: sequence of names substituted into the first placeholder.
        months: sequence of names substituted into the second placeholder of
            patterns containing "{} in {}".
        num_queries: number of distinct queries to return.
        lower_case_prob: probability that a generated query is lower-cased.

    Returns:
        (queries, pattern_counter): the distinct queries in the order they were
        generated, and a Counter of how many times each pattern was used. The
        counter includes attempts that produced a duplicate query.

    Raises:
        ValueError: if every pattern reaches its usage cap before
            ``num_queries`` distinct queries exist. Each pattern may be used at
            most ``num_queries // 10 + 1`` times.
    """
    # Plain lists: random.choices indexes its population by position, which only
    # works on a Series whose index happens to be 0..n-1.
    patterns = df['pattern'].tolist()
    weights = df['weight'].tolist()
    per_pattern_cap = num_queries // 10

    # Patterns that can still contribute: drawable (positive weight) and not yet capped.
    available = {pattern for pattern, weight in zip(patterns, weights) if weight > 0}

    queries = []
    seen = set()
    pattern_counter = Counter()
    while len(queries) < num_queries:
        if not available:
            # Without this check the loop would keep rejecting capped patterns forever.
            raise ValueError(
                f"Only {len(queries)} of {num_queries} unique queries could be generated "
                "before every pattern reached its usage cap; add patterns or cities."
            )

        # Choose a pattern based on the weights
        pattern = random.choices(patterns, weights=weights, k=1)[0]
        if pattern_counter[pattern] > per_pattern_cap:
            continue
        pattern_counter[pattern] += 1
        if pattern_counter[pattern] > per_pattern_cap:
            available.discard(pattern)

        # Replace placeholders in the pattern with a random city and/or month
        city = random.choice(cities)
        if "{} in {}" in pattern:
            query = pattern.format(city, random.choice(months))
        else:
            query = pattern.format(city)

        # Randomly convert the query to lowercase with the given probability
        if random.random() < lower_case_prob:
            query = query.lower()

        if query not in seen:
            seen.add(query)
            queries.append(query)

    return queries, pattern_counter

# Generate 10,000 sample weather queries, about 30% of them lower-cased.
# No random seed is set in this cell, so the sample differs between runs.
sample_queries_with_case, pattern_counter = generate_queries_with_case(weather_templates_df, dma_data, months, num_queries=10000, lower_case_prob=0.3)

print(len(sample_queries_with_case))
sample_queries_with_case[:10]


In [None]:
# How many times each template was used while generating the sample.
pattern_counter

In [None]:
# sample_queries_with_case[1000:2000]

In [None]:
# Wrap the generated queries in a DataFrame and label all of them as weather_intent.
weather_examples = pd.DataFrame(sample_queries_with_case, columns=['sequence'])
weather_examples['target'] = 'weather_intent'
weather_examples

#### Yelp examples

In [None]:
# Original Yelp Intent Templates
# (pattern, relative weight) pairs; "{}" is a placeholder to be filled in later.
# NOTE(review): the two cost-of-living patterns below are listed as yelp templates,
# while target_mapping labels 'cost of living' and 'cost to live' as
# information_intent -- confirm which label is intended.
yelp_intent_templates = [
    ("What are the best restaurants in {}", 0.12),
    ("Top-rated restaurants in {}", 0.10),
    ("Popular coffee shops in {}", 0.09),
    ("Best pizza places in {}", 0.08),
    ("Best sushi places in {}", 0.07),
    ("Cheap restaurants in {}", 0.06),
    ("Best places to eat in {}", 0.06),
    ("Restaurants near me in {}", 0.05),
    # ("What is the average cost of a meal in {}", 0.04),
    ("Best Italian restaurants in {}", 0.04),
    ("Best fast food restaurants in {}", 0.04),
    ("Mexican restaurants in {}", 0.03),
    ("Chinese food near me in {}", 0.03),
    ("Best hotels in {}", 0.03),
    ("Affordable hotels in {}", 0.03),
    ("Best parks to visit in {}", 0.02),
    ("Best attractions in {}", 0.02),
    ("Popular things to do in {}", 0.02),
    ("Best shopping centers in {}", 0.02),
    ("Best gyms in {}", 0.02),
    ("Top hair salons in {}", 0.02),
    ("What are the best-rated dentists in {}", 0.02),
    ("Local plumbers in {}", 0.02),
    ("Popular electricians in {}", 0.02),
    # ("What is the phone number for a restaurant in {}", 0.02),
    # ("Phone number for hotels in {}", 0.02),
    ("Top-rated cafes in {}", 0.02),
    ("Best massage spas in {}", 0.02),
    ("Grocery stores near me in {}", 0.02),
    ("Where can I buy clothes in {}", 0.01),
    ("Pharmacies near me in {}", 0.01),
    ("Best bars in {}", 0.01),
    ("Cocktail bars in {}", 0.01),
    ("Family-friendly restaurants in {}", 0.01),
    ("Kid-friendly restaurants in {}", 0.01),
    ("Pet-friendly restaurants in {}", 0.01),
    ("Vegan restaurants in {}", 0.01),
    ("Best rooftop bars in {}", 0.01),
    ("Top pizza delivery places in {}", 0.01),
    ("Where can I get sushi in {}", 0.01),
    ("Best food delivery services in {}", 0.01),
    ("Catering services in {}", 0.01),
    ("Top-rated bakeries in {}", 0.01),
    ("Where can I find a gym in {}", 0.01),
    ("Yoga studios near me in {}", 0.01),
    ("What’s the cost of living in {}", 0.01),
    ("How much does it cost to live in {}", 0.01),
    ("Best places for nightlife in {}", 0.01),
    ("Local car repair shops in {}", 0.01),
    ("Best car rental services in {}", 0.01),
    # Short keyword-style patterns with the placeholder first
    ("{} restaurants", 0.02),
    ("{} hotels", 0.02),
    ("{} food", 0.02),
]

# Misspellings keyed by the correctly spelled, lowercase word.
# Every variant differs from its key and appears only once, so a triggered
# typo always changes the word and each variant is equally likely.
_TYPO_VARIANTS = {
    "restaurants": ["restarants", "resturants", "restrants"],
    "best": ["bst", "besst", "bet"],
    "popular": ["populer", "ppular", "poplar"],
    "coffee": ["cofee", "cofffe"],
    "pizza": ["piza", "pzza"],
    "hotels": ["hoetls", "hotls"],
    "places": ["plces", "place", "palces"],
    "attractions": ["attractons", "atrctions", "attractins"],
    "cheap": ["chep", "cheep"],
    "meal": ["mel", "meel", "male"],
    "cost": ["cst", "cots", "cot"],
    "living": ["lving", "livng", "livin"],
    "yoga": ["yga", "yoaga", "ygoa"],
    "food": ["fod", "fud", "fodd"],
    "parks": ["praks", "parcs", "paks"],
    "near": ["ner", "neer", "naer"],
    "bar": ["ber", "baer"],
    "family": ["famly", "famliy", "faimly"],
    "friendly": ["frindly", "frendly", "friendley"],
}


def add_typos_to_template(template, typo_prob=0.1):
    """Return `template` with some known words replaced by a misspelling.

    The template is split on whitespace. Each word is lowercased and stripped
    of surrounding `{`/`}` characters before being looked up in the typo
    table; a matching word is replaced with probability `typo_prob` by a
    randomly chosen (lowercase) misspelling. Words without an entry, including
    the bare `{}` placeholder, are left untouched.

    Args:
        template: Query template, e.g. "Best hotels in {}".
        typo_prob: Per-word probability of substituting a misspelling.

    Returns:
        The template re-joined with single spaces.
    """
    words = template.split()
    for i, word in enumerate(words):
        variants = _TYPO_VARIANTS.get(word.lower().strip("{}"))
        # random.random() is only consumed for words that have an entry
        if variants and random.random() < typo_prob:
            words[i] = random.choice(variants)
    return " ".join(words)

# Build the extended template list: each distinct base template is kept once,
# and some of them also contribute a misspelled variant with a reduced weight.
extended_yelp_intent_templates = []
extended_yelp_intent_templates_set = set()

for template, weight in yelp_intent_templates:
    if template not in extended_yelp_intent_templates_set:
        extended_yelp_intent_templates.append((template, weight))
        extended_yelp_intent_templates_set.add(template)

        # A typo variant is attempted with probability 0.2
        if random.random() < 0.2:
            typo_template = add_typos_to_template(template)
            # Typos occur less frequently, so reduce weight
            typo_weight = weight * 0.2
            # Skip variants that came back unchanged or were already produced
            if typo_template not in extended_yelp_intent_templates_set:
                extended_yelp_intent_templates.append((typo_template, typo_weight))
                extended_yelp_intent_templates_set.add(typo_template)

# Tabulate the templates and rescale the weights so they sum to 1
df_extended_yelp_intent_templates = pd.DataFrame(
    extended_yelp_intent_templates,
    columns=["pattern", "weight"],
)
df_extended_yelp_intent_templates['weight'] = df_extended_yelp_intent_templates['weight'].div(
    df_extended_yelp_intent_templates['weight'].sum()
)
df_extended_yelp_intent_templates

In [None]:
# Display the 'pattern' values of weather_templates_df followed by those of
# df_extended_yelp_intent_templates as one list (inspection only; nothing is assigned).
list(weather_templates_df['pattern'].values) + list(df_extended_yelp_intent_templates['pattern'].values)

In [None]:
def generate_yelp_queries_with_case(df, cities, num_queries=10, lower_case_prob=0.3):
    """Generate unique yelp-intent queries from weighted templates.

    Each attempt draws a pattern according to `df['weight']`, fills its `{}`
    placeholder with a random entry of `cities`, and lowercases the result
    with probability `lower_case_prob`. A pattern stops being used once it has
    been attempted more than `num_queries // 10` times (duplicates count as
    attempts), which keeps heavily weighted patterns from dominating.

    Args:
        df: DataFrame with a 'pattern' column (str.format templates taking one
            positional argument) and a 'weight' column.
        cities: Non-empty sequence of values to substitute into the patterns.
        num_queries: Number of unique queries to aim for.
        lower_case_prob: Probability of lowercasing a generated query.

    Returns:
        (queries, pattern_counter): the unique queries in generation order and
        a Counter of attempts per pattern. Fewer than `num_queries` queries
        are returned when every selectable pattern hits its cap first.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported Counter.
    from collections import Counter

    # Plain lists: random.choices re-accumulates the weights on every call,
    # which is needlessly slow on pandas Series.
    patterns = df['pattern'].tolist()
    weights = df['weight'].tolist()
    per_pattern_cap = num_queries // 10
    selectable = {p for p, w in zip(patterns, weights) if w > 0}

    queries = []
    seen = set()
    exhausted = set()
    pattern_counter = Counter()
    while len(queries) < num_queries:
        # Choose a pattern based on the weights, and a city to fill it with
        pattern = random.choices(patterns, weights=weights, k=1)[0]
        city = random.choice(cities)

        if pattern_counter[pattern] > per_pattern_cap:
            exhausted.add(pattern)
            # Once every selectable pattern is capped no attempt can succeed;
            # stop instead of spinning forever.
            if len(exhausted) >= len(selectable):
                print(
                    f"Stopping early: all patterns reached their cap; "
                    f"generated {len(queries)} of {num_queries} queries"
                )
                break
            continue
        pattern_counter[pattern] += 1

        query = pattern.format(city)

        # Randomly convert the query to lowercase with the given probability
        if random.random() < lower_case_prob:
            query = query.lower()

        if query not in seen:
            seen.add(query)
            queries.append(query)

    return queries, pattern_counter

In [None]:
# Generate 10,000 yelp-intent queries from the extended templates and dma_data,
# lowercasing each query with probability 0.4; then print how many were
# produced and preview the first 10.
sample_yelp_queries_with_case, pattern_counter = generate_yelp_queries_with_case(df_extended_yelp_intent_templates, dma_data, num_queries=10000, lower_case_prob=0.4)

print(len(sample_yelp_queries_with_case))
sample_yelp_queries_with_case[:10]

In [None]:
# sample_yelp_queries_with_case

In [None]:
pattern_counter

In [None]:
# Labelled frame: one generated query per row, all tagged as yelp_intent
yelp_examples = pd.DataFrame(
    sample_yelp_queries_with_case,
    columns=['sequence'],
).assign(target='yelp_intent')
yelp_examples

#### Purchase intent data augmentation

In [None]:
electronics = [
    'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'AirPods', 
    'Xbox Series X', 'Canon DSLR', 'GoPro', 'Fitbit', 'Google Pixel',
    'Bose headphones', 'Sony TV', 'Apple Watch', 'Nintendo Switch', 'Kindle',
    'Sony WH-1000XM4', 'Microsoft Surface', 'DJI Drone', 'Logitech Webcam', 'HP Spectre x360'
]

home_appliances = [
    'Dyson vacuum', 'Roomba', 'KitchenAid mixer', 'Ninja air fryer', 'Instant Pot', 
    'LG refrigerator', 'Samsung washing machine', 'Whirlpool dryer', 'Panasonic microwave', 'Breville toaster oven',
    'Miele dishwasher', 'Cuisinart coffee maker', 'GE oven', 'Philips air purifier', 'Hoover carpet cleaner',
    'Honeywell thermostat', 'LG air conditioner', 'Bosch induction cooktop', 'Crock-Pot', 'Frigidaire freezer'
]

furnitures = [
    'Ikea sofa', 'West Elm dining table', 'La-Z-Boy recliner', 'Ashley bed frame', 'Herman Miller chair', 
    'CB2 bookshelf', 'Pottery Barn desk', 'Crate & Barrel coffee table', 'Sealy mattress', 'Serta sectional sofa',
    'Wayfair sideboard', 'RH leather chair', 'Flexsteel armchair', 'Sauder TV stand', 'Modway bar stool',
    'Tempur-Pedic mattress', 'Ikea wardrobe', 'Zinus platform bed', 'Ashley loveseat', 'AllModern bench'
]

fashion_and_clothing = [
    'Nike shoes', 'Adidas sneakers', 'Levi’s jeans', 'Gucci handbag', 'Rolex watch', 
    'Ray-Ban sunglasses', 'Patagonia jacket', 'H&M dress', 'Michael Kors purse', 'North Face parka',
    'Calvin Klein suit', 'Under Armour hoodie', 'Puma sneakers', 'Tommy Hilfiger t-shirt', 'Lululemon leggings',
    'Vans skate shoes', 'Coach wallet', 'Fossil watch', 'Zara coat', 'Birkenstock sandals'
]

# Beauty and personal-care products used to fill purchase-intent templates.
# Brand names are kept correctly spelled here ('Gillette', previously 'Gilette').
beauty_and_personal_care = [
    'Dior perfume', 'Chanel foundation', 'Neutrogena moisturizer', 'MAC lipstick', 'Olay anti-aging cream', 
    'Pantene shampoo', 'Gillette razor', 'Oral-B electric toothbrush', 'Clarisonic face brush', 'Nivea body lotion',
    'L’Oreal conditioner', 'Revlon hair dryer', 'Estee Lauder serum', 'Clinique cleanser', 'Philips hair trimmer',
    'Remington hair straightener', 'Aveeno sunscreen', 'Aveda hair oil', 'La Roche-Posay sunscreen', 'Anastasia eyebrow pencil'
]

automotives = [
    'Tesla Model S', 'Ford Mustang', 'Chevrolet Camaro', 'Toyota Corolla', 'Honda Civic', 
    'BMW X5', 'Mercedes-Benz GLC', 'Jeep Wrangler', 'Ford F-150', 'Hyundai Tucson',
    'Mazda CX-5', 'Volkswagen Jetta', 'Nissan Altima', 'Dodge Ram', 'Chevrolet Tahoe',
    'Lexus RX', 'Kia Sorento', 'Subaru Outback', 'Volvo XC90', 'Cadillac Escalade'
]

household_items = [
    'Tide laundry detergent', 'Scotch-Brite sponges', 'Bounty paper towels', 'Clorox bleach', 'Ziploc bags', 
    'Swiffer mop', 'Mr. Clean Magic Eraser', 'Glad trash bags', 'Febreze air freshener', 'Lysol disinfectant spray',
    'Dawn dish soap', 'Windex glass cleaner', 'Arm & Hammer baking soda', 'Tupperware', 'Brita water filter',
    'O-Cedar mop', 'Scrub Daddy', 'Bounce dryer sheets', 'Hefty storage containers', 'Method all-purpose cleaner'
]

toys_and_games = [
    'LEGO sets', 'Barbie dolls', 'Hot Wheels cars', 'Nerf blasters', 'Fisher-Price playsets', 
    'Monopoly board game', 'Jenga', 'Uno card game', 'Crayola coloring kits', 'Play-Doh sets',
    'Marvel action figures', 'RC cars', 'Beyblade', 'Transformers toys', 'Super Soaker water guns',
    'Paw Patrol toys', 'My Little Pony dolls', 'Magic: The Gathering cards', 'Lego Mindstorms', 'Nintendo Switch games'
]

books_and_media = [
    'Harry Potter books', 'The Lord of the Rings', 'The Great Gatsby', 'To Kill a Mockingbird', '1984 by George Orwell', 
    'The Catcher in the Rye', 'The Hunger Games', 'Game of Thrones', 'Twilight series', 'Sherlock Holmes novels',
    'The Da Vinci Code', 'The Alchemist', 'The Chronicles of Narnia', 'Percy Jackson series', 'The Maze Runner',
    'The Girl with the Dragon Tattoo', 'Moby Dick', 'Pride and Prejudice', 'The Handmaid’s Tale', 'The Witcher series'
]

sport_equipments = [
    'Nike soccer ball', 'Wilson tennis racket', 'Adidas football cleats', 'Spalding basketball', 'Under Armour workout gloves', 
    'Yonex badminton racket', 'Callaway golf clubs', 'Fitbit fitness tracker', 'Everlast boxing gloves', 'Wilson baseball glove',
    'Babolat tennis shoes', 'Reebok CrossFit gear', 'Nike running shoes', 'Speedo swim goggles', 'Bauer hockey skates',
    'Garmin GPS watch', 'Rawlings baseball bat', 'Easton batting gloves', 'Columbia hiking boots', 'Asics running shoes'
]

gifts = [
    'custom gifts', 'gift cards', 'personalized mugs', 'engraved jewelry',
    'photo frames', 'custom t-shirts', 'personalized blankets', 
    'engraved watches', 'photo books', 'custom calendars'
]

hunting_equipment = [
    'crossbow', 'compound bow', 'hunting knives', 'camouflage clothing',
    'deer stand', 'trail camera', 'hunting boots', 'binoculars', 
    'rangefinder', 'backpack for hunting'
]

eyewear = [
    'prescription glasses', 'sunglasses', 'blue light blocking glasses',
    'bifocals', 'transition lenses', 'polarized sunglasses',
    'contact lenses', 'eyeglass frames', 'sports glasses', 'reading glasses'
]

supplements = [
    'magnesium taurate', 'vitamin D', 'fish oil', 'multivitamins',
    'probiotics', 'protein powder', 'collagen', 'iron supplements',
    'zinc supplements', 'calcium supplements'
]

pet_supplies = [
    'dog food', 'cat food', 'pet beds', 'dog treats', 'pet grooming kits',
    'cat litter', 'dog toys', 'cat scratchers', 'pet carriers', 'pet feeders'
]

bedding = [
    'queen size bedspreads', 'king size comforter sets', 'sheets',
    'duvet covers', 'pillows', 'mattress protectors', 'weighted blankets',
    'electric blankets', 'bamboo sheets', 'silk pillowcases'
]
##
kitchen_appliance = [
    'blender', 'air fryer', 'pressure cooker', 'food processor',
    'stand mixer', 'toaster oven', 'microwave', 'coffee maker',
    'deep fryer', 'slow cooker'
]

automotive_parts = [
    'tires', 'car batteries', 'carburetors', 'brake pads', 
    'windshield wipers', 'car mats', 'air filters', 'engine oil',
    'spark plugs', 'headlights'
]

tech_accessories = [
    'phone case', 'charging cable', 'laptop sleeve', 'wireless charger',
    'screen protector', 'portable battery', 'USB hub', 'headphone adapter',
    'keyboard cover', 'stylus pen'
]

fitness_equipment = [
    'treadmill', 'dumbbells', 'resistance bands', 'exercise bike',
    'yoga mat', 'pull-up bar', 'rowing machine', 'kettlebell',
    'weight bench', 'jump rope'
]

seasonal_products = [
    'Christmas tree', 'holiday lights', 'Halloween costumes',
    'summer outdoor furniture', 'Thanksgiving decorations',
    'winter jackets', 'snow blowers', 'Easter baskets', 'grills', 'pool accessories'
]


In [None]:
# Query templates for the purchase intent. Each "{name}" placeholder is meant
# to be replaced by a product drawn from the matching category list.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated into a single template.
# NOTE(review): "{device}" and "{season}" have no matching key in
# product_categories, so the query generator must skip or fill them itself.
purchase_intent_templates = [
    # Electronics Purchase Intent
    "Where to buy {electronics} online?",
    "Best deals on {electronics} this year",
    "Is {electronics} worth buying in 2024?",
    "Discounts available for {electronics}?",
    # "How to repair {electronics} at home?",
    "Which is better: {electronics} or {electronics}?",
    "Where to buy used {electronics}?",
    "Is {electronics} in stock near me?",
    "What {electronics} make the best gifts?",

    # Home Appliances Purchase Intent
    "Where to buy {home_appliance} at the best price?",
    "How to repair a {home_appliance}?",
    "Best deals on {home_appliance} right now",
    "What are the reviews for {home_appliance}?",
    "Should I upgrade my {home_appliance} this year?",
    "Compare {home_appliance} with {home_appliance} for best value",
    "Are refurbished {home_appliance} worth buying?",
    "Where can I find {home_appliance} available now?",

    # Furniture Purchase Intent
    "What is the best {furniture} for a small space?",
    "Where to buy affordable {furniture} online?",
    "Best places to buy {furniture} for my home",
    "Top-rated {furniture} on sale this weekend",
    "How to assemble {furniture} yourself",
    "Best {furniture} to buy as gifts for new homeowners",

    # Fashion and Clothing Purchase Intent
    "What are the latest deals on {fashion_and_clothing}?",
    "Where to buy {fashion_and_clothing} online?",
    "Top-rated {fashion_and_clothing} for this season",
    "Best styles of {fashion_and_clothing} in 2024",
    "Is {fashion_and_clothing} worth the price?",
    "What {fashion_and_clothing} brands are best for longevity?",

    # Beauty and Personal Care Purchase Intent
    "Best {beauty_and_personal_care} products to buy this year",
    "Where to buy {beauty_and_personal_care} online?",
    "Top reviews for {beauty_and_personal_care} products",
    "Are {beauty_and_personal_care} products worth it?",
    "What are the best deals for {beauty_and_personal_care}?",
    "How to get a subscription for {beauty_and_personal_care} products?",

    # Automotive Purchase Intent
    "Is {automotive} a good car to buy?",
    "Best deals on {automotive} this year",
    "Where to buy {automotive} accessories?",
    "How to finance a new {automotive}?",
    "Top-rated {automotive} models in 2024",
    "What’s the lifespan of {automotive}?",

    # Household Items Purchase Intent
    "What are the top-rated {household_item} this year?",
    "Where to buy {household_item} online?",
    "How to get discounts on {household_item}?",
    "Are {household_item} worth buying?",
    "Top stores for {household_item} deals",

    # Toys and Games Purchase Intent
    "Where to buy {toys_and_games} for kids?",
    "Best reviews for {toys_and_games}",
    "What are the best prices for {toys_and_games}?",
    "What are the top {toys_and_games} for Christmas?",
    "Top-rated {toys_and_games} on sale",

    # Books and Media Purchase Intent
    "Best places to buy {books_and_media} online",
    "What are the reviews for {books_and_media}?",
    "Is {books_and_media} worth buying?",
    "Top-rated {books_and_media} for this year",
    "What are the best deals on {books_and_media}?",
    "Where to buy a subscription for {books_and_media}?",

    # Sports Equipment Purchase Intent
    "What are the best {sport_equipments} to buy?",
    "Where can I find discounts on {sport_equipments}?",
    "Top-rated {sport_equipments} for 2024",
    "Where to buy {sport_equipments} online?",
    "Are {sport_equipments} worth the price?",
    "What are the best deals on {sport_equipments}?",
    "Reviews of {sport_equipments} from users",
    "Where to buy {sport_equipments} for beginners?",
    "What are the must-have {sport_equipments} for athletes?",
    "Top stores offering deals on {sport_equipments}",

    # Gift-Related Queries
    "Where to buy {gifts} online?",
    "Best deals on {gifts} this holiday season",
    "What {gifts} make the best presents?",
    "How to personalize {gifts} for special occasions?",
    "Are {gifts} available for same-day delivery?",

    # Hunting Equipment Queries
    "Best {hunting_equipment} for deer hunting",
    "Where to buy {hunting_equipment} online?",
    "What are the top {hunting_equipment} brands?",
    "How to maintain {hunting_equipment}?",

    # Eyewear Queries
    "Where to order {eyewear} online?",
    "Best prices for {eyewear}",
    "Are {eyewear} available with insurance coverage?",
    "Top-rated {eyewear} for outdoor sports",

    # Supplements Queries
    "Where to buy {supplements} for health?",
    "Top reviews for {supplements}",
    # "What are the benefits of {supplements}?",
    "How to find discounts on {supplements}?",

    # Pet Supplies Queries
    "Where to buy {pet_supplies} online?",
    "What are the best {pet_supplies} for dogs?",
    "Top-rated {pet_supplies} for cats",
    "How to get discounts on {pet_supplies}?",

    # Bedding Queries
    "Where to buy {bedding} on sale?",
    "Best {bedding} for a comfortable night's sleep",
    "How to choose {bedding} for different seasons?",
    "Are {bedding} available for delivery today?",
##
    # Kitchen Appliance Queries
    "Best deals on {kitchen_appliance} this year",
    "Where to buy {kitchen_appliance} online?",
    "How to repair {kitchen_appliance} at home?",
    "Are {kitchen_appliance} worth buying refurbished?",

    # Automotive Parts Queries
    "Where to buy {automotive_parts} for my car?",
    "Best deals on {automotive_parts} this year",
    "How to install {automotive_parts}?",
    "Is {automotive_parts} in stock near me?",

    # Tech Accessories Queries
    "Where to buy {tech_accessories} online?",
    "Best {tech_accessories} for my {electronics}",
    "What are the reviews for {tech_accessories}?",
    "Are {tech_accessories} compatible with {device}?",

    # Fitness Equipment Queries
    "Best {fitness_equipment} to buy for home gym",
    "Where to find {fitness_equipment} deals?",
    "Top-rated {fitness_equipment} for 2024",
    "What are the must-have {fitness_equipment}?",

    # Seasonal Products Queries
    "Where to buy {seasonal_products} during the holiday season?",
    "Best {seasonal_products} for {season}",
    "Are {seasonal_products} available for same-day delivery?",
    "Top-rated {seasonal_products} for this year",
]


In [None]:
len(purchase_intent_templates)

In [None]:
# Placeholder tokens as they appear (braces included) inside
# purchase_intent_templates. Note that several tokens are singular
# (e.g. "{home_appliance}", "{furniture}") while the list variables they map
# to below are plural (home_appliances, furnitures).
ELECTRONICS_PURCHASE = "{electronics}"
HOME_APPLIANCES_PURCHASE = "{home_appliance}"
FURNITURES_PURCHASE = "{furniture}"
FASHION_CLOTHING_PURCHASE = "{fashion_and_clothing}"
BEAUTY_AND_PERSONAL_CARE_PURCHASE = "{beauty_and_personal_care}"
AUTOMOTIVE_PURCHASE = "{automotive}"
HOUSEHOLD_ITEMS_PURCHASE = "{household_item}"
TOYS_AND_GAMES_PURCHASE = "{toys_and_games}"
BOOKS_AND_MEDIA_PURCHASE = "{books_and_media}"
SPORTS_EQUIPMENT_PURCHASE = "{sport_equipments}"
GIFTS_PURCHASE = "{gifts}"
HUNTING_EQUIPMENT_PURCHASE = "{hunting_equipment}"
EYEWEAR_PURCHASE = "{eyewear}"
SUPPLEMENTS_PURCHASE = "{supplements}"
PET_SUPPLIES_PURCHASE = "{pet_supplies}"
BEDDING_PURCHASE = "{bedding}"
KITCHEN_APPLIANCE_PURCHASE = "{kitchen_appliance}"
AUTOMOTIVE_PARTS_PURCHASE = "{automotive_parts}"
TECH_ACCESSORIES_PURCHASE = "{tech_accessories}"
FITNESS_EQUIPMENT_PURCHASE = "{fitness_equipment}"
SEASONAL_PRODUCTS_PURCHASE = "{seasonal_products}"

# Maps each placeholder token to the list of products that may replace it.
# Insertion order matters: detect_product returns the first key found in a
# template, so "{electronics}" wins over any later category.
product_categories = {
    ELECTRONICS_PURCHASE: electronics,
    HOME_APPLIANCES_PURCHASE: home_appliances,
    FURNITURES_PURCHASE: furnitures,
    FASHION_CLOTHING_PURCHASE: fashion_and_clothing,
    BEAUTY_AND_PERSONAL_CARE_PURCHASE: beauty_and_personal_care,
    AUTOMOTIVE_PURCHASE: automotives,
    HOUSEHOLD_ITEMS_PURCHASE: household_items,
    TOYS_AND_GAMES_PURCHASE: toys_and_games,
    BOOKS_AND_MEDIA_PURCHASE: books_and_media,
    SPORTS_EQUIPMENT_PURCHASE: sport_equipments,
    GIFTS_PURCHASE: gifts,
    HUNTING_EQUIPMENT_PURCHASE: hunting_equipment,
    EYEWEAR_PURCHASE: eyewear,
    SUPPLEMENTS_PURCHASE: supplements,
    PET_SUPPLIES_PURCHASE: pet_supplies,
    BEDDING_PURCHASE: bedding,
    KITCHEN_APPLIANCE_PURCHASE: kitchen_appliance,
    AUTOMOTIVE_PARTS_PURCHASE: automotive_parts,
    TECH_ACCESSORIES_PURCHASE: tech_accessories,
    FITNESS_EQUIPMENT_PURCHASE: fitness_equipment,
    SEASONAL_PRODUCTS_PURCHASE: seasonal_products,
}

def detect_product(product_categories, template):
    """Return the first key of `product_categories` that occurs in `template`.

    Keys are checked in the mapping's iteration order using substring
    matching. `None` is returned when no key occurs in the template.
    """
    matches = (key for key in product_categories if key in template)
    return next(matches, None)

def generate_queries(templates, n_queries=1000, categories=None, max_stalled_attempts=50000):
    """Generate unique purchase-intent queries by filling template placeholders.

    A template is picked at random and every "{name}" placeholder that has an
    entry in `categories` is replaced by a randomly chosen product. When the
    same placeholder appears several times in one template (e.g. a comparison
    "{x} or {x}"), distinct products are drawn for each occurrence whenever
    the category has enough items. Queries that still contain an unresolved
    placeholder (no matching category) are discarded rather than emitted with
    literal braces.

    Args:
        templates: Non-empty sequence of template strings.
        n_queries: Number of unique queries to aim for.
        categories: Mapping of placeholder token (braces included, e.g.
            "{electronics}") to a list of products. Defaults to the
            module-level `product_categories`.
        max_stalled_attempts: Number of consecutive rejected attempts
            (duplicates or unresolved templates) after which generation stops,
            so the loop terminates when unique queries run out.

    Returns:
        List of unique queries in generation order; may be shorter than
        `n_queries` if generation stalled.
    """
    import re

    if categories is None:
        categories = product_categories
    placeholder_re = re.compile(r"\{[^{}]*\}")

    def _fill(template):
        # Decide all replacements up front so repeated slots get distinct products
        slots = placeholder_re.findall(template)
        draws = {}
        for key in dict.fromkeys(slots):  # unique keys, first-seen order
            options = categories.get(key)
            if not options:
                continue
            needed = slots.count(key)
            picked = random.sample(list(options), min(needed, len(options)))
            # More slots than products: top up with repeats
            picked += [random.choice(options) for _ in range(needed - len(picked))]
            draws[key] = iter(picked)

        def _replace(match):
            key = match.group(0)
            return next(draws[key]) if key in draws else key

        return placeholder_re.sub(_replace, template)

    queries = []
    query_set = set()
    stalled = 0
    while len(queries) < n_queries:
        if stalled >= max_stalled_attempts:
            print(
                f"Stopping early after {stalled} consecutive rejected attempts; "
                f"generated {len(queries)} of {n_queries} queries"
            )
            break
        query = _fill(random.choice(templates))
        if "{" in query or query in query_set:
            stalled += 1
            continue
        stalled = 0
        queries.append(query)
        query_set.add(query)
        # Report progress once per 500 accepted queries
        if len(queries) % 500 == 0:
            print(f"{len(queries)} examples added")
    return queries

In [None]:
purchase_intent_queries = generate_queries(purchase_intent_templates, n_queries=1700)

In [None]:
len(purchase_intent_queries)

In [None]:
# Labelled frame: one generated query per row, all tagged as purchase_intent
purchase_intent_examples = pd.DataFrame(
    purchase_intent_queries,
    columns=['sequence'],
).assign(target='purchase_intent')
purchase_intent_examples

In [None]:
import json 

def get_geonames_city_state_data(geonames_file="../data/geonames-cities-states.json"):
    """Load the geonames city/state JSON into one population-weighted table.

    The JSON document is expected to contain a 'cities' list (records with
    'id', 'admin1_code', 'name', 'population', 'alternate_names') and a
    'states_by_abbr' mapping whose values carry 'admin1_code' and 'name'.

    Args:
        geonames_file: Path to the JSON file. Defaults to the notebook's
            relative data location.

    Returns:
        DataFrame with one row per city and the columns id, state_code,
        city_name, city_popln, alternate_names, state_name and city_weight,
        where city_weight is the city's share of the total population.
        Cities whose state code has no match keep a missing state_name
        (left join).
    """
    # JSON is UTF-8; be explicit so non-ASCII city names load the same way
    # regardless of the platform's default encoding.
    with open(geonames_file, 'r', encoding='utf-8') as f:
        geonames_dict = json.load(f)

    cities_data = pd.DataFrame(geonames_dict['cities'])\
                    .rename(columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'})
    cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]

    states_data = pd.DataFrame(list(geonames_dict['states_by_abbr'].values()))\
                    .rename(columns={'admin1_code': 'state_code', 'name': 'state_name'})
    states_data = states_data[['state_code', 'state_name']]

    city_states_data = cities_data.merge(states_data, how='left', on='state_code')
    city_states_data['city_weight'] = city_states_data['city_popln'] / city_states_data['city_popln'].sum()
    return city_states_data

In [None]:
# Load the merged city/state table, then derive the lookups used for sampling
city_states_data = get_geonames_city_state_data()

# city name -> sampling weight (for repeated city names the last row wins)
city_weights = city_states_data.set_index('city_name')['city_weight'].to_dict()

# Independent copies: city + state abbreviation, and city + full state name
city_state_code_info = city_states_data.loc[:, ['city_name', 'state_code', 'city_weight']].copy()
city_state_name_info = city_states_data.loc[:, ['city_name', 'state_name', 'city_weight']].copy()

In [None]:
def get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8):
    """Sample one "city, state" string, weighted by 'city_weight'.

    A uniform random draw decides the state format: when it is at most
    `state_code_threshold` the row is sampled from `city_state_code_info` and
    formatted with 'state_code'; otherwise it is sampled from
    `city_state_name_info` and formatted with 'state_name'.
    """
    use_state_code = random.random() <= state_code_threshold
    if use_state_code:
        source, state_column = city_state_code_info, 'state_code'
    else:
        source, state_column = city_state_name_info, 'state_name'

    picked_row = source.sample(1, weights='city_weight', replace=True)
    city, state = picked_row[['city_name', state_column]].values.tolist()[0]
    return ', '.join([city, state])

city_state=get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)

In [None]:
city_state

In [None]:
from collections import Counter

# Draw 10,000 weighted "city, state" samples and tally how often each appears
city_states_counter = Counter(
    get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)
    for _ in range(10000)
)

# Keep the 200 most frequently drawn strings, most common first
city_state = [sampled for sampled, _times in city_states_counter.most_common(200)]

In [None]:
city_state


#### some additional augmented yelp intent queries

In [None]:
home_maintenance_repair = [
    'roofing', 'flooring', 'plumbing', 'house painting', 'carpet installation',
    'hardwood floor refinishing', 'drywall repair', 'electrical services', 'window installation',
    'HVAC services', 'fencing', 'roof replacement', 'gutter repair', 'kitchen renovation',
    'bathroom remodeling', 'exterior painting', 'interior painting', 'concrete repair',
    'driveway paving', 'deck repair', 'plumber',
]

moving_storage = [
    'local movers', 'long-distance movers', 'packing services', 'furniture movers',
    'storage solutions', 'cross-country movers', 'apartment moving', 'pool table moving',
    'interstate movers', 'moving truck rental', 'moving labor', 'junk removal',
    'relocation services', 'packing supplies', 'office movers', 'vehicle shipping',
    'moving container rental', 'small move services', 'senior moving services', 'pet relocation'
]

restaurants_food = [
    'sushi', 'tacos', 'pizza', 'burgers', 'ramen', 'pasta', 'salads', 'barbecue', 'ice cream',
    'acai bowls', 'vegan food', 'steakhouses', 'buffet restaurants', 'seafood restaurants',
    'brunch spots', 'food trucks', 'fast food', 'diner', 'Mexican restaurants', 'Italian restaurants',
    'fried chicken', 'hot dogs', 'donuts', 'bagels', 'barbecue chicken',
    'buffalo wings', 'grilled cheese', 'cheesesteak', 'poutine', 'meatloaf',
    'fried fish', 'soul food', 'dim sum', 'dumplings', 'tapas',
    'Indian restaurants', 'Mediterranean food', 'Korean BBQ', 'Vietnamese pho', 'crepes',

]

health_wellness = [
    'dermatologists', 'dentists', 'optometrists', 'pediatricians', 'pharmacies',
    'acupuncture', 'chiropractors', 'physical therapy', 'massage therapy', 'eyebrow threading',
    'laser hair removal', 'facials', 'hair salons', 'nail salons', 'spas', 'mental health counseling',
    'cosmetic surgery', 'nutritionists', 'fitness trainers', 'wellness centers'
]

car_repair_automotive_services = [
    'oil change', 'tire replacement', 'brake repair', 'car inspection', 'car detailing',
    'transmission repair', 'engine diagnostics', 'battery replacement', 'alignment services',
    'auto body repair', 'windshield replacement', 'car wash', 'wheel alignment', 'car painting',
    'exhaust system repair', 'auto glass repair', 'AC repair', 'tune-up services', 'car rental', 'car towing'
]

cleaning_services = [
    'house cleaning', 'deep cleaning', 'maid services', 'disinfection and sanitization', 'carpet cleaning',
    'window cleaning', 'move-out cleaning', 'office cleaning', 'apartment cleaning', 'laundry services',
    'floor cleaning', 'pressure washing', 'garage cleaning', 'post-construction cleaning',
    'air duct cleaning', 'roof cleaning', 'tile and grout cleaning', 'upholstery cleaning',
    'yard cleaning', 'organizing services'
]

entertainment_activities = [
    'bowling', 'karaoke', 'movie theaters', 'mini-golf', 'amusement parks', 'live music venues', 'escape rooms',
    'arcades', 'zoos', 'aquariums', 'water parks', 'comedy clubs', 'museums', 'laser tag', 'go-karts',
    'roller skating', 'trampoline parks', 'horseback riding', 'batting cages', 'rock climbing gyms'
]

beauty_personal_care = [
    'hair salons', 'barbershops', 'nail salons', 'spas', 'eyebrow threading', 'facials', 'microblading',
    'laser hair removal', 'brow lamination', 'spray tanning', 'makeup artists', 'cosmetic surgery',
    'waxing services', 'beauty salons', 'eyelash extensions', 'massage therapy', 'piercing', 'acne treatments',
    'dermatology', 'body sculpting'
]

specialty_shops_services = [
    'embroidery services', 'custom painting', 'interior design', 'furniture restoration', 'florists', 
    'tailors', 'wedding planners', 'personal chefs', 'home organizers', 'antique shops', 'handyman services',
    'custom cabinet makers', 'fence installation', 'deck building', 'security system installation', 'pest control',
    'landscaping services', 'pet grooming', 'art restoration', 'personal trainers'
]

city_short = ["sf", "sfo", "san francisco",
              "nyc", "new york",
              "la", "lax",
              "chi", "chicago",
              "hou", "houston",
              "mia", "miami",
              "vegas", "lv",
              "bos", "boston", 
              "sea", "seattle", 
              "atl", "atlanta",
              "dfw", "dallas",
              "dc", "washington",
              "philly", "philadelphia",
              "phx", "phoenix",
              "sd", "sandiego",
              "den", "denver",
              "orl", "orlando",
              "atx", "austin",
              "nash", "nashville",
              "pdx", "portland",
              "nola", "new orleans",
              "sat", "san antonio",
              "clt", "charlotte",
              "det", "detroit",
              "tpa", "tampa",
              "balt", "baltimore",
              "cle", "cleveland",
              "mpls", "minneapolis",
              "slc", "salt lake city",
              "indy", "indianapolis",
              "kc", "kansas city",
]


In [None]:
# Additional yelp-intent templates: ten per service category. Placeholders
# ("{category}", "{city_state}", "{city_short}") are filled by the service
# query generator.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated into a single template.
yelp_intent_additional_templates = [
    # Home Maintenance & Repair
    "Find {home_maintenance_repair} near me",
    "Best {home_maintenance_repair} providers in {city_state}",
    "Affordable {home_maintenance_repair} in {city_state}",
    "Top-rated {home_maintenance_repair} companies",
    "How much does {home_maintenance_repair} cost?",
    "Compare reviews of {home_maintenance_repair} providers",
    "{city_short} {home_maintenance_repair}",
    "{home_maintenance_repair} {city_short}",
    "{city_state} {home_maintenance_repair}",
    "{home_maintenance_repair} {city_state}",

    # Moving & Storage
    "Best {moving_storage} for long distance moving",
    "Find a local {moving_storage} company",
    "Compare prices for {moving_storage} in {city_state}",
    "Reviews of {moving_storage} companies near me",
    "How to hire {moving_storage} for a small move",
    "Best-rated {moving_storage} companies",
    "{city_short} {moving_storage}",
    "{moving_storage} {city_short}",
    "{city_state} {moving_storage}",
    "{moving_storage} {city_state}",

    # Restaurants & Food
    "Best {restaurants_food} near me",
    "Top {restaurants_food} reviews in {city_state}",
    "Affordable {restaurants_food} options near me",
    "Where to find the best {restaurants_food} in {city_state}?",
    "Compare {restaurants_food} reviews in {city_state}",
    "5-star {restaurants_food} recommendations",
    "{city_short} {restaurants_food}",
    "{restaurants_food} {city_short}",
    "{city_state} {restaurants_food}",
    "{restaurants_food} {city_state}",

    # Health & Wellness
    "Find {health_wellness} near me",
    "Best-rated {health_wellness} in {city_state}",
    "Affordable {health_wellness} options near me",
    "Compare reviews for {health_wellness} providers",
    "Top doctors and clinics for {health_wellness}",
    "How much does {health_wellness} cost?",
    "{city_short} {health_wellness}",
    "{health_wellness} {city_short}",
    "{city_state} {health_wellness}",
    "{health_wellness} {city_state}",

    # Car Repair & Automotive Services
    "Find {car_repair_automotive_services} near me",
    "Best {car_repair_automotive_services} providers in {city_state}",
    "Top car repair shops for {car_repair_automotive_services}",
    "Compare prices for {car_repair_automotive_services}",
    "Affordable {car_repair_automotive_services} options near me",
    "Top-rated {car_repair_automotive_services} providers",
    "{city_short} {car_repair_automotive_services}",
    "{car_repair_automotive_services} {city_short}",
    "{city_state} {car_repair_automotive_services}",
    "{car_repair_automotive_services} {city_state}",

    # Cleaning Services
    "Find a {cleaning_services} in {city_state}",
    "Affordable {cleaning_services} options near me",
    "Compare reviews for {cleaning_services} providers",
    "Get a {cleaning_services} quote near me",
    "Best {cleaning_services} providers in {city_state}",
    "How much does {cleaning_services} cost?",
    "{city_short} {cleaning_services}",
    "{cleaning_services} {city_short}",
    "{city_state} {cleaning_services}",
    "{cleaning_services} {city_state}",

    # Entertainment & Activities
    "Best {entertainment_activities} near me",
    "Top-rated {entertainment_activities} in {city_state}",
    "Where to find {entertainment_activities} options in {city_state}?",
    "Affordable {entertainment_activities} activities near me",
    "Compare reviews for {entertainment_activities} venues",
    "Top places for {entertainment_activities} this weekend",
    "{city_short} {entertainment_activities}",
    "{entertainment_activities} {city_short}",
    "{city_state} {entertainment_activities}",
    "{entertainment_activities} {city_state}",

    # Beauty & Personal Care
    "Find {beauty_personal_care} near me",
    "Top-rated {beauty_personal_care} salons in {city_state}",
    "Compare reviews for {beauty_personal_care} providers",
    "Affordable {beauty_personal_care} services near me",
    "Best {beauty_personal_care} options in {city_state}",
    "How much does {beauty_personal_care} cost?",
    "{city_short} {beauty_personal_care}",
    "{beauty_personal_care} {city_short}",
    "{city_state} {beauty_personal_care}",
    "{beauty_personal_care} {city_state}",

    # Specialty Shops & Services
    "Where to find {specialty_shops_services} in {city_state}?",
    "Best reviews for {specialty_shops_services} near me",
    "Affordable {specialty_shops_services} options near me",
    "Compare {specialty_shops_services} providers in {city_state}",
    "How to hire {specialty_shops_services} professionals",
    "Top-rated {specialty_shops_services} in {city_state}",
    "{city_short} {specialty_shops_services}",
    "{specialty_shops_services} {city_short}",
    "{city_state} {specialty_shops_services}",
    "{specialty_shops_services} {city_state}",
]


In [None]:
len(yelp_intent_additional_templates)

In [None]:
def detect_service(service_categories, template):
    """Return the placeholder keys of ``service_categories`` found in ``template``.

    Args:
        service_categories: Mapping of placeholder token (e.g. ``"{city_state}"``)
            to a list of candidate fill-in values.
        template: Query template that may contain placeholder tokens.

    Returns:
        List of the matching keys, in the mapping's iteration order.
    """
    return [category for category in service_categories if category in template]


def generate_service_queries(service_categories, templates, n_queries=1000,
                             max_consecutive_failures=100_000):
    """Generate up to ``n_queries`` unique queries by randomly filling templates.

    Each round picks a random template and replaces every known placeholder with
    a random value from ``service_categories``. A query is kept only if it has no
    unresolved ``{`` left and has not been produced before.

    Args:
        service_categories: Mapping of placeholder token to list of fill-in values.
        templates: Sequence of template strings to sample from.
        n_queries: Number of unique queries requested.
        max_consecutive_failures: Give up after this many rounds in a row that
            yield nothing new (duplicate or unresolved placeholder). Prevents an
            endless loop when fewer than ``n_queries`` unique queries exist.

    Returns:
        List of unique query strings in generation order; shorter than
        ``n_queries`` only if the generator gave up early.
    """
    queries = []
    query_set = set()
    failures = 0
    while len(queries) < n_queries:
        if failures >= max_consecutive_failures:
            print(
                f"Stopping early: only {len(queries)} of {n_queries} unique "
                f"queries could be generated"
            )
            break
        template = random.choice(templates)
        query = template
        for category in detect_service(service_categories, template):
            if category:
                service = random.choice(service_categories[category])
                # Rebuild the "{name}" token from the key before substituting.
                placeholder = "{" + category.replace("{", "").replace("}", "") + "}"
                query = query.replace(placeholder, service)
        # Checked after all substitutions so that templates without any
        # placeholder are accepted too (once, thanks to the duplicate check).
        if "{" in query or query in query_set:
            failures += 1
            continue
        failures = 0
        queries.append(query)
        query_set.add(query)
        # Report progress only when the count actually reaches a multiple of 1000.
        if len(queries) % 1000 == 0:
            print(f"{len(queries)} examples added")
    return queries

In [None]:
# Placeholder tokens that appear verbatim (braces included) inside the
# Yelp-intent query templates.
HOME_MAINTENANCE_REPAIR = "{home_maintenance_repair}"
MOVING_STORAGE = "{moving_storage}"
RESTAURANT_FOOD = "{restaurants_food}"
HEALTH_WELLNESS = "{health_wellness}"
CAR_REPAIR_AUTOMOTIVE_SERVICES = "{car_repair_automotive_services}"
CLEANING_SERVICES = "{cleaning_services}"
ENTERTAINMENT_ACTIVITIES = "{entertainment_activities}"
BEAUTY_PERSONAL_CARE = "{beauty_personal_care}"
SPECIALITY_SHOPS_SERVICES = "{specialty_shops_services}"
CITY_STATES = "{city_state}"
CITY_SHORT = "{city_short}"


# Maps each placeholder token to the list of values that may replace it.
# NOTE(review): the value lists are defined outside this section; confirm they
# all exist before this cell runs.
service_categories = {
    HOME_MAINTENANCE_REPAIR: home_maintenance_repair,
    MOVING_STORAGE: moving_storage,
    RESTAURANT_FOOD: restaurants_food,
    HEALTH_WELLNESS: health_wellness,
    CAR_REPAIR_AUTOMOTIVE_SERVICES: car_repair_automotive_services,
    CLEANING_SERVICES: cleaning_services,
    ENTERTAINMENT_ACTIVITIES: entertainment_activities,
    BEAUTY_PERSONAL_CARE: beauty_personal_care,
    SPECIALITY_SHOPS_SERVICES: specialty_shops_services,
    CITY_STATES: city_state,
    CITY_SHORT: city_short,
}


In [None]:
# Sample unique Yelp-intent queries by filling the templates at random.
yelp_intent_additional_queries = generate_service_queries(
    service_categories=service_categories,
    templates=yelp_intent_additional_templates,
    n_queries=15000,  # 20000
)
print(len(yelp_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the Yelp intent.
yelp_intent_additional_queries_df = pd.DataFrame(
    yelp_intent_additional_queries, columns=['sequence']
).assign(target='yelp_intent')
yelp_intent_additional_queries_df

#### Navigation intent additional queries

In [None]:
# Templates for synthetic navigation-intent queries. Every "{name}" token is a
# placeholder that generate_service_queries fills from navigation_categories.
# NOTE: templates using {business}, {store}, {movie}, {course}, {tech_company},
# {concert}, {topic} or {task} have no matching entry in navigation_categories,
# so they never produce a query until those categories are added.
navigation_intent_templates = [
    # Routing Numbers & Bank Information
    "routing number for {bank}",
    "address of {bank}",
    "what is the routing number for {bank}",
    "verify routing number for {bank}",
    "contact number for {bank} customer service",
    "find routing number of {bank} in {location}",
    "routing number for {credit_union}",

    # Company & Service Support
    "support number for {service}",
    "how to contact {service} support",
    "customer support number for {service}",
    "cancel {service} account",
    "what is the {service} customer care number",
    "fax number for {service}",
    "call {service} customer service",

    # Login or Account Information
    "login to {service} account",
    "how to login to {service} on my computer",
    "account management for {service}",
    "reset password for {service}",
    "forgot login details for {service}",
    "how to access {service} account",

    # Addresses & Locations
    "address for {location}",
    "find address of {business} in {location}",
    "location of {business}",
    "where is {place} located",
    "directions to {place}",
    "address of {store} in {location}",

    # Cancellation or Service Changes
    "cancel {service} subscription",
    "change address for {service}",
    "cancellation fee for {service}",
    "cancellation policy for {service}",
    "how to cancel {service} account",
    "cancel {service} membership",

    "features of {product}",

    # TV Shows, Movies, and Streaming Services
    "is {show} on {streaming_service}?",
    "is {movie} available on {platform}?",
    "does {device} support {service}?",
    "can I watch {show} on {device}?",
    "how to stream {show} on {platform}",
    "is {show} canceled?",

    # Educational Resources & Information
    "tuition fee for {university} in 2024",
    "how to apply for {course} on {learning_platform}",
    "contact {university} admissions office",
    "academic calendar for {university}",
    "what is the login for {learning_platform}?",

    # Shipping & Tracking
    "track my package on {shipping_service}",
    "shipping cost for {product} on {platform}",
    "what is the tracking number for {courier}?",
    "how to track {courier} delivery?",
    "where is my {shipping_service} package?",

    # Government Services & Documents
    "how to renew my driver’s license with {state_dmv}",
    "where is the closest post office?",
    "how to apply for a passport in the US",
    "IRS contact number for tax queries",
    "how to change address with {state_dmv}",

    # Finance & Banking
    "how to increase my credit limit with {bank}",
    "where to find {bank} ATM near me",
    "credit score needed for {credit_card}",
    # "what are the benefits of {credit_card}?",
    "how to apply for a mortgage with {bank}",

    # Tech Support & Troubleshooting
    "how to fix {device} screen issue",
    "support number for {tech_company}",
    "how to update {software} on {device}",
    "what to do if {device} won’t start?",
    "reset password for {account} on {device}",

    # Employment & Career
    "find job openings at {company}",
    "what are the job duties for {position}?",
    "how to apply for {job_role} at {company}",
    "contact HR at {company}",
    "career advice for {industry}",

    # Public Services & Utilities
    "pay my electricity bill with {utility_company}",
    "find waste management services near me",
    "report a power outage with {utility_company}",
    "how to sign up for {utility_service}?",
    "how to contact {utility_company} support?",

    # Events & Ticketing
    "find concert tickets for {artist} on {platform}",
    "how to book tickets for {event}?",
    "find the best seats for {concert}",
    "how to get discounts for {festival} tickets?",
    # The trailing comma matters: without it Python concatenates this string
    # with the next one into a single malformed template.
    "ticket refund policy for {platform}",

    # Email & Account Access
    "login to {email_provider}",
    "access {email_provider} on my computer",
    "forgot password for {login_service}",
    "how to reset password for {login_service} account",
    "access my {email_provider} inbox",

    # Government Services
    "how to track my refund on {government_service} website",
    "get support from {government_service} for {topic}",
    "how to check my status with {government_service}",
    "apply for services through {government_service}",

    # Financial Services & Bank Support
    "login to {financial_service} account",
    "how to check balance on {financial_service}",
    "support number for {financial_service} customer service",
    "pay my bill with {financial_service}",

    # Software & Device Support
    "fix {device} issues with {support_service} support",
    "how to troubleshoot {software} problems",
    "download {software} for {device}",
    "check updates for {software}",

    # Other Services & General Queries
    "how to download {software} for {task}",
    "install {software} on {device}",
    "find customer support number for {support_service}",
    "how to change account details for {login_service}",

    # General Navigation Queries
    "how do I sign in to {domain}",
    "login to {domain} account",
    "where is the sign-in page on {domain}",
    "reset my password on {domain}",
    "authenticate my account on {domain}",
    "how to sign up for {domain} account",

    # Registration & Account Creation
    "create an account on {domain}",
    "how to register on {domain}",
    "where can I sign up for {domain}",
    "register for a new account on {domain}",
    "sign up for {domain} services",

    # Login, Sign-in, Authentication
    "how do I log into {domain}",
    "sign into {domain} with email",
    "can I sign in to {domain} with my phone number",
    "how do I recover my password on {domain}",
    "log out of {domain} account",

    # Forms & Document Submission
    "where to submit forms on {domain}",
    "download forms from {domain}",
    "upload documents to {domain}",
    "how do I submit a form on {domain}",
    "find registration forms on {domain}",

    # Contact & Customer Support
    "how do I contact support on {domain}",
    "where is the customer service number on {domain}",
    "how do I get help on {domain}",
    "contact support on {domain} for issues",
    "find contact info on {domain}",

    # Tracking & Status Updates
    "track my package on {domain}",
    "check my order status on {domain}",
    "how do I track a shipment on {domain}",
    "where is the tracking page on {domain}",
    "track delivery updates on {domain}",

    # Bare URL-style queries
    "{domain}/jobs",
    "{domain}/careers",
    "{domain}/login",
    "{domain}/signin",
    "{domain}/sign in",
]


In [None]:
# Sanity check: number of navigation-intent templates defined above.
len(navigation_intent_templates)

In [None]:
# Fill-in values for the placeholders used by navigation_intent_templates.
# Each list below is bound to its "{name}" placeholder in navigation_categories.
bank = [
    'Wells Fargo', 'Bank of America', 'Chase', 'TD Bank', 'PNC',
    'Citibank', 'US Bank', 'Capital One', 'HSBC', 'Fifth Third Bank',
    'Regions Bank', 'Ally Bank', 'SunTrust', 'KeyBank', 'M&T Bank'
]

credit_card = [
    'Chase Sapphire Preferred', 'Capital One Venture Rewards', 'American Express Platinum', 
    'Citi Double Cash', 'Discover It Cash Back', 
    'Wells Fargo Active Cash', 'Bank of America Travel Rewards', 
    'Chase Freedom Unlimited', 'Capital One Quicksilver', 'U.S. Bank Visa Platinum',
    'American Express Gold', 'Citi Premier Card', 'Discover It Miles',
    'Barclays AAdvantage Aviator Red', 'Amazon Prime Rewards Visa Signature',
    'Delta SkyMiles Platinum American Express', 'Hilton Honors American Express Surpass',
    'Southwest Rapid Rewards Plus', 'Marriott Bonvoy Boundless', 'United Explorer Card'
]

# NOTE(review): `location` is assigned again, with different cities, in the
# travel-intent section further down; navigation_categories must be built
# before that later cell runs to pick up this list.
location = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami',
    'Dallas', 'San Francisco', 'Atlanta', 'Seattle', 'Boston',
    'Phoenix', 'Orlando', 'Philadelphia', 'Denver', 'Las Vegas'
]

credit_union = [
    'Navy Federal Credit Union', 'Alliant Credit Union', 'Golden 1 Credit Union',
    'First Tech Federal Credit Union', 'America First Credit Union', 'Pentagon Federal Credit Union',
    'San Diego County Credit Union', 'Suncoast Credit Union', 'BECU', 'Teachers Federal Credit Union',
    'Keesler Federal Credit Union', 'Valley First Credit Union', 'River Region Credit Union',
    'Champion Credit Union', 'Mountain America Credit Union'
]

service = [
    'Netflix', 'Spotify', 'Amazon Prime', 'Google Drive', 'Uber',
    'Disney+', 'YouTube Premium', 'Dropbox', 'Zoom', 'Venmo',
    'Lyft', 'Twitch', 'Slack', 'LinkedIn', 'DoorDash'
]

product = [
    'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'AirPods',
    'Sony TV', 'Apple Watch', 'Bose headphones', 'Canon DSLR', 'GoPro',
    'Microsoft Surface', 'Google Pixel', 'Fitbit', 'Nintendo Switch', 'Xbox Series X'
]

platform = [
    'Amazon', 'eBay', 'Walmart', 'Best Buy', 'Target',
    'Apple Store', 'Google Store', 'Newegg', 'B&H', 'Costco'
]

streaming_service = [
    'Netflix', 'Hulu', 'Amazon Prime', 'Disney+', 'HBO Max',
    'Apple TV+', 'Peacock', 'Paramount+', 'YouTube TV', 'Sling TV'
]

show = [
    'Breaking Bad', 'Stranger Things', 'Game of Thrones', 'Friends', 'The Office',
    'The Mandalorian', 'The Crown', 'WandaVision', 'Loki', 'The Boys'
]

learning_platform = [
    'Coursera', 'Udemy', 'edX', 'Khan Academy', 'LinkedIn Learning',
    'Pluralsight', 'Skillshare', 'Codecademy', 'Udacity', 'FutureLearn'
]

shipping_service = [
    'FedEx', 'UPS', 'USPS', 'DHL', 'Amazon Logistics',
    'Yanwen', 'Aramex', 'Canada Post', 'Royal Mail', 'Hermes'
]

courier = [
    'FedEx', 'UPS', 'DHL', 'USPS', 'Aramex',
    'Yanwen', 'Canada Post', 'Royal Mail', 'Hermes', 'TNT'
]

university = [
    'Harvard University', 'Stanford University', 'Massachusetts Institute of Technology', 
    'University of California, Berkeley', 'Princeton University',
    'Yale University', 'Columbia University', 'University of Chicago', 'New York University', 'University of Michigan'
]

state_dmv = [
    'California DMV', 'New York DMV', 'Texas DMV', 'Florida DMV', 'Illinois DMV',
    'Pennsylvania DMV', 'Ohio BMV', 'Georgia DDS', 'Virginia DMV', 'New Jersey MVC'
]

utility_company = [
    'Pacific Gas & Electric', 'Duke Energy', 'Con Edison', 'Southern California Edison', 'National Grid',
    'Xcel Energy', 'Florida Power & Light', 'PSEG', 'Dominion Energy', 'Consumers Energy'
]

utility_service = [
    'electricity', 'water supply', 'natural gas', 'internet', 'cable TV',
    'trash collection', 'sewage service', 'recycling pickup', 'phone service',
    'solar power', 'wind energy', 'fiber internet', 'home security system', 
    'smart meter installation', 'smart thermostat installation',
    'geothermal heating', 'propane service', 'stormwater management',
    'emergency power backup', 'district heating'
]

event = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Comic-Con', 'The Oscars',
    'Super Bowl', 'World Series', 'NBA Finals', 'Wimbledon', 'Grammy Awards'
]

company = [
    'Google', 'Apple', 'Microsoft', 'Facebook', 'Amazon',
    'Tesla', 'Twitter', 'Netflix', 'Airbnb', 'Spotify'
]

device = [
    'iPhone', 'MacBook', 'Samsung Galaxy', 'iPad', 'PlayStation 5',
    'Xbox Series X', 'Apple Watch', 'Fitbit', 'Surface Pro', 'Nintendo Switch'
]

festival = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Tomorrowland', 'SXSW',
    'Glastonbury', 'Oktoberfest', 'Mardi Gras', 'Cannes Film Festival', 'Sundance Film Festival',
    'Ultra Music Festival', 'New Orleans Jazz & Heritage Festival', 'Austin City Limits', 'Bonnaroo', 'Electric Daisy Carnival',
    'Stagecoach', 'Summerfest', 'Essence Festival', 'Rock in Rio', 'Woodstock'
]

artist = [
    'Taylor Swift', 'Beyoncé', 'Ed Sheeran', 'Drake', 'Ariana Grande',
    'Billie Eilish', 'The Weeknd', 'Justin Bieber', 'Kanye West', 'Rihanna',
    'Bruno Mars', 'Shawn Mendes', 'Dua Lipa', 'Travis Scott', 'Lady Gaga',
    'Post Malone', 'Harry Styles', 'Adele', 'Coldplay', 'Imagine Dragons'
]

job_role = [
    'Software Engineer', 'Data Scientist', 'Marketing Manager', 'Graphic Designer', 'Project Manager',
    'Sales Representative', 'Accountant', 'Nurse', 'Mechanical Engineer', 'Product Manager',
    'Business Analyst', 'Consultant', 'UX/UI Designer', 'Customer Support Specialist', 'Operations Manager',
    'Human Resources Manager', 'Financial Analyst', 'Social Media Manager', 'Content Writer', 'DevOps Engineer'
]

position = [
    'Software Developer', 'Senior Manager', 'Account Executive', 'Nurse Practitioner', 'Mechanical Technician',
    'Business Consultant', 'Marketing Director', 'Sales Engineer', 'Systems Analyst', 'Financial Consultant',
    'HR Specialist', 'Executive Assistant', 'Data Engineer', 'Legal Advisor', 'Product Owner',
    'Operations Director', 'IT Administrator', 'Brand Manager', 'Customer Service Representative', 'Medical Assistant'
]

industry = [
    'technology', 'finance', 'healthcare', 'manufacturing', 'education',
    'real estate', 'marketing', 'media', 'retail', 'automotive',
    'hospitality', 'construction', 'pharmaceutical', 'telecommunications', 'energy',
    'transportation', 'insurance', 'consulting', 'legal', 'entertainment'
]

account = [
    'Google account', 'Facebook account', 'Apple account', 'Amazon account', 'Netflix account',
    'Spotify account', 'Microsoft account', 'Instagram account', 'Twitter account', 'Uber account',
    'Dropbox account', 'LinkedIn account', 'Slack account', 'Zoom account', 'Venmo account',
    'PayPal account', 'eBay account', 'Airbnb account', 'Twitch account', 'Pinterest account'
]

software = [
    'Windows 10', 'macOS', 'Microsoft Office', 'Adobe Photoshop', 'Slack',
    'Zoom', 'Google Chrome', 'Firefox', 'Visual Studio Code', 'Python',
    'Java', 'Salesforce', 'WordPress', 'AutoCAD', 'Tableau',
    'SQL Server', 'GitHub', 'IntelliJ IDEA', 'Figma', 'Trello'
]

place = [
    'Disneyland', 'Eiffel Tower', 'Statue of Liberty', 'The Grand Canyon', 'The Colosseum',
    'Empire State Building', 'Golden Gate Bridge', 'Mount Rushmore', 'Niagara Falls', 'The Louvre',
    'Big Ben', 'The Vatican', 'Great Wall of China', 'Times Square', 'Central Park',
    'Sydney Opera House', 'Stonehenge', 'Machu Picchu', 'Christ the Redeemer', 'The Pyramids of Giza'
]

email_provider = ['Gmail', 'Yahoo Mail', 'Outlook', 'iCloud', 'ProtonMail']
login_service = [
    'Netflix', 'Amazon', 'Spotify', 'Facebook', 'Instagram', 'PayPal', 'Gmail',
    'LinkedIn', 'Twitter', 'Zoom', 'Dropbox', 'Uber', 'Venmo'
]
government_service = ['IRS', 'DMV', 'SSA', 'FBI', 'DHS', 'CDC']
financial_service = [
    'Bank of America', 'Wells Fargo', 'Chase', 'Citibank', 'Capital One',
    'Discover', 'American Express', 'PayPal', 'Venmo'
]
support_service = ['Dell', 'Apple', 'Samsung', 'HP', 'Lenovo', 'Microsoft']

# Website hostnames substituted for the "{domain}" placeholder in the
# navigation-intent templates. All entries are kept lowercase for consistency.
domain = [
    'google.com',
    'facebook.com',
    'amazon.com',
    'youtube.com',
    'wikipedia.org',
    'twitter.com',
    'reddit.com',
    'netflix.com',
    'ebay.com',
    'linkedin.com',
    'pinterest.com',
    'instagram.com',
    'craigslist.org',
    'yahoo.com',
    'hulu.com',

    # News & Media
    'espn.com',
    'foxnews.com',
    'cnn.com',
    'nytimes.com',
    'washingtonpost.com',
    'bbc.com',
    'msnbc.com',
    'theguardian.com',
    'buzzfeednews.com',
    'nbcnews.com',

    # Shopping & E-commerce
    'walmart.com',
    'apple.com',
    'target.com',
    'costco.com',
    'bestbuy.com',
    'homedepot.com',
    'lowes.com',
    'etsy.com',
    'kohls.com',
    'macys.com',

    # Government Services
    'irs.gov',
    'dmv.org',
    'ssa.gov',
    'healthcare.gov',
    'fbi.gov',
    'usps.com',
    'medicaid.gov',
    'va.gov',
    'uscis.gov',
    'cdc.gov',

    # Entertainment & Streaming
    'spotify.com',
    'disneyplus.com',
    'peacocktv.com',
    'hbomax.com',
    'paramountplus.com',
    'twitch.tv',
    'sling.com',
    'primevideo.com',
    'tv.apple.com',

    # Travel & Booking
    'expedia.com',
    'tripadvisor.com',
    'booking.com',
    'airbnb.com',
    'priceline.com',
    'southwest.com',
    'aa.com',
    'delta.com',

    # Financial Services & Payments
    'paypal.com',
    'venmo.com',
    'chase.com',
    'bankofamerica.com',
    'wellsfargo.com',
    'capitalone.com',
    'americanexpress.com',
    'discover.com',
    'stripe.com',

    # Utility Services
    'comcast.com',
    'xfinity.com',
    'att.com',
    'verizon.com',
    'spectrum.com',
    'duke-energy.com',
    'coned.com',
    'pseg.com',
    'nationalgridus.com',
    'fpl.com',

    # Health & Fitness
    'webmd.com',
    'myfitnesspal.com',
    'mayoclinic.org',
    'healthline.com',
    'bcbs.com',
    'uhc.com',
    'walgreens.com',
    'cvs.com',

    ## Additional domains
    'tiktok.com',
    'whatsapp.com',
    'messenger.com',
    'snapchat.com',
    'slack.com',
    'forbes.com',
    'bloomberg.com',
    'reuters.com',
    'usatoday.com',
    'aljazeera.com',
    'newegg.com',
    'wayfair.com',
    'zillow.com',
    'chewy.com',
    'sephora.com',
    'coursera.org',
    'udemy.com',
    'khanacademy.org',
    'edx.org',
    'duolingo.com',
    'nih.gov',
    'clevelandclinic.org',
    'robinhood.com',
    'sofi.com',
    'dropbox.com',
    'weebly.com',
    'shopify.com',
    'wordpress.com',
    'turbotax.com',
    'creditkarma.com',
    'intuit.com',
    'geico.com',
    'progressive.com',
    'statefarm.com',
    'allstate.com',
    'esurance.com',
    'pnc.com',
    'td.com',
    'citibank.com',
    'suntrust.com',
    'huntington.com',
    'ally.com',
    'navyfed.org',
    'fidelity.com',
    'vanguard.com',
    'etrade.com',
    'schwab.com',
    'ameritrade.com',
    'coinmarketcap.com',
    'yelp.com',
    'opentable.com',
    'groupon.com',
    'livingsocial.com',
    'kayak.com',
    'hotels.com',
    'orbitz.com',
    'cheapoair.com',
    'travelocity.com',
    'skyscanner.com',
    'jetblue.com',
    'alaskaair.com',
    'spirit.com',
    'nordstrom.com',
    'gap.com',
    'oldnavy.com',
    'bananarepublic.com',
    'hottopic.com',
    'uniqlo.com',
    'jcpenney.com',
    'sears.com',
    'footlocker.com',
    'victoriassecret.com',
    'adidas.com',
    'nike.com',
    'underarmour.com'
]


In [None]:
# Template count again; this repeats the check made right after the template list.
len(navigation_intent_templates)

In [None]:
# Placeholder tokens that appear verbatim (braces included) inside
# navigation_intent_templates.
BANK_NAVIGATION = "{bank}"
CREDIT_CARD_NAVIGATION = "{credit_card}"
LOCATION_NAVIGATION = "{location}"
CREDIT_UNION_NAVIGATION = "{credit_union}"
SERVICE_NAVIGATION = "{service}"
PRODUCT_NAVIGATION = "{product}"
PLATFORM_NAVIGATION = "{platform}"
STREAMING_SERVICE_NAVIGATION = "{streaming_service}"
SHOW_NAVIGATION = "{show}"
LEARNING_PLATFORM_NAVIGATION = "{learning_platform}"
SHIPPING_SERVICE_NAVIGATION = "{shipping_service}"
COURIER_NAVIGATION = "{courier}"
UNIVERSITY_NAVIGATION = "{university}"
STATE_DMV_NAVIGATION = "{state_dmv}"
UTILITY_COMPANY_NAVIGATION = "{utility_company}"
UTILITY_SERVICE_NAVIGATION = "{utility_service}"
EVENT_NAVIGATION = "{event}"
COMPANY_NAVIGATION = "{company}"
DEVICE_NAVIGATION = "{device}"
FESTIVAL_NAVIGATION = "{festival}"
ARTIST_NAVIGATION = "{artist}"
JOBROLE_NAVIGATION = "{job_role}"
POSITION_NAVIGATION = "{position}"
INDUSTRY_NAVIGATION = "{industry}"
ACCOUNT_NAVIGATION = "{account}"
SOFTWARE_NAVIGATION = "{software}"
PLACE_NAVIGATION = "{place}"
EMAIL_PROVIDER_NAVIGATION = "{email_provider}"
LOGIN_SERVICE_NAVIGATION = "{login_service}"
# NOTE(review): "SRVICE" / "SREVICE" below are misspellings of "SERVICE" in the
# constant names only; the placeholder strings themselves are spelled correctly.
GOVERNMENT_SRVICE_NAVIGATION = "{government_service}"
FINANCIAL_SERVICE_NAVIGATION = "{financial_service}"
SUPPORT_SREVICE_NAVIGATION = "{support_service}"
DOMAIN_NAVIGATION = "{domain}"

# Maps each placeholder token to the list of values that may replace it.
# Placeholders used in navigation_intent_templates but absent here
# ({business}, {store}, {movie}, {course}, {tech_company}, {concert}, {topic},
# {task}) stay unresolved, so their templates never yield a query.
navigation_categories = {
    BANK_NAVIGATION: bank,
    CREDIT_CARD_NAVIGATION: credit_card,
    LOCATION_NAVIGATION: location,
    CREDIT_UNION_NAVIGATION: credit_union,
    SERVICE_NAVIGATION: service,
    PRODUCT_NAVIGATION: product,
    PLATFORM_NAVIGATION: platform,
    STREAMING_SERVICE_NAVIGATION: streaming_service,
    SHOW_NAVIGATION: show,
    LEARNING_PLATFORM_NAVIGATION: learning_platform,
    SHIPPING_SERVICE_NAVIGATION: shipping_service,
    COURIER_NAVIGATION: courier,
    UNIVERSITY_NAVIGATION: university,
    STATE_DMV_NAVIGATION: state_dmv,
    UTILITY_COMPANY_NAVIGATION: utility_company,
    UTILITY_SERVICE_NAVIGATION: utility_service,
    EVENT_NAVIGATION: event,
    COMPANY_NAVIGATION: company,
    DEVICE_NAVIGATION: device,
    FESTIVAL_NAVIGATION: festival,
    ARTIST_NAVIGATION: artist,
    JOBROLE_NAVIGATION: job_role,
    POSITION_NAVIGATION: position,
    INDUSTRY_NAVIGATION: industry,
    ACCOUNT_NAVIGATION: account,
    SOFTWARE_NAVIGATION: software,
    PLACE_NAVIGATION: place,
    EMAIL_PROVIDER_NAVIGATION: email_provider,
    LOGIN_SERVICE_NAVIGATION: login_service,
    GOVERNMENT_SRVICE_NAVIGATION: government_service,
    FINANCIAL_SERVICE_NAVIGATION: financial_service,
    SUPPORT_SREVICE_NAVIGATION: support_service,
    DOMAIN_NAVIGATION: domain,
}



In [None]:
# Sample unique navigation-intent queries by filling the templates at random.
navigation_intent_additional_queries = generate_service_queries(
    service_categories=navigation_categories,
    templates=navigation_intent_templates,
    n_queries=8900,
)
print(len(navigation_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the navigation intent.
navigation_intent_additional_queries_df = pd.DataFrame(
    navigation_intent_additional_queries, columns=['sequence']
).assign(target='navigation_intent')
navigation_intent_additional_queries_df

#### Travel intent additional queries generation

In [None]:
# Templates for synthetic travel-intent queries. Every "{name}" token is a
# placeholder that generate_service_queries fills from travel_categories.
# NOTE(review): {service}, {tourist_destination}, {month}, {season} and {city}
# have no entry in travel_categories, so the templates that use them are never
# turned into queries.
travel_intent_templates = [
    # Visa Information & Requirements
    "What is the US cost for {country} visitor visa?",
    "Do I need a visa to visit {country}?",
    "How long can I stay in {country} with a visa?",
    "Requirements for {country} tourist visa",
    "What is the visa fee for {country} visitors?",

    # Cruise Information & Pricing
    "What are the prices for cruises to {destination}?",
    "Does {cruise_line} offer {service}?",
    "Which cruise lines sail from {location}?",
    "Best time to book cruises to {destination}",
    "What is the cost of a {cruise_line} cruise to {destination}?",
    
    # Airport & Flight Information
    "What airport is closest to {location}?",
    "What airport code is {airport_code}?",
    "Which airlines travel to {destination}?",
    "What airport is near {tourist_attraction}?",
    "What is the best airport for {city_state}?",

    # Best Time to Visit
    "When is the best time to visit {destination}?",
    "What is the best season to visit {destination}?",
    "What month should I visit {tourist_attraction}?",
    "When should I travel to {country} for good weather?",
    "Best time to visit {tourist_destination} in {country}",

    # Tourist Attractions & Tours
    "Top tourist attractions in {destination}",
    "Best tours of {destination}",
    "Guided tours to {country}",
    "What are the must-visit attractions in {location}?",
    "What are the most popular tours in {destination}?",

    # Resorts & Hotels
    "What are the best resorts in {destination}?",
    "Is {resort} all-inclusive?",
    "Does {resort} charge a resort fee?",
    "Where is the nearest resort to {location}?",
    "What resorts in {destination} offer all-inclusive packages?",

    # Weather Information
    "Best weather for visiting {destination}",
    "What is the weather like in {country} during {month}?",
    "What is the average temperature in {destination} in {season}?",
    "How does the weather in {destination} change by season?",
    "What is the weather forecast for {destination} next week?",

    # Travel Costs & Pricing
    "What is the cost of a vacation to {destination}?",
    "How much does it cost to visit {tourist_attraction}?",
    "What is the average cost of a flight to {destination}?",
    "How much do guided tours in {destination} cost?",
    "How much money should I bring for a trip to {country}?",

    # Passports & Travel Documentation
    "Do I need a passport to travel to {destination}?",
    "What documents are required to visit {country}?",
    "How to apply for a visa to visit {destination}?",
    "Can US citizens travel to {country} without a passport?",
    "Where to apply for a passport to travel to {destination}?",

    # Travel Destinations
    "Most visited places in {country}",
    "Top travel destinations in {destination}",
    "What are the top places to visit in {country}?",
    "What are the most popular tourist attractions in {city}?",
    "What are the best destinations in {region} for vacations?"
]

# Sanity check: number of travel-intent templates.
print(len(travel_intent_templates))

In [None]:
# Fill-in values for the placeholders used by travel_intent_templates.
# NOTE(review): `location` (also assigned in the navigation section) and
# `city_state` (already referenced by service_categories) are rebound here.
# Re-running the earlier cells that build navigation_categories or
# service_categories after this cell would pick up these travel lists instead.
country = [
    'Australia', 'Israel', 'Dominican Republic', 'Mexico', 'Canada',
    'United Kingdom', 'France', 'Spain', 'Italy', 'Japan',
    'Germany', 'Brazil', 'Argentina', 'China', 'South Korea',
    'Thailand', 'India', 'Greece', 'Egypt', 'New Zealand'
]
destination = [
    'Hawaii', 'Las Vegas', 'Disney World', 'Grand Canyon', 'New Zealand',
    'Singapore', 'Bahamas', 'Switzerland', 'Ireland', 'Rome',
    'Maui', 'Bora Bora', 'Dubai', 'Bali', 'Maldives',
    'Machu Picchu', 'Reykjavik', 'Iceland', 'Paris', 'London'
]

cruise_line = [
    'Carnival Cruise', 'Royal Caribbean', 'Disney Cruise Line', 'Norwegian Cruise Line', 'Celebrity Cruises',
    'Princess Cruises', 'Holland America Line', 'MSC Cruises', 'Viking Cruises', 'Azamara Club Cruises'
]

location = [
    'Miami', 'Los Angeles', 'Orlando', 'Seattle', 'Galveston',
    'New York City', 'San Francisco', 'Tucson', 'Las Vegas', 'Phoenix',
    'Austin', 'Boston', 'Chicago', 'Houston', 'Denver',
    'Portland', 'Salt Lake City', 'Atlanta', 'Dallas', 'Nashville'
]

tourist_attraction = [
    'White House', 'Niagara Falls', 'Yosemite National Park', 'Tower of London', 'Vatican Museum',
    'Eiffel Tower', 'Mount Rushmore', 'Disneyland', 'Air Force Academy', 'The Colosseum',
    'Statue of Liberty', 'Golden Gate Bridge', 'Stonehenge', 'Machu Picchu', 'The Great Wall of China',
    'Taj Mahal', 'Petra', 'Christ the Redeemer', 'Angkor Wat', 'Sagrada Familia'
]

airport_code = [
    'JFK', 'LAX', 'IAD', 'ORD', 'ATL',
    'MCO', 'PHL', 'SFO', 'SEA', 'PHX',
    'DFW', 'MIA', 'DEN', 'BOS', 'DTW',
    'LGA', 'CLT', 'MSP', 'FLL', 'LAS'
]

resort = [
    'Port Orleans Resort', 'Westgate Resort', 'Kona Coast Resort', 'Bahia Luxury Resort', 'Elara by Hilton',
    'Nizuc Resort', 'Grand Lakes Resort', 'Ashford Castle', 'Vienna Resort', 'Koh Samui Resort',
    'Four Seasons Resort Maui', 'Ritz-Carlton Kapalua', 'Waldorf Astoria Los Cabos', 'Atlantis Paradise Island', 'Le Blanc Spa Resort'
]

region = [
    'South East Asia', 'Caribbean', 'Mediterranean', 'Pacific Islands', 'Western Europe',
    'East Africa', 'Middle East', 'South America', 'Southern Africa', 'Western US',
    'Northern Europe', 'Central America', 'Eastern Europe', 'Indian Ocean', 'Arctic Circle'
]

city_state = [
    'Washington, DC', 'Orlando, FL', 'Las Vegas, NV', 'San Diego, CA', 'New York, NY',
    'Los Angeles, CA', 'Miami, FL', 'Jacksonville, NC', 'Galveston, TX', 'Williamsburg, VA',
    'Austin, TX', 'Boston, MA', 'Phoenix, AZ', 'Dallas, TX', 'Nashville, TN'
]



In [None]:
# Placeholder tokens that appear verbatim (braces included) inside
# travel_intent_templates.
COUNTRY_TRAVEL = "{country}"
DESTINATION_TRAVEL = "{destination}"
CRUISE_LINE_TRAVEL = "{cruise_line}"
LOCATION_TRAVEL = "{location}"
TOURIST_ATTRACTION_TRAVEL = "{tourist_attraction}"
AIRPORT_CODE_TRAVEL = "{airport_code}"
RESORT_TRAVEL = "{resort}"
REGION_TRAVEL = "{region}"
CITY_STATE_TRAVEL = "{city_state}"


# Maps each placeholder token to the list of values that may replace it.
travel_categories = {
    COUNTRY_TRAVEL: country,
    DESTINATION_TRAVEL: destination,
    CRUISE_LINE_TRAVEL: cruise_line,
    LOCATION_TRAVEL: location,
    TOURIST_ATTRACTION_TRAVEL: tourist_attraction,
    AIRPORT_CODE_TRAVEL: airport_code,
    RESORT_TRAVEL: resort,
    REGION_TRAVEL: region,
    CITY_STATE_TRAVEL: city_state,
}



In [None]:
# Sample unique travel-intent queries by filling the templates at random.
travel_intent_additional_queries = generate_service_queries(
    service_categories=travel_categories,
    templates=travel_intent_templates,
    n_queries=1000,
)
print(len(travel_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the travel intent.
travel_intent_additional_queries_df = pd.DataFrame(
    travel_intent_additional_queries, columns=['sequence']
).assign(target='travel_intent')
travel_intent_additional_queries_df

#### Additional examples for Translation intent

In [None]:
# Templates for synthetic translation-intent queries. {word}, {phrase} and
# {language} are placeholders filled from translate_categories.
# Each template appears exactly once so none is sampled more often than the rest.
translation_intent_templates = [
    # Basic Translations (Word or Phrase)
    "What is the translation for {word} in {language}?",
    "How do you say {phrase} in {language}?",
    "What does {word} mean in {language}?",
    "Translate {word} to {language}",
    "What is {phrase} in {language}?",
    "Translate {phrase} to {language}",

    # Meaning of Words in a Language
    "What is the meaning of {word} in {language}?",
    "Explain the meaning of {phrase} in {language}",
    "What is the translation of {phrase} in {language}?",
    "How do you express {word} in {language}?",

    # Pronunciations & Spellings
    "How do you pronounce {word} in {language}?",
    "What is the correct spelling of {word} in {language}?",
    "What is the phonetic spelling for {word} in {language}?",
    "How to spell {word} in {language}?",
    "How do you pronounce {phrase} in {language}?",
]


In [None]:
# Fill-in values for the {word}, {phrase} and {language} placeholders used by
# translation_intent_templates.
word = [
    'beautiful', 'friend', 'hello', 'thank you', 'family',
    'happy', 'love', 'music', 'freedom', 'peace',
    'home', 'work', 'future', 'goodbye', 'success',
    'health', 'school', 'truth', 'happiness', 'strength'
]

# Some phrases contain a typographic apostrophe (’), not an ASCII one.
phrase = [
    'how are you', 'good morning', 'I love you', 'what’s your name', 'where is the bathroom',
    'see you later', 'happy birthday', 'congratulations', 'good night', 'I miss you',
    'nice to meet you', 'have a great day', 'thank you very much', 'how old are you', 'take care',
    'good afternoon', 'can you help me', 'I don’t understand', 'excuse me', 'I am sorry'
]

language = [
    'Spanish', 'French', 'German', 'Japanese', 'Chinese',
    'Russian', 'Italian', 'Portuguese', 'Korean', 'Hindi',
    'Arabic', 'Dutch', 'Greek', 'Hebrew', 'Swedish',
    'Turkish', 'Vietnamese', 'Polish', 'Thai', 'Bengali'
]



In [None]:
# Placeholder tokens that appear verbatim (braces included) inside
# translation_intent_templates.
WORD_TRANSLATE = "{word}"
PHRASE_TRANSLATE = "{phrase}"
LANGUAGE_TRANSLATE = "{language}"


# Maps each placeholder token to the list of values that may replace it.
translate_categories = {
    WORD_TRANSLATE: word,
    PHRASE_TRANSLATE: phrase,
    LANGUAGE_TRANSLATE: language,
}



In [None]:
# Sample unique translation-intent queries by filling the templates at random.
translate_intent_additional_queries = generate_service_queries(
    service_categories=translate_categories,
    templates=translation_intent_templates,
    n_queries=2000,
)
print(len(translate_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the translation intent.
translate_intent_additional_queries_df = pd.DataFrame(
    translate_intent_additional_queries, columns=['sequence']
).assign(target='translation_intent')
translate_intent_additional_queries_df

In [None]:
# def apply_target_mapping(df, target_mapping):
#     mapped_text_set = set()
#     for ngram in target_mapping.keys():
#         # mask = df['sequence'].apply(lambda text: ngram in text)
#         mask = df['sequence'].apply(lambda text: ngram in text and text not in mapped_text_set)
#         print(f'Number of matches found for "{ngram}"  = {mask.sum()}')
#         print(f'size of mapped_text_set = {len(mapped_text_set)}')
#         df.loc[mask, 'target'] = target_mapping[ngram]
#         mapped_text_set.update(df.loc[mask, 'sequence'].values.tolist())
#         print()

In [None]:
def apply_target_mapping(df, target_mapping, ngram, mapped_text_set):
    """Label every not-yet-claimed row of ``df`` whose sequence contains ``ngram``.

    Args:
        df: DataFrame with ``sequence`` and ``target`` columns; modified in place.
        target_mapping: dict mapping an n-gram to the target label to assign.
        ngram: key of ``target_mapping`` to apply; matched as a plain substring.
        mapped_text_set: set of sequences already labelled by earlier calls.
            Sequences in it are skipped, and the sequences labelled by this
            call are added to it.

    Raises:
        KeyError: if ``ngram`` is not a key of ``target_mapping``.
    """
    def _should_label(text):
        # Non-string cells (e.g. NaN produced by a CSV round-trip) can never
        # contain the n-gram; the guard avoids a TypeError from `ngram in <float>`.
        return isinstance(text, str) and ngram in text and text not in mapped_text_set

    # astype(bool) keeps the mask a genuine boolean indexer even for an empty frame.
    mask = df['sequence'].apply(_should_label).astype(bool)
    print(f'Number of matches found for "{ngram}"  = {mask.sum()}')
    print(f'size of mapped_text_set = {len(mapped_text_set)}')
    df.loc[mask, 'target'] = target_mapping[ngram]
    mapped_text_set.update(df.loc[mask, 'sequence'].tolist())
    print()

In [None]:
# Partition the MARCO queries by whether a target label is already present.
# Each half is an independent copy of the matching rows of marco_df.
has_label = marco_df['target'].notna()
to_be_labelled = marco_df.loc[~has_label].copy()
labelled = marco_df.loc[has_label].copy()

In [None]:
len(to_be_labelled), len(labelled)

In [None]:
# Load the hand-labelled queries, keeping only the rows that carry a target.
manual_labelled = pd.read_csv("../data/manual_labels_v2.csv").dropna(subset=['target'])
print(len(manual_labelled))
print(manual_labelled['target'].value_counts())
# sequence -> target lookup (a repeated sequence keeps its last label)
manual_labelled_lkp = manual_labelled.set_index('sequence')['target'].to_dict()
manual_labelled.head()

In [None]:
def apply_manual_mapping(df, manual_labelled_lkp):
    """Copy hand-assigned targets onto the rows of ``df`` that have one.

    ``df`` is modified in place: for every row whose ``sequence`` is a key of
    ``manual_labelled_lkp``, ``target`` is set to the mapped value. The number
    of matching rows is printed.
    """
    def _is_manually_labelled(text):
        return text in manual_labelled_lkp

    sequences = df['sequence']
    has_manual_label = sequences.apply(_is_manually_labelled)
    print(f'Number of matches found in manual labels = {has_manual_label.sum()}')
    df.loc[has_manual_label, 'target'] = sequences.loc[has_manual_label].map(manual_labelled_lkp)
    print()

In [None]:
# Transfer the manual labels onto the unlabelled pool, then move every row
# that received a label over to `labelled`.
apply_manual_mapping(to_be_labelled, manual_labelled_lkp)
newly_labelled = to_be_labelled.loc[to_be_labelled['target'].notna()]
# Fixed seed: this shuffle determines the row order of everything derived from
# `labelled` (including the CSV written later), so it must be repeatable
# across kernel restarts.
labelled = pd.concat([labelled, newly_labelled], axis=0).sample(frac=1.0, random_state=42)
# .copy() detaches the remainder from its parent frame so that later in-place
# label assignments on it cannot trigger SettingWithCopyWarning.
to_be_labelled = to_be_labelled.loc[to_be_labelled['target'].isna()].copy()
print(f"to_be_labelled: {len(to_be_labelled)}, labelled: {len(labelled)}")

In [None]:

# Snapshot of the labelling progress so far.
print(f"Number of examples labeled = {len(labelled)}")
print(f"Number of examples to be labeled = {len(to_be_labelled)}")
print(f"Label stats \n{labelled['target'].value_counts()}\n")

# Most frequent n-grams among the still-unlabelled queries.
n = 2  # n-gram size (e.g. 1 for unigrams, 3 for trigrams)
to_be_labelled_sequence_list = to_be_labelled['sequence'].values.tolist()
ngram_counter = count_ngrams(to_be_labelled_sequence_list, n)
most_common_ngrams = ngram_counter.most_common(100)

# Display the most common n-grams
print(most_common_ngrams)

# Preview at most 100 unlabelled queries matching a probe phrase, numbered from 1.
for rank, query in enumerate(search_queries_by_words("5 star", to_be_labelled_sequence_list), start=1):
    if rank > 100:
        break
    print(rank, query)



In [None]:
# Fold the additional example frames into the labelled pool.
# NOTE: this cell is not idempotent — re-running it appends the same example
# frames to `labelled` a second time.
labelled = pd.concat([labelled, 
                      weather_examples, 
                      yelp_examples, 
                      purchase_intent_examples, 
                      yelp_intent_additional_queries_df,
                      navigation_intent_additional_queries_df,
                      travel_intent_additional_queries_df,
                      translate_intent_additional_queries_df,
                     ], axis=0)

# Work on a detached copy so the in-place assignment inside
# apply_target_mapping cannot trigger SettingWithCopyWarning.
to_be_labelled = to_be_labelled.copy()

mapped_text_set = set()
newly_labelled_parts = []   # rows labelled per n-gram; concatenated once at the end
n_labelled = len(labelled)  # running total, used only for the progress print
try:
    for i, ngram in enumerate(target_mapping):
        print()
        print(f"iteration {i+1}: to_be_labelled: {len(to_be_labelled)}, labelled: {n_labelled}")
        apply_target_mapping(to_be_labelled, target_mapping, ngram, mapped_text_set)
        got_label = to_be_labelled['target'].notna()
        if got_label.any():
            # Collect instead of concatenating onto `labelled` each iteration,
            # which would re-copy the whole labelled frame once per n-gram.
            newly_labelled_parts.append(to_be_labelled.loc[got_label])
            n_labelled += int(got_label.sum())
            to_be_labelled = to_be_labelled.loc[~got_label].copy()
finally:
    # Runs even if the loop is interrupted, so rows already removed from
    # to_be_labelled always end up in `labelled`.
    if newly_labelled_parts:
        labelled = pd.concat([labelled, *newly_labelled_parts], axis=0)

In [None]:
labelled[:60]

#### Optional: build a query list for manual labeling (skip unless needed)

In [None]:
## Only if special list for manual process needed else skip this 

SKIP_MANUAL_LABEL_PREP = True
if not SKIP_MANUAL_LABEL_PREP:
    MAX_SPECIAL_QUERIES = 10000  # upper bound on the matches collected for manual review

    # NOTE(review): to_be_labelled_sequence_list is built in an earlier cell, before the
    # n-gram labelling loop runs, so it can still contain queries labelled since then.
    matches = search_queries_by_words("how much", to_be_labelled_sequence_list)

    # zip() with range stops after MAX_SPECIAL_QUERIES matches; the set drops duplicates.
    special_list = {query for _, query in zip(range(MAX_SPECIAL_QUERIES), matches)}

    # Sort before writing: set iteration order is not stable between runs, which
    # would otherwise reorder the CSV rows every time the cell is executed.
    pd.DataFrame(sorted(special_list), columns=['sequence']).to_csv('special_list_manual_label.csv', index=False)

In [None]:
to_be_labelled

In [None]:
labelled['target'].value_counts()

In [None]:
# labelled.loc[labelled['target'] == 'translation_intent']['sequence'].sample(100).values

In [None]:
# labelled.loc[labelled['sequence'].apply(lambda q: "sf " in q)]['sequence'].values

In [None]:
# Stack the labelled and still-unlabelled rows into one frame with a fresh 0..n-1 index.
combined = pd.concat([labelled, to_be_labelled], axis=0, ignore_index=True)
print(len(combined))
combined

In [None]:
labelled['target'].value_counts()

In [None]:
labelled.to_csv("../data/marco_train_v4.csv", index=False)

In [None]:
import pandas as pd
from umap import UMAP
from sklearn.pipeline import make_pipeline 
from embetter.text import SentenceEncoder


# Set to True to reuse the coordinates cached in marco_ready.csv instead of
# re-encoding every query.
SKIP_ENCODING = False
if not SKIP_ENCODING:
    # Build a sentence encoder pipeline with UMAP at the end.
    enc = SentenceEncoder('all-MiniLM-L6-v2')
    umap = UMAP()
    
    text_emb_pipeline = make_pipeline(
      enc, umap
    )
    
    # Load sentences
    X = combined['sequence'].values.tolist()
    
    # Calculate embeddings 
    X_tfm = text_emb_pipeline.fit_transform(X)
    
    # Write to disk. Note! Text column must be named "text"
    df = pd.DataFrame({"text": X})
    df['x'] = X_tfm[:, 0]
    df['y'] = X_tfm[:, 1]
    df.to_csv("marco_ready.csv", index=False)
else:
    df = pd.read_csv("marco_ready.csv")
    # The targets below are attached purely by row position, so the cache must
    # describe exactly the rows of `combined`, in the same order. Fail loudly
    # instead of silently attaching labels to the wrong points.
    if len(df) != len(combined):
        raise ValueError(
            f"marco_ready.csv has {len(df)} rows but combined has {len(combined)}; "
            "re-run with SKIP_ENCODING = False"
        )
    # Cells that read_csv parsed as missing cannot be compared and are skipped.
    readable = df['text'].notna().to_numpy()
    cached_text = df['text'].to_numpy()[readable]
    current_text = combined['sequence'].to_numpy()[readable]
    if (cached_text != current_text).any():
        raise ValueError(
            "marco_ready.csv does not match the current rows of combined; "
            "re-run with SKIP_ENCODING = False"
        )

# Rows without a label are shown as 'unknown'. The target is added after the
# CSV is written, so the file only holds text, x and y.
df['target'] = combined['target'].fillna('unknown')

In [None]:
combined

In [None]:
df

In [None]:
import plotly.express as px

In [None]:
# Scatter plot of the x/y coordinates stored in `df`, one point per query,
# coloured by the `target` column (legend titled 'target'); hovering over a
# point shows the query text.
fig_2d = px.scatter(
    df, x='x', y='y',
    color=df['target'], labels={'color': 'target'},
    hover_name="text",
    opacity=0.3,
    title="marcos web search queries intents map"
)



In [None]:
fig_2d

In [None]:
fig_2d.write_html("../reports/web_search_intents.html")

In [None]:
# [query for query in labelled.loc[labelled['target'] == 'yelp_intent']['sequence'].values.tolist() if 'medication' in query]

In [None]:
labelled.loc[labelled['target'] == 'yelp_intent']

In [None]:
len(to_be_labelled)

In [None]:
to_be_labelled

In [None]:
to_be_labelled.to_csv('../data/to_be_labelled.csv', index=False)