# IRF (Inverse Restaurant Frequency)
- How many restaurants have this keyword in their reviews?
- The higher that number, the lower the IRF score: IRF = log(N / number of restaurants containing the keyword)
- Words appearing in many restaurants don't convey much information
- IRF is used to choose keywords when testing for cold-start users, i.e., to decide which keywords should be used
- The computation is based on information from the train set 


In [1]:
import os 
import numpy as np
import json
from pprint import pprint 

In [9]:
def compute_irf(num, N=1000):
    """Return the inverse restaurant frequency ``log(N / num)``.

    Args:
        num: Number of restaurants whose reviews contain the keyword.
        N: Numerator of the ratio (defaults to 1000).

    Returns:
        The natural logarithm of ``N / num`` as computed by ``np.log``.
    """
    ratio = N / num
    irf = np.log(ratio)
    return irf


def compute_irf_for_dir(idir, odir, N=1000):
    """Compute IRF scores for every keyword file in ``idir`` and save them in ``odir``.

    Every non-hidden ``*.json`` file in ``idir`` is loaded and its
    ``'np2rests'`` mapping is read. Each key of that mapping is assigned
    ``compute_irf(len(value), N=N)``, and the resulting ``{key: irf}`` mapping
    is written as JSON to a file with the same name inside ``odir``. The path
    of each written file is printed.

    Args:
        idir: Directory containing the input JSON files.
        odir: Directory that receives the output files; created if missing.
        N: Forwarded unchanged to ``compute_irf`` for every keyword.
    """
    # Make sure the destination exists so the first write cannot fail on a
    # missing directory.
    os.makedirs(odir, exist_ok=True)

    for fname in os.listdir(idir):
        # Skip hidden files (e.g. editor/OS artefacts) and anything not JSON.
        if fname.startswith(".") or not fname.endswith(".json"):
            continue

        ifile = os.path.join(idir, fname)
        ofile = os.path.join(odir, fname)

        # Context managers close the handles deterministically; JSON text is
        # read as UTF-8 rather than whatever the locale default happens to be.
        with open(ifile, encoding="utf-8") as fin:
            np2rests = json.load(fin)['np2rests']

        np2irf = {n: compute_irf(len(r), N=N) for n, r in np2rests.items()}

        with open(ofile, 'w', encoding="utf-8") as fout:
            json.dump(np2irf, fout)
        print("Saved to", ofile)

In [10]:
# Input directory: keyword JSON files of the train split (one file per city,
# judging by the file names printed below).
idir = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train'
# Output directory: one IRF JSON file is written per input file, same file name.
odir = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF'

# NOTE(review): the same N=1000 is used for every file — confirm it matches
# the intended restaurant count for each city.
compute_irf_for_dir(idir, odir, N=1000)

Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/phoenix-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/edinburgh-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/london-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/charlotte-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/lasvegas-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/singapore-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/pittsburgh-keywords.json


In [14]:
# Analyzing IRF: load the scores saved for one city and show the 100 keywords
# with the highest IRF.
with open('/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF/singapore-keywords.json') as f:
    # The context manager closes the file as soon as the JSON is parsed.
    dt = json.load(f)
# Sort (keyword, irf) pairs by score, largest first.
s = sorted(dt.items(), key=lambda x: x[1], reverse=True)
pprint(s[:100])

[('big supporter', 6.907755278982137),
 ('wolf', 6.907755278982137),
 ('buger fix', 6.907755278982137),
 ('soggy burger', 6.907755278982137),
 ('wolf standard', 6.907755278982137),
 ('meat rest', 6.907755278982137),
 ('ussue', 6.907755278982137),
 ('soggy french dip burger', 6.907755278982137),
 ('message email', 6.907755278982137),
 ('biz owner', 6.907755278982137),
 ('customer feedback', 6.907755278982137),
 ('wolf gospel', 6.907755278982137),
 ('sep 29th', 6.907755278982137),
 ('usual double', 6.907755278982137),
 ('shroom', 6.907755278982137),
 ('sweet potatoe fries', 6.907755278982137),
 ('pasarbella enclave', 6.907755278982137),
 ('crafted burgers', 6.907755278982137),
 ('mushroom burger patty', 6.907755278982137),
 ('melted cheers', 6.907755278982137),
 ('sliced sautéed mushroom', 6.907755278982137),
 ('puffy light bun', 6.907755278982137),
 ('buttermilk chicken burger', 6.907755278982137),
 ('outside succulent', 6.907755278982137),
 ('new go to burger place', 6.907755278982137)