In [1]:
import json 
from pprint import pprint 
import os

In [3]:
def filter_keywords(ifile, ofile, min_freq=3):
    """Drop keywords whose count is below ``min_freq`` and save the result.

    The JSON object read from ``ifile`` must contain an ``'np2count'`` entry
    mapping each keyword to a count. Every top-level entry of the input
    (``'np2count'`` included) is restricted to the keywords whose count is
    at least ``min_freq``, and the filtered object is written to ``ofile``.

    Args:
        ifile: Path of the input JSON file.
        ofile: Path of the output JSON file (overwritten if it exists).
            Its parent directory is not created here.
        min_freq: Minimum count in ``'np2count'`` for a keyword to be kept.

    Raises:
        KeyError: If ``'np2count'`` is missing, or if a top-level entry does
            not contain one of the retained keywords.
    """
    # `with` closes the handle deterministically instead of relying on
    # garbage collection of an anonymous file object.
    with open(ifile) as fin:
        data = json.load(fin)

    np2count = data['np2count']
    valid_kws = [kw for kw, count in np2count.items() if count >= min_freq]

    # Build the whole result before opening the output file, so a KeyError
    # here cannot leave a truncated output behind.
    filtered = {
        name: {kw: mapping[kw] for kw in valid_kws}
        for name, mapping in data.items()
    }

    with open(ofile, 'w') as fout:
        json.dump(filtered, fout)
    print("Saved to", ofile)
    

def group_keywords_for_users(ifile, ofile):
    """Invert the keyword -> user mapping into a user -> keyword mapping.

    Reads the ``'np2users'`` entry of the JSON object in ``ifile``, which has
    the form ``{keyword: {user: freq}}``, and writes ``{user: {keyword: freq}}``
    to ``ofile`` as JSON.

    Args:
        ifile: Path of the input JSON file; must contain ``'np2users'``.
        ofile: Path of the output JSON file (overwritten if it exists).

    Raises:
        KeyError: If ``'np2users'`` is missing from the input.
    """
    # `with` closes the handle deterministically instead of relying on
    # garbage collection of an anonymous file object.
    with open(ifile) as fin:
        dt = json.load(fin)

    np2users = dt['np2users']
    u2kw = {}  # {user: {keyword: freq}}
    for kw, u2c in np2users.items():
        for u, c in u2c.items():
            u2kw.setdefault(u, {})[kw] = c

    with open(ofile, 'w') as fout:
        json.dump(u2kw, fout)
    print("Saved to", ofile)


def mkdir(idir):
    """Create directory ``idir`` (including missing parents) if it is absent.

    Calling this on an existing directory is a no-op.

    Raises:
        FileExistsError: If ``idir`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing isdir()
    # first and only then calling makedirs().
    os.makedirs(idir, exist_ok=True)


def compute_tfirf(ifile, ofile, irf, default_irf=0.01, sorting=True):
    """Weight each user's keyword frequencies by an IRF value and save them.

    ``ifile`` holds a JSON object of the form ``{user: {keyword: freq}}``.
    Each frequency is multiplied by ``irf[keyword]``, or by ``default_irf``
    when the keyword has no entry in ``irf``.

    Args:
        ifile: Path of the input JSON file.
        ofile: Path of the output JSON file (overwritten if it exists).
        irf: Mapping from keyword to its IRF weight.
        default_irf: Weight used for keywords missing from ``irf``.
        sorting: If true, each user's value is written as a list of
            ``[keyword, score]`` pairs ordered by descending score (ties keep
            their input order). If false, it is written as a
            ``{keyword: score}`` object.
    """
    # `with` closes the handle deterministically instead of relying on
    # garbage collection of an anonymous file object.
    with open(ifile) as fin:
        dt = json.load(fin)

    u2kw2score = {
        u: {kw: f * irf.get(kw, default_irf) for kw, f in kw2f.items()}
        for u, kw2f in dt.items()
    }

    if sorting:
        # sorted() yields (keyword, score) tuples, which JSON stores as
        # two-element arrays; the output shape therefore differs from the
        # unsorted case.
        u2kw2score = {
            u: sorted(kw2score.items(), key=lambda item: item[1], reverse=True)
            for u, kw2score in u2kw2score.items()
        }

    with open(ofile, 'w') as fout:
        json.dump(u2kw2score, fout)
    print("Saved to", ofile)
    

def get_irf(city, irf_dict, irf_dir):
    """Return the IRF table for ``city``, loading it from disk on first use.

    Args:
        city: Cache key, also used verbatim as the file name under
            ``irf_dir`` (no extension is appended).
        irf_dict: Cache mapping ``city`` to its loaded table; it is updated
            in place when the table is not cached yet.
        irf_dir: Directory containing the IRF JSON files.

    Returns:
        The object stored in ``irf_dict[city]``.
    """
    if city not in irf_dict:
        # `with` closes the handle deterministically instead of relying on
        # garbage collection of an anonymous file object.
        with open(os.path.join(irf_dir, city)) as fin:
            irf_dict[city] = json.load(fin)
    return irf_dict[city]

### Filter out keywords based on frequency (min=3)

In [4]:
# Filter the per-city keyword file, keeping only keywords whose count in
# 'np2count' is at least `min_freq`.
# Only 'edinburgh' is processed; the full city list is kept in the trailing
# comment for reference.
CITIES = ['edinburgh'] #['charlotte', 'edinburgh', 'lasvegas', 'london', 'phoenix', 'pittsburgh', 'singapore']
min_freq = 3
for city in CITIES: 
    print("Processing for", city)
    # Input: keywords for the 'test' split of this city.
    ifile = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/{}-keywords.json'.format(city)
    # Output: the directory name embeds the threshold. The directory is not
    # created by this cell or by filter_keywords, so it must already exist.
    ofile = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy-min_{}/test/{}.json'.format(min_freq, city)
    filter_keywords(ifile, ofile, min_freq=min_freq)

Processing for edinburgh
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy-min_3/test/edinburgh.json


### Group keywords for users (for dev/test)

#### Raw frequency

In [3]:
# Build the {user: {keyword: freq}} files for each selected split.
# Only 'test' is processed; the trailing comment keeps the other split name.
names = ['test'] #['dev', 'test']
for setname in names: 
    # NOTE(review): this reads the unfiltered 'keywords_spacy' directory, not
    # the 'keywords_spacy-min_{}' output of the filtering cell — confirm this
    # is intended.
    idir = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy/' + setname
    odir = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/' + setname
    mkdir(odir)
    for fname in os.listdir(idir):
        # Skip hidden entries and anything that is not a .json file.
        if fname.startswith('.') or not fname.endswith(".json"):
            continue 
        print("Processing for", fname)
        ifile = os.path.join(idir, fname)
        # The output keeps the input file name.
        ofile = os.path.join(odir, fname)
        group_keywords_for_users(ifile, ofile)
    print("------------")


Processing for edinburgh-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/test/edinburgh-keywords.json
------------


#### TF-IRF 

In [4]:
# Compute TF-IRF scores for every raw-frequency file under `idir_root`,
# mirroring the <set>/<file> layout under `odir_root`.
irf_dir = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_IRF'
idir_root = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq'
odir_root = '/home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/tf_irf'

# Cache of loaded IRF tables, keyed by file name, shared across all sets so
# each table is read from disk at most once.
city2irf = {}

# Every non-hidden entry of `idir_root` is treated as a set directory.
for setname in os.listdir(idir_root):
    if setname.startswith("."):
        continue 
    idir = os.path.join(idir_root, setname)
    odir = os.path.join(odir_root, setname)
    mkdir(odir)
    for fname in os.listdir(idir):
        # Only hidden entries are skipped here; there is no '.json' check.
        if fname.startswith("."):
            continue
        ifile = os.path.join(idir, fname)
        ofile = os.path.join(odir, fname)
        print("Processing for", ifile)
        # The IRF table is looked up in `irf_dir` under the same file name as
        # the raw-frequency file.
        compute_tfirf(ifile, ofile, irf=get_irf(fname, city2irf, irf_dir))


Processing for /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/test/edinburgh-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/tf_irf/test/edinburgh-keywords.json
Processing for /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/train/phoenix-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/tf_irf/train/phoenix-keywords.json
Processing for /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/train/edinburgh-keywords.json
Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/tf_irf/train/edinburgh-keywords.json
Processing for /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/user_to_keywords/raw_freq/train/london-keyw