In [1]:
import spacy
import os 
from pprint import pprint
import json 
import pandas as pd 
from tqdm import tqdm 

2023-11-19 16:01:44.768555: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-19 16:01:44.828403: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 16:01:44.828441: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 16:01:44.828462: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 16:01:44.835398: I tensorflow/core/platform/cpu_feature_g

In [2]:
def mkdir(idir):
    """Create directory ``idir`` (and any missing parents) if it does not exist yet.

    Args:
        idir: Path of the directory to create.

    Raises:
        FileExistsError: If ``idir`` already exists but is not a directory.
    """
    # exist_ok=True makes makedirs tolerate an already-existing directory itself,
    # so there is no gap between a separate isdir() check and the creation in
    # which another process could create the directory and make this call fail.
    os.makedirs(idir, exist_ok=True)

def get_noun_phrases(doc, output=None, keep=None):
    """Collect lower-cased noun phrases from the noun chunks of ``doc``.

    The return type depends on ``keep``:

    * ``keep is None``: returns a list with the lower-cased text of every
      noun chunk, in order (``output`` is ignored).
    * otherwise: each chunk is reduced to the words whose ``pos_`` is in
      ``keep``; the remaining words are lower-cased and joined with a single
      space. Chunks with no remaining word are dropped. Returns a tuple
      ``(counts, phrases)``.

    Args:
        doc: Object exposing ``noun_chunks``; each chunk has ``.text`` and
            iterates over words that have ``.text`` and ``.pos_``.
        output: Optional dict of phrase -> count that is updated in place and
            returned; a new dict is created when it is None.
        keep: Container of part-of-speech tags to keep, or None.

    Returns:
        Either a list of phrases (``keep is None``) or ``(counts, phrases)``
        where ``counts`` maps phrase -> number of occurrences and ``phrases``
        lists the phrases in order of appearance (with repeats).
    """
    if keep is None:
        return [chunk.text.lower() for chunk in doc.noun_chunks]

    counts = {} if output is None else output
    phrases = []
    for chunk in doc.noun_chunks:
        kept_words = [token.text.lower() for token in chunk if token.pos_ in keep]
        if not kept_words:
            # Nothing survived the part-of-speech filter for this chunk.
            continue
        phrase = ' '.join(kept_words)
        counts[phrase] = counts.get(phrase, 0) + 1
        phrases.append(phrase)
    return counts, phrases

def increase_count(idict, key, freq):
    """Add ``freq`` to the counter at ``idict[key]``, starting from 0 when the key is absent.

    Args:
        idict: Dict of key -> count; modified in place.
        key: Counter to update.
        freq: Amount to add.
    """
    idict[key] = idict.get(key, 0) + freq


def add_to_dict(idict, key, value, freq=1):
    """Add ``freq`` to the nested counter ``idict[key][value]``.

    Missing levels are created on the fly: an absent ``key`` gets an empty
    inner dict and an absent ``value`` starts counting from 0.

    Args:
        idict: Dict of key -> {value -> count}; modified in place.
        key: Outer key.
        value: Inner key whose counter is increased.
        freq: Amount to add (default 1).
    """
    inner = idict.setdefault(key, {})
    inner[value] = inner.get(value, 0) + freq


def get_unique_values(idict, count_only=False):
    """De-duplicate the value collection stored under every key of ``idict``.

    Args:
        idict: Dict whose values are iterables of hashable items.
        count_only: When True, map each key to the number of distinct items;
            otherwise map each key to a list of the distinct items (the list
            order is whatever ``set`` iteration yields).

    Returns:
        A new dict with the same keys as ``idict``.
    """
    summarize = len if count_only else list
    return {key: summarize(set(values)) for key, values in idict.items()}


def save_np_info(np2count, np2reviews, np2rest, np2users, ofile, count_only=False):
    """Write the noun-phrase statistics to ``ofile`` as one JSON object.

    Args:
        np2count: Stored under the key ``"np2count"``.
        np2reviews: Stored under the key ``"np2reviews"``.
        np2rest: Stored under the key ``"np2rests"``.
        np2users: Stored under the key ``"np2users"``.
        ofile: Path of the JSON file to write; an existing file is overwritten.
        count_only: Accepted for backward compatibility; it currently has no
            effect on what is written.
    """
    output = {"np2count": np2count, "np2reviews": np2reviews,
              'np2rests': np2rest, 'np2users': np2users}
    # Use a context manager so the handle is flushed and closed even if
    # serialization fails part-way through.
    with open(ofile, 'w') as fout:
        json.dump(output, fout)
    print("Saved to", ofile)

def extract_raw_keywords_for_reviews(data, ofile,
                                     keep=['ADJ', 'NOUN', 'PROPN', 'VERB'], overwrite=False,
                                     review2keyword_ofile=None):
    """Extract keywords from every review in ``data`` and save aggregate counts.

    Each review text is parsed with the module-level ``nlp`` pipeline and
    reduced to keywords by ``get_noun_phrases``. Occurrences are accumulated
    overall and per review, per restaurant and per user, then written to
    ``ofile`` through ``save_np_info``.

    Args:
        data: Table with 'review_id', 'user_id', 'rest_id' and 'text' columns
            that supports ``len()`` (presumably a pandas DataFrame).
        ofile: Path of the JSON file receiving the aggregate counts.
        keep: Part-of-speech tags forwarded to ``get_noun_phrases``. Must not
            be None, because ``get_noun_phrases`` then returns a plain list
            instead of the ``(counts, keywords)`` pair unpacked here.
        overwrite: When False and ``ofile`` already exists, print a message
            and return without doing any work.
        review2keyword_ofile: Optional CSV path; when given, a table with the
            columns 'Review_ID' and 'Keywords' is written there.

    Returns:
        None.
    """
    if os.path.isfile(ofile) and not overwrite:
        print("Existing output file. Stop! (set overwrite=True to overwrite)")
        return

    np2count = {}          # keyword -> total number of occurrences
    np2review2count = {}   # keyword -> {review_id: occurrences}
    np2rest2count = {}     # keyword -> {rest_id: occurrences}
    np2user2count = {}     # keyword -> {user_id: occurrences}
    review2keywords = {}   # review_id -> keywords in order of appearance

    rows = zip(data['review_id'], data['user_id'], data['rest_id'], data['text'])
    for rid, uid, restid, text in tqdm(rows, total=len(data)):
        if pd.isna(text):
            # A missing review text (e.g. the NaN pandas produces for an empty
            # CSV cell) is not a string and would make nlp() raise, aborting
            # the whole run before anything is saved. Record the review with
            # no keywords instead.
            review2keywords[rid] = []
            continue
        doc = nlp(text)
        review_counts, keywords = get_noun_phrases(doc, keep=keep)
        for keyword, freq in review_counts.items():
            increase_count(np2count, keyword, freq)
            add_to_dict(np2review2count, keyword, rid, freq)
            add_to_dict(np2rest2count, keyword, restid, freq)
            add_to_dict(np2user2count, keyword, uid, freq)
        review2keywords[rid] = keywords

    save_np_info(np2count, np2review2count, np2rest2count, np2user2count, ofile)
    if review2keyword_ofile is not None:
        df = pd.DataFrame({"Review_ID": list(review2keywords.keys()), "Keywords": list(review2keywords.values())})
        df.to_csv(review2keyword_ofile)


def load_split(sfile='/home/ubuntu/duc.nm195858/keyext_LLM/splits.json', city='singapore', setname='train'):
    """Return the entry stored for ``city`` / ``setname`` in a JSON split file.

    Args:
        sfile: Path to a JSON file shaped like ``{city: {setname: ...}}``.
        city: Top-level key to look up.
        setname: Second-level key to look up.

    Returns:
        The value found at ``[city][setname]``. The driver cell in this
        notebook uses it as a collection of user ids.

    Raises:
        KeyError: If ``city`` or ``setname`` is not present in the file.
    """
    # Close the file deterministically instead of leaving the handle to the
    # garbage collector.
    with open(sfile) as fin:
        splits = json.load(fin)
    return splits[city][setname]

### Extract keywords for train set
- Extract raw keywords from the train set.
- Then post-process them to select a subset of keywords.
- Optionally reuse the selected keywords for the test and dev sets (probably not required, but possible).

In [3]:
# Load the spaCy "en_core_web_sm" pipeline once. extract_raw_keywords_for_reviews
# looks up this module-level `nlp` at call time (it is not passed as an argument),
# so this cell has to run before that function is called.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Root directory of the project data; every input and output path used in this
# cell is derived from it, so relocating the data means editing one line.
KEYEXT_ROOT = '/home/ubuntu/duc.nm195858/keyext_LLM'

# Cities to process. Others listed for this project:
# ['charlotte', 'edinburgh', 'lasvegas', 'london', 'phoenix', 'pittsburgh', 'singapore']
CITIES = ['edinburgh']
# Splits to process; e.g. add 'train' or 'dev' to handle more of them.
sets = ['test']

for city in CITIES:
    # All reviews of one city; filtered per split below.
    dt = pd.read_csv(os.path.join(KEYEXT_ROOT, 'reviews', '{}.csv'.format(city)))
    for setname in sets:
        print("Processing for", city, setname)
        uids = load_split(sfile=os.path.join(KEYEXT_ROOT, 'splits.json'), city=city, setname=setname)
        # Keep only the reviews written by users that belong to this split.
        dt_set = dt[dt['user_id'].isin(uids)]
        odir = os.path.join(KEYEXT_ROOT, 'preprocessed', 'by_city-users_min_3_reviews',
                            'keywords_spacy', setname)
        mkdir(odir)
        extract_raw_keywords_for_reviews(
            dt_set,
            ofile=os.path.join(odir, city + '-keywords.json'),
            keep=['ADJ', 'NOUN', 'PROPN', 'VERB'],
            overwrite=True,
            review2keyword_ofile=os.path.join(odir, city + "-review2keywords.csv"),
        )

Processing for edinburgh test


100%|██████████| 1970/1970 [01:16<00:00, 25.86it/s]


Saved to /home/ubuntu/duc.nm195858/keyext_LLM/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/edinburgh-keywords.json
