In [3]:
import spacy
import os 
from pprint import pprint
import json 
import pandas as pd 
from tqdm import tqdm 

In [25]:
def mkdir(idir):
    """Create directory `idir`, including missing parents, if it does not exist.

    Args:
        idir: path of the directory to create.

    Raises:
        FileExistsError: if `idir` exists but is not a directory.
    """
    # exist_ok=True replaces the separate isdir() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(idir, exist_ok=True)

def get_noun_phrases(doc, output=None, keep=None):
    """Collect lower-cased noun phrases from `doc.noun_chunks`.

    Args:
        doc: object exposing `noun_chunks`; each chunk has `.text` and is
            iterable over tokens that have `.text` and `.pos_`.
        output: optional dict mapping phrase -> count; updated in place when
            given, otherwise a new dict is created. Ignored when `keep` is None.
        keep: collection of POS tags to retain inside each chunk. When None,
            no filtering is done.

    Returns:
        When `keep` is None: a list with the lower-cased text of every chunk.
        Otherwise: a tuple `(counts, phrases)` where `counts` is the
        phrase -> count dict and `phrases` lists the filtered phrases in
        chunk order (duplicates included).
    """
    if keep is None:
        return [chunk.text.lower() for chunk in doc.noun_chunks]

    counts = {} if output is None else output
    phrases = []
    for chunk in doc.noun_chunks:
        # Keep only the tokens whose POS tag is in `keep`.
        kept_tokens = [token.text.lower() for token in chunk if token.pos_ in keep]
        if not kept_tokens:
            # Nothing survived the POS filter; the chunk contributes no phrase.
            continue
        phrase = ' '.join(kept_tokens)
        counts[phrase] = counts.get(phrase, 0) + 1
        phrases.append(phrase)
    return counts, phrases

def increase_count(idict, key, freq):
    """Add `freq` to the counter stored under `key` in `idict` (in place).

    A missing key is treated as a count of 0.
    """
    idict[key] = idict.get(key, 0) + freq


def add_to_dict(idict, key, value, freq=1):
    """Add `freq` to the nested counter `idict[key][value]` (in place).

    Both the inner dict for `key` and the counter for `value` are created
    on first use, starting from 0.
    """
    per_key = idict.setdefault(key, {})
    per_key[value] = per_key.get(value, 0) + freq


def get_unique_values(idict, count_only=False):
    """Reduce every value of `idict` to its distinct elements.

    Args:
        idict: dict whose values are iterables of hashable items (iterating
            a dict value yields its keys).
        count_only: when True, return the number of distinct elements
            instead of the elements themselves.

    Returns:
        A new dict with the same keys, mapping to either an int (count) or
        a list of the distinct elements (order as produced by `set`).
    """
    result = {}
    for key, values in idict.items():
        distinct = set(values)
        result[key] = len(distinct) if count_only else list(distinct)
    return result


def save_np_info(np2count, np2reviews, np2rest, np2users, ofile, count_only=False):
    """Write the noun-phrase statistics to `ofile` as one JSON object.

    Args:
        np2count: value stored under the "np2count" key.
        np2reviews: value stored under the "np2reviews" key.
        np2rest: value stored under the "np2rests" key.
        np2users: value stored under the "np2users" key.
        ofile: path of the JSON file to write (overwritten if present).
        count_only: currently unused; kept so existing callers keep working.
    """
    output = {"np2count": np2count, "np2reviews": np2reviews,
              'np2rests': np2rest, 'np2users': np2users}
    # Use a context manager so the file is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous handle.
    with open(ofile, 'w') as fout:
        json.dump(output, fout)
    print("Saved to", ofile)

def extract_raw_keywords_for_reviews(data, ofile, 
                                     keep=['ADJ', 'NOUN', 'PROPN', 'VERB'], overwrite=False, 
                                     review2keyword_ofile=None):
    """Extract POS-filtered noun phrases from every review and save the statistics.

    Each review text is parsed with the module-level `nlp` callable, its noun
    phrases are obtained via `get_noun_phrases(doc, keep=keep)`, and the counts
    are accumulated overall and per review / restaurant / user. The aggregated
    dicts are written with `save_np_info`.

    Args:
        data: table indexable by the columns 'review_id', 'user_id',
            'rest_id' and 'text' and supporting `len()`.
        ofile: JSON output path for the aggregated counts.
        keep: POS tags retained inside each noun chunk (never mutated here).
        overwrite: when False and `ofile` already exists, nothing is done.
        review2keyword_ofile: optional CSV path; when given, a table with the
            columns "Review_ID" and "Keywords" is written there.

    Returns:
        None.
    """
    output_exists = os.path.isfile(ofile)
    if output_exists and not overwrite:
        print("Existing output file. Stop! (set overwrite=True to overwrite)")
        return

    phrase_totals = {}       # phrase -> total frequency
    phrase_by_review = {}    # phrase -> {review_id: frequency}
    phrase_by_rest = {}      # phrase -> {rest_id: frequency}
    phrase_by_user = {}      # phrase -> {user_id: frequency}
    review2keywords = {}     # review_id -> phrases in order of appearance

    rows = zip(data['review_id'], data['user_id'], data['rest_id'], data['text'])
    for rid, uid, restid, text in tqdm(rows, total=len(data)):
        # Phrase counts and ordered phrase list for this single review.
        review_counts, keywords = get_noun_phrases(nlp(text), keep=keep)
        for phrase, freq in review_counts.items():
            increase_count(phrase_totals, phrase, freq)
            for table, owner in ((phrase_by_review, rid),
                                 (phrase_by_rest, restid),
                                 (phrase_by_user, uid)):
                add_to_dict(table, phrase, owner, freq)
        review2keywords[rid] = keywords

    save_np_info(phrase_totals, phrase_by_review, phrase_by_rest, phrase_by_user, ofile)

    if review2keyword_ofile is None:
        return
    review_table = pd.DataFrame({
        "Review_ID": list(review2keywords.keys()),
        "Keywords": list(review2keywords.values()),
    })
    review_table.to_csv(review2keyword_ofile)


def load_split(sfile='../../../data/preprocessed/splits.json', city='singapore', setname='train'):
    """Return the entry stored under `[city][setname]` in the splits JSON file.

    Args:
        sfile: path to a JSON file shaped like {city: {setname: ...}}.
        city: top-level key to select.
        setname: second-level key to select (e.g. 'train', 'test', 'dev').

    Raises:
        KeyError: if `city` or `setname` is missing from the file.
    """
    # Context manager closes the handle promptly; the original left an
    # anonymous open() for the garbage collector to clean up.
    with open(sfile) as fin:
        splits = json.load(fin)
    return splits[city][setname]

### Extract keywords for train set
- Extract raw keywords from the train set.
- Then apply post-processing to choose a subset of those keywords.
- Use these keywords for the test and dev sets as well? (Probably not needed, but it can be done.)

In [14]:
# Load spaCy's "en_core_web_sm" pipeline into the notebook-level name `nlp`.
# extract_raw_keywords_for_reviews calls `nlp(text)` on every review, so this
# cell must be executed before that function is invoked.
nlp = spacy.load("en_core_web_sm")

In [26]:
# Run raw keyword extraction for every (city, split) combination.
# The recorded output shows the large cities take from tens of minutes to
# hours per split, so expect a long run.
CITIES = ['charlotte', 'edinburgh', 'lasvegas', 'london', 'phoenix', 'pittsburgh', 'singapore']
sets = ['train', 'test', 'dev']
for city in CITIES: 
    # Read the city's reviews once and reuse the frame for all three splits.
    dt = pd.read_csv('../../../data/preprocessed/by_city-users_min_3_reviews/reviews/{}.csv'.format(city))
    for setname in sets: 
        print("Processing for", city, setname)
        # Keep only the reviews whose user_id is listed in this split
        # (load_split is expected to return the split's user ids).
        uids = load_split(city=city, setname=setname)
        dt_set = dt[dt['user_id'].isin(uids)]
        # One output directory per split; files inside are named by city.
        odir = '../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/' + setname
        mkdir(odir)
        extract_raw_keywords_for_reviews(
            dt_set,
            ofile=os.path.join(odir, city + '-keywords.json'),
            keep=['ADJ', 'NOUN', 'PROPN', 'VERB'],
            overwrite=True,  # always regenerate, even when the JSON already exists
            review2keyword_ofile=os.path.join(odir, city + "-review2keywords.csv"),
        )

Processing for charlotte train


100%|██████████| 90426/90426 [25:51<00:00, 58.29it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/charlotte-keywords.json
Processing for charlotte test


100%|██████████| 16735/16735 [04:52<00:00, 57.20it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/charlotte-keywords.json
Processing for charlotte dev


100%|██████████| 5611/5611 [01:33<00:00, 59.78it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/charlotte-keywords.json
Processing for edinburgh train


100%|██████████| 10342/10342 [03:37<00:00, 47.51it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/edinburgh-keywords.json
Processing for edinburgh test


100%|██████████| 1970/1970 [00:42<00:00, 46.14it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/edinburgh-keywords.json
Processing for edinburgh dev


100%|██████████| 441/441 [00:08<00:00, 52.04it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/edinburgh-keywords.json
Processing for lasvegas train


100%|██████████| 343524/343524 [1:44:00<00:00, 55.05it/s]  


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/lasvegas-keywords.json
Processing for lasvegas test


100%|██████████| 64686/64686 [19:16<00:00, 55.91it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/lasvegas-keywords.json
Processing for lasvegas dev


100%|██████████| 20572/20572 [05:56<00:00, 57.68it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/lasvegas-keywords.json
Processing for london train


100%|██████████| 33990/33990 [11:20<00:00, 49.91it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/london-keywords.json
Processing for london test


100%|██████████| 6222/6222 [02:02<00:00, 50.60it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/london-keywords.json
Processing for london dev


100%|██████████| 1849/1849 [00:36<00:00, 51.36it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/london-keywords.json
Processing for phoenix train


100%|██████████| 216488/216488 [59:12<00:00, 60.94it/s] 


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/phoenix-keywords.json
Processing for phoenix test


100%|██████████| 39812/39812 [10:50<00:00, 61.17it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/phoenix-keywords.json
Processing for phoenix dev


100%|██████████| 13571/13571 [03:44<00:00, 60.34it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/phoenix-keywords.json
Processing for pittsburgh train


100%|██████████| 73558/73558 [13:06:00<00:00,  1.56it/s]    


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/pittsburgh-keywords.json
Processing for pittsburgh test


100%|██████████| 14107/14107 [04:41<00:00, 50.15it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/pittsburgh-keywords.json
Processing for pittsburgh dev


100%|██████████| 4784/4784 [01:28<00:00, 54.26it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/pittsburgh-keywords.json
Processing for singapore train


100%|██████████| 10615/10615 [03:54<00:00, 45.30it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/train/singapore-keywords.json
Processing for singapore test


100%|██████████| 1556/1556 [00:32<00:00, 47.55it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/test/singapore-keywords.json
Processing for singapore dev


100%|██████████| 707/707 [00:14<00:00, 48.99it/s]


Saved to ../../../data/preprocessed/by_city-users_min_3_reviews/keywords_spacy/dev/singapore-keywords.json
