### Create splits
- Split by city (assuming the city is known)
- Keep only users having at least k reviews (k=3)

In [1]:
import pandas as pd 
import os   
from itertools import groupby
import random
import json


In [None]:
# Cities covered by the dataset; the filtering cell writes one review file per city.
CITIES = [
    'charlotte',
    'edinburgh',
    'lasvegas',
    'london',
    'phoenix',
    'pittsburgh',
    'singapore',
]

# Input: a single CSV holding the reviews of all cities.
ifile = '../../../data/input/reviews_all.csv'
# Output directory for the per-city filtered review files.
filtered_reviews_dir = '../../../data/preprocessed/by_city-users_min_3_reviews/reviews/'

# Load the full review table once; it is sliced per city further down.
reviews = pd.read_csv(ifile)

### Filter data: keep users having at least 3 reviews
<pre> 
Min number of reviews = 3 
City	    #org_users	#filtered_users	#org_reviews	#filtered_reviews
charlotte	69216	    13985	        178488	        112772
edinburgh	8502	    1484	        21205	        12753
lasvegas	362848	    66630	        783422	        428782
london	    31716	    5495	        73495	        42061
phoenix	    193144	    37923	        455941	        269871
pittsburgh	55554	    11395	        145384	        92449
singapore	5316	    1308	        17749	        12878

In [22]:
# For every city, keep only the reviews written by users who have at least
# `min_num_reviews` reviews in that city, save them to "<city>.csv" and print
# one tab-separated row of before/after user and review counts.
min_num_reviews = 3

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(filtered_reviews_dir, exist_ok=True)

print("\t".join(['City', '#org_users', '#filtered_users', '#org_reviews', '#filtered_reviews']))
for city in CITIES:
    ofile = os.path.join(filtered_reviews_dir, "{}.csv".format(city))
    dt = reviews[reviews['city'] == city]

    # Number of reviews per user within this city (one entry per distinct user).
    # NOTE: value_counts ignores rows whose user_id is missing.
    reviews_per_user = dt['user_id'].value_counts()
    kept_uids = reviews_per_user.index[reviews_per_user >= min_num_reviews]

    dt_filtered = dt[dt['user_id'].isin(kept_uids)]
    # The DataFrame index is written as the first (unnamed) column, as before.
    dt_filtered.to_csv(ofile)

    print("\t".join([
        city,
        str(len(reviews_per_user)),
        str(len(kept_uids)),
        str(len(dt)),
        str(len(dt_filtered)),
    ]))

City	#org_users	#filtered_users	#org_reviews	#filtered_reviews
Saved to  ../../../data/input/by_city/reviews_charlotte.csv
Saved to  ../../../data/input/by_city/reviews_edinburgh.csv
Saved to  ../../../data/input/by_city/reviews_lasvegas.csv
Saved to  ../../../data/input/by_city/reviews_london.csv
Saved to  ../../../data/input/by_city/reviews_phoenix.csv
Saved to  ../../../data/input/by_city/reviews_pittsburgh.csv
Saved to  ../../../data/input/by_city/reviews_singapore.csv


### Create splits 

In [26]:
# Fractions of each city's users assigned to train and dev;
# the users left over after both slices form the test set.
train_p = 0.8   # train proportion (#users)
dev_p = 0.05    # 5% for dev

# JSON file that receives the per-city train/dev/test user lists.
split_file = '../../../data/preprocessed/splits.json'

<pre>
[city, #train users, #dev users, #test users, %train users, %dev users, %test users]
['charlotte', 11188, 699, 2098, 0.8, 0.04998212370396854, 0.15001787629603147]
['edinburgh', 1187, 74, 223, 0.7998652291105122, 0.04986522911051213, 0.15026954177897575]
['lasvegas', 53304, 3331, 9995, 0.8, 0.049992495872730004, 0.15000750412726999]
['london', 4396, 274, 825, 0.8, 0.04986351228389445, 0.15013648771610555]
['phoenix', 30338, 1896, 5689, 0.7999894523112623, 0.04999604461672336, 0.15001450307201433]
['pittsburgh', 9116, 569, 1710, 0.8, 0.0499341816586222, 0.1500658183413778]
['singapore', 1046, 65, 197, 0.7996941896024465, 0.04969418960244648, 0.15061162079510704]

In [31]:
# Partition the users of every city into train/dev/test and save the user-id
# lists as JSON. The lists hold user ids, not reviews.
split = {}  # {city: {train: [], dev: [], test: []}}

# Dedicated, seeded generator: re-running this cell reproduces the same split
# and the state of the global `random` module is left untouched.
split_rng = random.Random(42)

for city in CITIES:
    # The filtering cell writes "<city>.csv" into filtered_reviews_dir, so read
    # that exact name (the former "reviews_<city>.csv" does not match it).
    dt = pd.read_csv(os.path.join(filtered_reviews_dir, "{}.csv".format(city)))

    # Unique users, sorted first: set iteration order is not guaranteed to be
    # the same between runs, so shuffling an unsorted list is not reproducible.
    users = sorted(set(dt['user_id']))
    split_rng.shuffle(users)

    num_train = int(len(users) * train_p)
    num_dev = int(len(users) * dev_p)
    train_users = users[:num_train]
    dev_users = users[num_train:num_train + num_dev]
    test_users = users[num_train + num_dev:]  # everything that is left

    split[city] = {'train': train_users, 'dev': dev_users, 'test': test_users}
    print([city, len(train_users), len(dev_users), len(test_users), len(train_users)/len(users), len(dev_users)/len(users), len(test_users)/len(users)])

# Context manager so the file is flushed and closed as soon as it is written.
with open(split_file, 'w') as fout:
    json.dump(split, fout)

['charlotte', 11188, 699, 2098, 0.8, 0.04998212370396854, 0.15001787629603147]
['edinburgh', 1187, 74, 223, 0.7998652291105122, 0.04986522911051213, 0.15026954177897575]
['lasvegas', 53304, 3331, 9995, 0.8, 0.049992495872730004, 0.15000750412726999]
['london', 4396, 274, 825, 0.8, 0.04986351228389445, 0.15013648771610555]
['phoenix', 30338, 1896, 5689, 0.7999894523112623, 0.04999604461672336, 0.15001450307201433]
['pittsburgh', 9116, 569, 1710, 0.8, 0.0499341816586222, 0.1500658183413778]
['singapore', 1046, 65, 197, 0.7996941896024465, 0.04969418960244648, 0.15061162079510704]


In [5]:
# check statistic
# Reload the saved user split and point at the per-city filtered review files.
ifile = '../../../data/preprocessed/splits.json'
rdir = '../../../data/preprocessed/by_city-users_min_3_reviews/reviews/'
# Context manager so the file handle is closed right after parsing.
with open(ifile) as fin:
    split = json.load(fin)


In [13]:
# For every city and every set of its split, report as CSV rows:
# number of reviews, number of users in the set, number of distinct restaurants.
print(','.join(['City', 'set', '#reviews', '#users', "#restaurants"]))
lines = []
for city, city_split in split.items():
    city_path = os.path.join(rdir, '{}.csv'.format(city))
    dt = pd.read_csv(city_path)
    for setname, uids in city_split.items():
        # Reviews written by the users that belong to this set.
        in_set = dt['user_id'].isin(uids)
        set_reviews = dt[in_set]
        distinct_rests = set(set_reviews['rest_id'])
        fields = [city, setname, len(set_reviews), len(uids), len(distinct_rests)]
        lines.append(','.join(str(field) for field in fields))
print("\n".join(lines))


City,set,#reviews,#users,#restaurants
charlotte
edinburgh
lasvegas
london
phoenix
pittsburgh
singapore
charlotte,train,90426,11188,886
charlotte,dev,5611,699,854
charlotte,test,16735,2098,885
edinburgh,train,10342,1187,938
edinburgh,dev,441,74,272
edinburgh,test,1970,223,678
lasvegas,train,343524,53304,868
lasvegas,dev,20572,3331,868
lasvegas,test,64686,9995,868
london,train,33990,4396,986
london,dev,1849,274,749
london,test,6222,825,974
phoenix,train,216488,30338,947
phoenix,dev,13571,1896,947
phoenix,test,39812,5689,947
pittsburgh,train,73558,9116,905
pittsburgh,dev,4784,569,857
pittsburgh,test,14107,1710,904
singapore,train,10615,1046,983
singapore,dev,707,65,478
singapore,test,1556,197,742


In [15]:
# check % restaurants in test and dev that appear in train
# One CSV row per (city, set): distinct restaurants reviewed in the set, how
# many of them are also reviewed by train users, and that count as a fraction.
print(','.join(['City', 'set', '#restaurants', '#rest_in_train', '%rest_in_train']))
lines = []
for city, s in split.items():
    dt = pd.read_csv(os.path.join(rdir, '{}.csv'.format(city)))
    train_rests = set(dt[dt['user_id'].isin(s['train'])]['rest_id'])
    # Same computation for both held-out sets, in the original row order.
    for setname in ('dev', 'test'):
        set_rests = set(dt[dt['user_id'].isin(s[setname])]['rest_id'])
        n_in_train = len(train_rests & set_rests)
        # A set without any restaurant (e.g. an empty dev set for a very small
        # city) has no defined fraction; report nan instead of dividing by zero.
        ratio = n_in_train / len(set_rests) if set_rests else float('nan')
        lines.append(','.join([city, setname, str(len(set_rests)), str(n_in_train), str(ratio)]))
print("\n".join(lines))


City,set,#restaurants,#rest_in_train,%rest_in_train
charlotte,dev,854,854,1.0
charlotte,test,885,885,1.0
edinburgh,dev,272,272,1.0
edinburgh,test,678,669,0.9867256637168141
lasvegas,dev,868,868,1.0
lasvegas,test,868,868,1.0
london,dev,749,749,1.0
london,test,974,974,1.0
phoenix,dev,947,947,1.0
phoenix,test,947,947,1.0
pittsburgh,dev,857,857,1.0
pittsburgh,test,904,904,1.0
singapore,dev,478,478,1.0
singapore,test,742,742,1.0
