In [1]:
import pandas as pd
import numpy as np
import json
import time

leftK_path = '/home/left/github/'

filename = 'yelp_academic_dataset_business.json'


start = time.time()
# Store all businesses
data = [json.loads(line) for line in 
        open(leftK_path + filename, 'r', encoding="utf8")]

stop = time.time()
print("Read file in {:.3f} mins".format((stop-start)/60))

Read file in 0.054 mins


In [2]:
# Keep businesses which are located in Toronto
data = [x for x in data if x['city']=='Toronto']
data = np.array(data) # Convert list to numpy array

# Create an array with Toronto located businesses with more than 15 reviews
business_col = np.array([])
for i in range(len(data)):
    if data[i]['review_count']>=15:
        business_col = np.append(business_col,data[i]['business_id'])

In [3]:
# Helping function to check if a business is in business_col (Toronto and >=15 reviews)
def get_business_index(business_id):
    return np.where(business_col == business_id)[0]

In [4]:
filename = 'yelp_academic_dataset_review.json'
start = time.time()

user_business = []
# Read the json review file line by line and keep the reviews referring to business_col 
with open(leftK_path + filename,'r',encoding="utf8") as reviews_file:

    for line in reviews_file:
        line = json.loads(line)
        ind = get_business_index(line['business_id'])

        if ind.size>0:
            user_business.append([line['user_id'],line['business_id'], line['date']])
            
stop = time.time()
print("Read file in {:.3f} mins".format((stop-start)/60))

Read file in 10.453 mins


In [66]:
# User business have all the reviews we want. But using numpy unique will keep the first occurency.
# We want to keep last occurency, so a smart way to do so is to read our list with all pairs of 
# type (UserID, BusinessID, Rating) upside-down(reverse). Then call unique and reconstruct the 
# total reviews with the given indices.
user_business.reverse()
from operator import itemgetter
user_bus_sorted = sorted(user_business, key=itemgetter(0,1))

user_bus_sorted = np.array(user_bus_sorted)
pairs = user_bus_sorted[:,0:2]
_, idx = np.unique(pairs, axis=0, return_index=True)
total_reviews = user_bus_sorted[idx]

In [67]:
dates = total_reviews[:,2]

In [68]:
for i in range(len(dates)):
    dates[i] = int(datetime.fromisoformat(dates[i]).timestamp())

In [69]:
# Take the userID column and keep the unique users. Do a for-loop to keep users with >=15 reviews.
users = total_reviews[:,0]
unique_users, counts = np.unique(users, return_counts=True)
uniq_users = []

for i in range(len(unique_users)):
    if counts[i]>=15:
        uniq_users.append(unique_users[i])
uniq_users = np.array(uniq_users)

# Here keep the unique businesses column
businesses = total_reviews[:,1]
unique_businesses = np.unique(businesses)

start = time.time()
# Create the sparse_array using the unique users(rows) and unique businesses (columns)
sparse = np.zeros((len(unique_businesses), len(uniq_users)))
for i in range(len(unique_businesses)):
    user_index = np.where(unique_businesses[i] == total_reviews)[0]

    for j in range(len(user_index)):
        unique_user_ind = np.where(total_reviews[user_index[j]][0] == uniq_users) 
        if unique_user_ind[0].size>0:
            sparse[i][unique_user_ind[0][0]] = total_reviews[user_index[j]][2]
            
stop = time.time()
print("Sparse array created in {:.3f} mins".format((stop-start)/60))

# Sparse array shape indicates the initial unique users and unqiue businesses
sparse.shape

Sparse array created in 2.873 mins


(7602, 5677)

In [71]:
# Prune until nothing to delete

stop=0
while(stop==0):
    # prune the rows(businesses with below 15 reviews)
    index_to_delete = []
    for i in range(sparse.shape[0]):
        nonzero_indexes = np.nonzero(sparse[i])
        if len(nonzero_indexes[0])<15:
            index_to_delete.append(i)
            
    #make new sparse array with deleted items and unique_businesses to track ids
    unique_businesses = np.delete(unique_businesses, index_to_delete)
    sparse = np.delete(sparse, index_to_delete, axis=0)
    
    #prune the columns(users)
    transposed_sparse = np.transpose(sparse)
    columns_to_delete = []
    for i in range(transposed_sparse.shape[0]):
        nonzero_values = np.nonzero(transposed_sparse[i])
        if len(nonzero_values[0])<15:
            columns_to_delete.append(i)
            
    #make new sparse array with deleted items and uniq_users to track ids
    uniq_users = np.delete(uniq_users, columns_to_delete)
    sparse = np.delete(sparse, columns_to_delete, axis=1)
    
    # If nothing to delete -> stop
    if len(index_to_delete)==0 and len(columns_to_delete)==0:
        stop=1

# After pruning we have our sparse array reshaped
sparse.shape

(4092, 4794)

In [79]:
# Create the new data, create a list([userID,businessID,rating]) using our sparse array data
data_csv = []
start = time.time()
for i in range(sparse.shape[1]):
    for j in range(sparse.shape[0]):
        if sparse[j][i]>0:
            data_csv.append([uniq_users[i],unique_businesses[j], sparse[j][i]])
stop = time.time()
print("Data list created in {:.3f} mins".format((stop-start)/60))

Data list created in 0.202 mins


In [83]:
data = sorted(data_csv, key=itemgetter(0,2))
data

[['--BumyUHiO_7YsHurb9Hkw', 'vcxvQyAggPqxcHwvJXvjGg', 1484307635.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'r_BrIgzYcwo1NAuG9dLbpg', 1484310204.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'iZJ5pdY558VodrEumGyVug', 1484836408.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'q5xrVJ4kivx_yEfJeOKNYQ', 1485189538.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'xsl-d_opm3AU5H2Z-im33g', 1485189965.0],
 ['--BumyUHiO_7YsHurb9Hkw', '41o1FUbCYKJv2djtnlkzlg', 1485190315.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'oOGLDf2rzeCPS7UQ8hhPlQ', 1485805005.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'N93EYZy9R0sdlEvubu94ig', 1485805728.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'sQ2dJZ-oHWe2rs0QFLOUJQ', 1485877000.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'Ibp4hEKSE8JaX9OvfEiFqg', 1486404837.0],
 ['--BumyUHiO_7YsHurb9Hkw', '_cjPEH9wXhKS-HQe_U3M4Q', 1486407735.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'QaxDKkqYTtVYZJcqBNTnvQ', 1487101016.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'iGEvDk6hsizigmXhDKs2Vg', 1487101685.0],
 ['--BumyUHiO_7YsHurb9Hkw', '-b94nkPVLQw95zgtDhcpYA', 1488223441.0],
 ['--BumyUHiO_7YsHurb9Hkw', 'pSMK_

In [90]:
data_np = np.array(data)

In [138]:
j=0
embedding = ['']
for i in range(data_np.shape[0]):
    if data_np[i][0] == data_np[i-1][0]:
        embedding[j] += ' ' + data[i][1]
    else:
        embedding.append(data[i][1])
        j+=1

In [116]:
ok = [] 
ok.append(data[0][1])
ok[0] += ' '+'ti les a'
ok

['vcxvQyAggPqxcHwvJXvjGg ti les a']

In [111]:
ok.append('rwefewfew')
ok

['vcxvQyAggPqxcHwvJXvjGg ti les a', 'rwefewfew', 'rwefewfew']

In [120]:
len(embedding)

4795

In [143]:
embedding[-1]

'Qmwfg-PtYuCo5Q_IwcA_HQ cCo3PiT9hm-eEA6ti0kP2w G24p1oGGfY3t-m8Z2lPCaQ nIIuxIn5RU0j-XkDWh3hlA nktQ_aBRp5CmVOkXR0Mz1g ynmTXY9yJDiNkO4HUmxRSQ 4_L9bnIvY8TLDfiTBGcnRA ZqiiLwzPLXSbZJDXFQEceQ CfxVkwEJk1NAqgqMSesLzA n2NRR4N-kmeDvhHJivHGkA WRPlRsDK47DFl3dz65zhYg g6GXqg-QdDiQGLYMVqNOUw 59c3Cp-p5i18xFCKUuCkFQ swM_9CXxkvxEk3fDkjA6PQ zFR99jgMi-qzaJXIx8MXHA v_UM-AW9YYTCbWUNa-w1Gg XSiqtcVEsP6dLOL7ZA9OxA cUxsG0rgpE18dMyvyw8Gdg ACBFbEW6oa58_RyX9Op-qQ lOKgoQtMhnlf6hWvrOiMtQ 9HWdRtNS0q4_UkEvL14IfA bNHeKmkBx5emT9xLfdWyjw FyUcIAn8timIFybYpOLbAw'

In [145]:
len(embedding)

4795