# Ερώτηση 4

In [3]:
import pandas as pd
import numpy as np
import json
import time 

# File names of the three Yelp dataset files read by this notebook.
review_filename = 'yelp_academic_dataset_review.json'
users_filename = 'yelp_academic_dataset_user.json'
business_filename = 'yelp_academic_dataset_business.json'

# Folder that contains the files above. Change it to match your local dataset
# folder; keep the trailing slash, because the file names are appended to it by
# plain string concatenation.
dataset_path = '../yelp-dataset/'

In [4]:
# Parse the business file line by line and keep [business_id, name, review_count]
# for every business whose review_count is at least 50.
with open(dataset_path + business_filename, 'r', encoding="utf8") as business_file:
    parsed_businesses = map(json.loads, business_file)
    business_list = [
        [record['business_id'], record['name'], record['review_count']]
        for record in parsed_businesses
        if record['review_count'] >= 50
    ]

In [85]:
# Convert the list of [business_id, name, review_count] rows into a 2-D array.
# NumPy coerces the mixed str/int rows to a single string dtype, so the review
# counts are stored as text from here on (see the displayed output).
business_list = np.array(business_list)
business_list

array([['oiAlXZPIFm2nBCt0DHLu_Q', 'Green World Cleaners', '81'],
       ['x3Po6tJGb729u_HJPY6UCA', 'Glendale Glitters', '52'],
       ['Mmd5WDFq9hHcQ3uClngGjQ', "Irene's Tap Room", '79'],
       ...,
       ['fNil19SUfPAPnLQrYnFrGQ', 'Cheyenne West Animal Hospital', '124'],
       ['JjcJVqhZXhP4tvOhg3fnag', 'Water Heater Pros', '217'],
       ['SYa2j1boLF8DcGVOYfHPcA', 'Five Guys', '97']], dtype='<U64')

In [6]:
# B: ids (column 0 of business_list) of the businesses with at least 50 reviews.
business_ids = business_list[:, 0]
B = set(business_ids)
len(B)

31942

In [7]:
# Build one (user, business, rating) row per review of a business in B,
# and report how long the pass over the review file took.
start = time.time()

with open(dataset_path + review_filename, 'r', encoding="utf8") as reviews_file:
    parsed_reviews = map(json.loads, reviews_file)
    # The row list is handed straight to np.array so no second copy stays bound to a name.
    reviews_list = np.array([
        [review['user_id'], review['business_id'], review['stars']]
        for review in parsed_reviews
        if review['business_id'] in B
    ])

stop = time.time()
print("Read file in {:.3f} mins".format((stop-start)/60))

Read file in 1.155 mins


In [8]:
# Counting unique users that commented at least 20 times on businesses from the B set.
# Column 0 of reviews_list holds the user id of each review; np.unique returns the
# distinct ids together with how many reviews each one wrote.
users, counts = np.unique(reviews_list[:, 0], return_counts=True)

# A boolean mask replaces the element-by-element Python loop and keeps the
# string dtype of the ids even when no user reaches the threshold.
unique_users = users[counts >= 20]
len(unique_users)

40225

In [9]:
# Convert to a set so that the `in U` membership tests used when reading the
# user file and building the graph do not scan the whole array each time.
U = set(unique_users)

In [10]:
# Collect [user_id, list of friend ids] for every user that belongs to U,
# and report how long the pass over the user file took.
start = time.time()

with open(dataset_path + users_filename, 'r', encoding="utf8") as users_file:
    list_users = [
        # The 'friends' string is stripped of commas and split on whitespace.
        [profile['user_id'], profile['friends'].replace(',', '').split()]
        for profile in map(json.loads, users_file)
        if profile['user_id'] in U
    ]

stop = time.time()
print("Read file in {:.3f} mins".format((stop-start)/60))

Read file in 0.409 mins


In [11]:
# Number of users from U that were found in the user file.
len(list_users)

40225

## Creating G graph

In [13]:
import networkx as nx

# Undirected friendship graph: one edge per (user, friend) pair taken from
# list_users, skipping friends that are not in U.
G = nx.Graph()
G.add_edges_from(
    (user_id, friend_id)
    for user_id, friend_ids in list_users
    for friend_id in friend_ids
    if friend_id in U
)

In [15]:
# Node set of the connected component with the most nodes, and the
# corresponding subgraph of G.
largest_cc = max(nx.connected_components(G), key=len)
CC_max = G.subgraph(largest_cc)

### Is the largest connected component the same as the whole graph? (The difference below is 0 if it is)

In [17]:
# How many nodes of G lie outside the largest connected component.
G.number_of_nodes() - CC_max.number_of_nodes()

168

## Pruning

In [96]:
# Nodes of the largest connected component, as a set (iterating a graph yields its nodes).
user_set = set(CC_max)

In [97]:
# Show the review rows and how many there are before pruning.
print(reviews_list)
row_total = len(reviews_list)
print("length: {}".format(row_total))

[['-MhfebM0QIsKt87iDN-FNw' 'OwjRMXRC0KyPrIlcjaXeFQ' '2.0']
 ['HQl28KMwrEKHqhFrrDqVNQ' 'V34qejxNsCbcgD8C0HVk-Q' '5.0']
 ['5JxlZaqCnk1MnbgRirs40Q' 'ofKDkJKXSKZXu5xJNGiiBQ' '1.0']
 ...
 ['kDCyqlYcstqnoqnfBRS5Og' 'Scmyz7MK4TbXXYcaLZxIxQ' '5.0']
 ['VKVDDHKtsdrnigeIf9S8RA' 'lBuAACBEThaQHQGMzAlKpg' '3.0']
 ['2SbyRgHWuWNlq18eHAx95Q' 'fiA6ztHPONUkmX6yKIXyHg' '5.0']]
length: 5852610


In [104]:
# Prune reviews_list down to the reviews written by a user in U about a business in B.
# Rows of reviews_list are [user_id, business_id, stars], so user ids must be read
# from column 0 and business ids from column 1.
# np.isin needs a sequence rather than a set, hence the list() conversions.
user_in_U = np.isin(reviews_list[:, 0], list(U))
business_in_B = np.isin(reviews_list[:, 1], list(B))
keep_mask = user_in_U & business_in_B

# Every dropped row is listed exactly once, so the count printed below equals
# the number of reviews that were removed.
indexes_to_delete = np.flatnonzero(~keep_mask).tolist()
reviews_pruned = reviews_list[keep_mask]
print("{} indexes deleted".format(len(indexes_to_delete)))
# NOTE(review): user_set (the largest connected component) is not used by this
# pruning step; confirm whether users should be restricted to it instead of U.

3914996 indexes deleted


## Results from pruned table

In [102]:
# Rows of reviews_pruned are [user_id, business_id, stars]:
# column 0 holds the users and column 1 the businesses.
print("Businesses: {}".format(len(np.unique(reviews_pruned[:,1]))))
print("Users: {}".format(len(np.unique(reviews_pruned[:,0]))))
print("Reviews: {}".format(len(reviews_pruned)))

Businesses: 31914
Users: 40225
Reviews: 1937614
