In [53]:
import ast
import json
import os
import re
import time

import pandas as pd

# Part 0: Read the crawled data in Step1.
### Note: I load the .csv file with the pandas package here because the data needs some fairly complicated preprocessing, and pandas is extremely useful for that. After the preprocessing is finished, I will dump the DataFrame into a dict for later use.

In [54]:
# Load the crawled restaurants; the CSV's first column is read as the index and
# then turned back into a regular column named 'id'.
# NOTE(review): the rename only takes effect if that index column is unnamed in
# the CSV, so that reset_index() labels it 'index' - confirm against the file.
rest = (
    pd.read_csv('./cache/restaurant_dataset.csv', index_col=0)
    .reset_index()
    .rename(columns={'index': 'id'})
)
# Number of restaurants; later cells loop over range(rest_total).
rest_total = rest.shape[0]
rest

Unnamed: 0,id,Name,url,Rating,Rating_food,Rating_service,Rating_value,cuisine,Review_num,Neighborhood,Price_low,Price_high,Strong_tag,Comments,img_url
0,0,12 Chairs,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Mediterranean', 'Middle Eastern', 'Israeli']",255,Downtown Manhattan (Downtown),2,3,"['brunch', 'hummus', 'shakshuka', 'sabich', 'l...",Delicious Israeli Food 😋Fantastic FindGood foo...,https://media-cdn.tripadvisor.com/media/photo-...
1,1,15 EAST @ Tocqueville,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['French', 'Japanese', 'Sushi']",451,Union Square,4,4,[],Always a Perfect MealFusion French-Jpanese foo...,https://media-cdn.tripadvisor.com/media/photo-...
2,2,230 Fifth,https://www.tripadvisor.com/Restaurant_Review-...,4.0,3.5,3.5,3.5,"['American', 'Bar', 'Fusion']",4019,Tenderloin,2,3,"['brunch', 'the empire state building', 'rooft...",Completely Miss soldThe most amazing view!230 ...,https://media-cdn.tripadvisor.com/media/photo-...
3,3,2nd Avenue Deli,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,3.5,"['Deli', 'Israeli', 'Vegetarian Friendly']",1198,Kips Bay,2,3,"['corned beef', 'matzo ball soup', 'coleslaw',...",Very good but service could be betterA must in...,https://media-cdn.tripadvisor.com/media/photo-...
4,4,2 Bros Pizza,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.0,5.0,"['Italian', 'American', 'Pizza']",247,Midtown,1,1,"['pizza', 'cheese slices', 'cheap lunch', 'new...",TastyGood cheap lunchJust okay.Best pizza!Deli...,https://media-cdn.tripadvisor.com/media/photo-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,966,Zabar's,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.0,3.5,4.0,"['American', 'Deli', 'Vegetarian Friendly']",522,Upper West Side,1,1,"['bagels', 'cheese selection', 'smoked salmon'...",Yum to the maxVery TastyOverratedMaybe the bes...,https://media-cdn.tripadvisor.com/media/photo-...
967,967,Zen Ramen & Sushi,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Japanese', 'Sushi', 'Asian']",164,Tenderloin,2,3,"['ramen', 'chicken', 'visit nyc', 'service was...",Food was not good!Fabulous FoodSo goodAwesome ...,https://media-cdn.tripadvisor.com/media/photo-...
968,968,Zero Otto Nove,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Italian', 'Pizza', 'Southern-Italian']",268,Flatiron District,2,3,"['pasta', 'pie', 'arthur avenue', 'great itali...",What a findGreat Italian foodA gem in Gramercy...,https://media-cdn.tripadvisor.com/media/photo-...
969,969,Zoob Zib Thai Authentic Noodle Bar,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.5,"['Asian', 'Thai', 'Vegetarian Friendly']",238,Midtown West,2,3,"['pad thai', 'noodle soup', 'grilled pork', 'l...",We’ll worth a visitAuthentic Thai food that wo...,https://media-cdn.tripadvisor.com/media/photo-...


# Part 1: Matching tags for those restaurants that don't have existing tags to crawl.
### The main procedure is shown below. 
1. Create a dictionary called tag_dict to store the inverted index of each Strong_tag. For example, "chips":{1,2,3,4} means that restaurants 1, 2, 3 and 4 have the Strong tag "chips". Tags used by fewer than 5 restaurants are dropped from the index.
2. Find the restaurants that have fewer than 8 tags, and search their review texts for the tags in tag_dict.
3. If new tags are matched in the reviews, add them (at most 10 per restaurant, most frequently mentioned first) to the restaurant's tags and update tag_dict.
4. Finally, every restaurant whose reviews mention enough known tags will have at least 8 tags, and the inverted index is stored in tag_dict.

In [55]:
# Thresholds of the tag pipeline, named so they can be tuned in one place.
MIN_RESTAURANTS_PER_TAG = 5   # tags used by fewer restaurants are dropped from tag_dict
MIN_TAGS_PER_RESTAURANT = 8   # restaurants with fewer tags get extra tags from their reviews
MAX_MATCHED_TAGS = 10         # at most this many review-matched tags are added per restaurant

# Tags that are filtered out of every restaurant's tag list.
word_to_remove = ["nyc", "ny", "new york", "", "ave", 'dish', 'dishes', 'city']


def parse_list_repr(raw) -> list:
    """Convert a stringified Python list such as "['a', 'b']" into a list of str.

    The value is parsed with ast.literal_eval, so items that contain an
    apostrophe (stored as "new year's eve" with double quotes) or a comma keep
    their exact text, and "[]" becomes an empty list.

    Args:
        raw: The cell value. A list/tuple is returned as a list of str (this
            keeps the cell re-runnable after the column has been converted);
            any other non-string value (e.g. NaN) yields an empty list.

    Returns:
        A new list of strings.
    """
    if isinstance(raw, (list, tuple)):
        return [str(item) for item in raw]
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if not isinstance(parsed, (list, tuple)):
        # Not a valid list literal: fall back to the simple quote-stripping split.
        parsed = raw.replace("'", "").strip('][').split(', ')
    return [str(item) for item in parsed]


# Per-restaurant tag lists (position i belongs to row i of `rest`).
# dict.fromkeys() drops duplicate tags while keeping their order, so a restaurant
# is listed only once per tag in the inverted index; the comprehension removes
# every occurrence of a filtered word, not just the first one.
tag_list = [
    [tag for tag in dict.fromkeys(parse_list_repr(raw)) if tag not in word_to_remove]
    for raw in rest['Strong_tag']
]

# Inverted index: tag -> list of restaurant positions that carry the tag.
tag_dict = {}
for idx, tags in enumerate(tag_list):
    for tag in tags:
        tag_dict.setdefault(tag, []).append(idx)

# Keep only tags shared by enough restaurants to be useful for linking.
tag_dict = {
    tag: members
    for tag, members in tag_dict.items()
    if len(members) >= MIN_RESTAURANTS_PER_TAG
}

# Top up restaurants that have too few tags with known tags found in their reviews.
comments = rest['Comments']
for idx, tags in enumerate(tag_list):
    if len(tags) >= MIN_TAGS_PER_RESTAURANT:
        continue

    raw_comment = comments.iloc[idx]
    if not isinstance(raw_comment, str):
        # Missing review text (e.g. NaN): nothing to match against.
        continue
    comment = raw_comment.lower()  # lower-cased once per restaurant, not once per tag

    already_tagged = set(tags)
    # Plain substring match of each known tag against the review text.
    matches = [
        (comment.count(tag), tag)
        for tag in tag_dict
        if tag in comment and tag not in already_tagged
    ]
    # Most frequently mentioned first; equal counts are ordered by tag, descending.
    matches.sort(reverse=True)

    for _, tag in matches[:MAX_MATCHED_TAGS]:
        tags.append(tag)           # `tags` is the list stored in tag_list[idx]
        tag_dict[tag].append(idx)

rest['tags'] = tag_list
rest['cuisine'] = rest['cuisine'].apply(parse_list_repr)

# Then, dump the restaurant data, with only the useful fields, into a .json file for later use. From this point on, we don't need the .csv file any more.

In [56]:
# Serialise every column except the raw 'Strong_tag' and 'Comments' text as
# {row index: {column: value}} and write it to restaurants.json.
dumped_json_cache = json.dumps(
    rest.drop(columns=['Strong_tag', 'Comments']).to_dict('index')
)
# The context manager closes the file even if the write raises.
with open('restaurants.json', "w", encoding="utf-8") as fw:
    fw.write(dumped_json_cache)

# Part 2: Build the graph structure
### This part involves the following procedures:
1. Since we have built the inverted index, we can add the tags to the connections. For example, if "chips":{1,2}, then we add 1:(2:{["chips"]}) into the graph.
2. After all nodes are connected, filter out the weak connections (for example, if Restaurants 1 and 2 share only 2 tags, which is below the threshold of 4 shared tags, the link will be broken). It might occur that some restaurants are not connected to any others after the filtering. I wrote some code to prevent this situation, and ensured that all nodes have at least one connection to another node. But there is still a possibility that one group of nodes is not connected to another group of nodes. Therefore, the demo may report that Restaurant 1 and Restaurant 2 are not connected.
3. Finally, I will store the graph structure in a json file. The json file will be read in the demo part.

In [7]:
# Expand the inverted index into a nested adjacency mapping:
# graph_dict[str(a)][str(b)] is the list of tags that restaurants a and b share.
# Every ordered pair of distinct restaurants carrying the same tag gets that tag
# appended to its edge, so edges are created in both directions.
graph_dict = {}
for tag, members in tag_dict.items():
    for src in members:
        for dst in members:
            if src == dst:
                continue  # no self-links
            neighbours = graph_dict.setdefault(str(src), {})
            neighbours.setdefault(str(dst), []).append(tag)

# Prune weak links: an edge survives only if the two restaurants share at least
# 4 tags. A restaurant that would lose all of its edges keeps the single edge
# with the most shared tags (the first such edge when several tie).
for node, neighbours in graph_dict.items():
    best_peer, best_tags = 0, []
    for peer, shared in neighbours.items():
        if len(shared) > len(best_tags):
            best_peer, best_tags = peer, shared

    # Collect first, delete afterwards, so the dict is not resized mid-iteration.
    weak_peers = [peer for peer, shared in neighbours.items() if len(shared) < 4]
    for peer in weak_peers:
        del neighbours[peer]

    if not neighbours:
        neighbours[best_peer] = best_tags

# Make every edge bidirectional: if a -> b exists but b -> a does not, add
# b -> a with the same tag list.
# Only nodes that actually exist in graph_dict are visited, so a restaurant with
# no entry in the graph no longer raises KeyError, and only existing edges are
# walked instead of probing every pair of restaurants. Nodes are processed in
# numeric id order so the resulting key order matches a scan over range(n).
for node in sorted(graph_dict, key=int):
    # Snapshot the edges: reverse edges are inserted into other nodes' dicts below.
    for peer, shared_tags in list(graph_dict[node].items()):
        reverse_edges = graph_dict.setdefault(str(peer), {})
        if node not in reverse_edges:
            reverse_edges[node] = shared_tags

# Persist the adjacency mapping to graph_dict.json.
dumped_json_cache = json.dumps(graph_dict)
# The context manager closes the file even if the write raises.
with open('graph_dict.json', "w", encoding="utf-8") as fw:
    fw.write(dumped_json_cache)