### Their code

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import os
import json

In [2]:
def load_reviews(file_name):
    f = open(file_name) 
    data = json.load(f)

    df_list = []
    for restaurant, reviews in data.items():
        normalized_df = pd.json_normalize(reviews)
        normalized_df['restaurant'] = restaurant
        df_list.append(normalized_df)

    return pd.concat(df_list, ignore_index=True)

reviews = []
for file_name in os.listdir('reviews'):
    with open('reviews/' + file_name, "r") as json_file:
        reviews.append(load_reviews('reviews/' + file_name))
        
df_reviews = pd.concat(reviews, ignore_index=True)
df_reviews.drop_duplicates()

Unnamed: 0,photoCount,reviewCount,eliteYear,localizedDate,rating,comment.text,comment.language,restaurant
0,1012.0,321.0,,11/2/2018,5.0,*Tartinery is one of the food vendors at Hudso...,en,tartinery-new-york-15
1,190.0,1421.0,,8/6/2023,3.0,Overall the service is very slow. We ordered 2...,en,tartinery-new-york-15
2,117.0,294.0,2023.0,10/17/2022,4.0,Yum I love a place that has interesting drinks...,en,tartinery-new-york-15
3,154.0,171.0,2023.0,11/29/2021,5.0,I always look for a comfortable go to bar/rest...,en,tartinery-new-york-15
4,6668.0,2143.0,2023.0,4/6/2022,3.0,Tartinery is where you go for happy hour wine ...,en,tartinery-new-york-15
...,...,...,...,...,...,...,...,...
331688,301.0,77.0,2023.0,5/21/2022,4.0,Great brunch spot. Cute decor. The coffee was ...,en,brownstone-pancake-factory-edgewater
331689,324.0,114.0,2023.0,10/1/2022,4.0,An &#34;All American Dinner&#34; that is affor...,en,brownstone-pancake-factory-edgewater
331690,24.0,10.0,,9/10/2023,5.0,Went today and got the triple d buffalo chicke...,en,brownstone-pancake-factory-edgewater
331691,5.0,35.0,,5/7/2023,5.0,We came to Brownstone as a group of 30 on a bu...,en,brownstone-pancake-factory-edgewater


In [3]:
def load_restaurant_data(file_name):
    f = open(file_name) 
    data = json.load(f)
    df_list = []
    for restaurant, info in data.items():
        normalized_df = pd.json_normalize(info)
        normalized_df['restaurant'] = restaurant
        filtered_df = normalized_df.dropna(axis=1, how='all')
        df_list.append(filtered_df)

    # Concatenate all dataframes
    df_restaurants = pd.concat(df_list, ignore_index=True)
    return flatten_category(df_restaurants)


def extract_titles(row):
    # Extracting 'title' from each column if it is not None, otherwise using None
    return [row[i]['title'] if row[i] is not None else None for i in range(len(row))]

def flatten_category(df):
    new_category = pd.json_normalize(df['categories'])
    category_df = new_category.apply(extract_titles, axis=1, result_type='expand')

    # Renaming the columns
    category_df.columns = [f'category_{i}' for i in range(new_category.shape[1])]

    # Concatenate with the original dataframe if needed
    df = pd.concat([df, category_df], axis=1)
    return df


df_restaurants = load_restaurant_data('restaurants.json')

### My code

Concatenate all reviews per restaurant into a "document" that describes that restaurant.

In [3]:
restaurant_docs = {}
for restaurant, data in df_reviews.groupby('restaurant'):
    restaurant_docs[restaurant] = ' '.join(data['comment.text'])

Concatenate all reviews from all michelin restaurants into a "michelin document" that captures Michelin diction.

In [4]:
with open('michelin_alias.json') as fp:
    michelins = set(json.load(fp).values())

In [139]:
michelin_doc = ' '.join(doc for restaurant, doc in restaurant_docs.items() if restaurant in michelins)

Use TF-IDF to determine the similarities between each restaurant "document" and the "michelin document". The idea is that the restaurants with the highest similarities are most likely to be Michelin restaurants. The high ranking restaurants that are not already Michelin might be more likely to become Michelin in the future.

In [85]:
aliases = ['michelin_doc'] + list(restaurant_docs.keys())
documents = [michelin_doc] + list(restaurant_docs.values())

In [68]:
tfidf = TfidfVectorizer().fit_transform(documents)
similarities = tfidf[0] * tfidf.T

Create and rank the dataframe by similarity to the "michelin document".

In [168]:
df_similarity = pd.DataFrame({
    "alias": aliases,
    "similarity": similarities.todense().tolist()[0], #Compare each individual restaurant to the overall Michelin reviews.
    "michelin": [a in michelins for a in aliases]
})

In [169]:
df_similarity.sort_values(by="similarity", ascending=False, inplace=True)
df_similarity["rank"] = range(len(df_similarity))

Restaurants most similar to the "michelin document".

In [170]:
df_similarity.head(10)

Unnamed: 0,alias,similarity,michelin,rank
0,michelin_doc,1.0,False,0
1933,chefs-table-at-brooklyn-fare-new-york,0.940698,False,1
9810,the-modern-new-york-3,0.940447,True,2
3572,gabriel-kreuther-new-york,0.932203,True,3
8554,sixty-three-clinton-new-york,0.932165,True,4
7057,one-white-street-new-york,0.932111,True,5
6438,momofuku-ko-new-york-3,0.931969,False,6
1812,cathédrale-restaurant-new-york-city,0.930357,False,7
1809,catch-nyc-new-york,0.926517,False,8
10386,vestry-new-york,0.925894,True,9


Note interesting case above: Chef's Table at Brooklyn Fare Closes, Fires its Star Chef. Aug 22, 2023 — Chef's Table at Brooklyn Fare is reopening this fall with a "new chef," according to its publicist. Might have lost its star, but we see "trickle".

Ranks of Michelin restaurants most similar to the "michelin document".

In [171]:
df_similarity[df_similarity["michelin"] == True].head(10)

Unnamed: 0,alias,similarity,michelin,rank
9810,the-modern-new-york-3,0.940447,True,2
3572,gabriel-kreuther-new-york,0.932203,True,3
8554,sixty-three-clinton-new-york,0.932165,True,4
7057,one-white-street-new-york,0.932111,True,5
10386,vestry-new-york,0.925894,True,9
5424,le-bernardin-new-york,0.922247,True,13
9817,the-musket-room-new-york,0.921416,True,15
5481,le-pavillon-new-york,0.919724,True,16
7942,red-paper-clip-new-york,0.915057,True,22
4584,jean-georges-new-york-2,0.914246,True,25


Add our data to the restaurants dataframe.

In [172]:
df_restaurants = df_restaurants.merge(df_similarity, on="alias")

Show the restaurant with the highest similarity to the "michelin document" by group. 

In [174]:
pd.set_option('display.max_rows', None)
def display_highest_in_group(df, group):
    display(df[["name", "similarity", "rank", "michelin", group]].groupby(group).first())

In [177]:
display_highest_in_group(df_restaurants, "category_0")

Unnamed: 0_level_0,name,similarity,rank,michelin
category_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acai Bowls,Oakberry,0.604319,7718,False
Afghan,Ariana Afghan Kebab Restaurant,0.78191,3672,False
African,Kiflu's Lunch Truck,0.57963,7997,False
American,P.J. Clarke's,0.892413,187,False
Argentine,Porteno Restaurant,0.882526,357,False
Armenian,Columbia Smoke Shop,0.171313,10650,False
Asian Fusion,Wei West,0.829765,1976,False
Australian,Hole In The Wall,0.893199,173,False
Austrian,Schilling Restaurant & Bar,0.845824,1404,False
Bagels,Black Seed Bagels At Hudson Eats,0.697678,6091,False


In [178]:
display_highest_in_group(df_restaurants, "location.zip_code")

Unnamed: 0_level_0,name,similarity,rank,michelin
location.zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,Leaping Frog Cafe,0.378235,9667,False
3276.0,Lochmere Golf Y Country Club Restaurant,0.441336,9250,False
7020.0,Cafe Spice Dosateria,0.518242,8581,False
7047.0,Waterside Restaurant & Catering,0.886613,271,False
7093.0,O2 BBQ,0.843195,1495,False
7102.0,chickpea,0.363597,9775,False
7632.0,Bada Story,0.822165,2220,False
10001.0,Express Halal Cart,0.52327,8533,False
10002.0,New Khai Tri Deli,0.211011,10506,False
10003.0,Bar Fiorentino at Eataly NYC Downtown,0.674517,6592,False
