In [1]:
import pandas as pd
import json
# import cudf as pd

___
### Importing and cleaning data

In [2]:
# Load every Yelp business, then keep only open Tennessee restaurants.
business_df = pd.read_json('./Data/yelp_academic_dataset_business.json', lines=True)

# Build both row filters against the full frame so no column is ever assigned
# on a filtered view (which triggers SettingWithCopyWarning).
is_open_tn = (business_df['state'] == 'TN') & (business_df['is_open'] == 1)
is_restaurant = business_df['categories'].fillna('').str.contains('Restaurants')

# .copy() makes the filtered frame independent, so the column assignments in
# this and later cells write to it unambiguously.
business_df = business_df.loc[is_open_tn & is_restaurant].copy()
business_df['categories'] = business_df['categories'].fillna('')

In [3]:
# Load all check-ins, then keep only those belonging to the selected businesses.
checkin_df = pd.read_json('./Data/yelp_academic_dataset_checkin.json', lines=True)
checkin_in_scope = checkin_df['business_id'].isin(business_df['business_id'])
checkin_df = checkin_df.loc[checkin_in_scope]

In [4]:
# Read the review file in chunks of 100000 rows and keep, from each chunk,
# only the reviews that belong to the selected businesses.
review_reader = pd.read_json('./Data/yelp_academic_dataset_review.json', lines=True, chunksize=100000)
useful_chunks = [
    chunk.loc[chunk['business_id'].isin(business_df['business_id'])]
    for chunk in review_reader
]

# Stitch the filtered chunks together under a fresh 0..n-1 index.
reviews_df = pd.concat(useful_chunks).reset_index(drop=True)

In [5]:
# Load all tips, then keep only those belonging to the selected businesses.
tip_df = pd.read_json('./Data/yelp_academic_dataset_tip.json', lines=True)
tip_in_scope = tip_df['business_id'].isin(business_df['business_id'])
tip_df = tip_df.loc[tip_in_scope]

In [6]:
# Pipe-delimited file of review summaries; integrate_reviews reads its
# 'business_id', 'sentiment' and 'summary' columns.
summary_reviews_df = pd.read_csv(
    './Data/nashville_business_reviews_summary.csv',
    sep='|',
)

def integrate_reviews(row, sentiment):
    """Return the stored review summary for one business and sentiment.

    Looks in the module-level ``summary_reviews_df`` for rows whose
    ``business_id`` equals ``row['business_id']`` and whose ``sentiment``
    equals ``sentiment``, and returns the first matching ``summary`` value.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'business_id' key.
        sentiment: Sentiment label to look up (the callers pass 'negative'
            and 'positive').

    Returns:
        The first matching summary, or the string 'No Reviews' when no row
        matches.

    Raises:
        KeyError: If ``row`` or ``summary_reviews_df`` lacks a required
            key/column. Only the "no match" case is handled; other errors are
            no longer silently turned into 'No Reviews'.
    """
    matches = summary_reviews_df.loc[
        (summary_reviews_df['sentiment'] == sentiment)
        & (summary_reviews_df['business_id'] == row['business_id']),
        'summary',
    ]
    if matches.empty:
        return 'No Reviews'
    return matches.iloc[0]

# Attach one summary column per sentiment; integrate_reviews falls back to
# 'No Reviews' when a business has no summary for that sentiment.
for summary_column, sentiment_label in (
    ('negative_summary', 'negative'),
    ('positive_summary', 'positive'),
):
    business_df[summary_column] = business_df.apply(
        integrate_reviews, axis=1, args=(sentiment_label,)
    )

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Every category label found on the selected businesses, most frequent first.
all_restaurant_types = (
    business_df['categories']
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
    .index
)

# Labels excluded from the feature list.
types_to_remove = ['Restaurants','Event Planning & Services','Caterers','Music Venues','Food Delivery Services','Venues & Event Spaces','Hotels & Travel','Convenience Stores','International Grocery','Performing Arts','Florists','Active Life','Food','Nightlife', 'Arcades', 'Flowers & Gifts','Butcher', 'Jazz & Blues','Party & Event Planning','Dance Clubs', "Arts & Entertainment", "Shopping", "Ethnic Food", "Street Vendors",
    "Karaoke", "Pasta Shops", "Meat Shops", "Pop-Up Restaurants", "Farmers Market","Automotive"]

# Keep the 128 most frequent labels minus the exclusions. Filtering (rather
# than list.remove) does not raise when an excluded label is missing from the
# top 128, and avoids shadowing the builtin `type` with a loop variable.
excluded_types = set(types_to_remove)
valid_types = [
    label for label in all_restaurant_types[:128] if label not in excluded_types
]

In [9]:
# Rows without an attributes dict cannot be parsed below. Rebinding to a copy
# (instead of dropna(inplace=True)) keeps the frame independent of any earlier
# filtered view.
business_df = business_df.dropna(subset=['attributes']).copy()

#extract: Outdoor Seating, Alcohol, RestaurantsPriceRange2
for attribute_name in ('OutdoorSeating', 'Alcohol', 'RestaurantsPriceRange2'):
    business_df[attribute_name] = business_df['attributes'].apply(
        lambda attrs, key=attribute_name: attrs.get(key, None)
    )

# NOTE: every cleanup below assigns the result back to the column.
# `business_df[col].fillna(..., inplace=True)` is chained assignment: it is
# deprecated in pandas 2.x and does not modify the frame under copy-on-write.

#fill outdoor seating with false
business_df['OutdoorSeating'] = (
    business_df['OutdoorSeating']
    .fillna(False)
    .replace({'False': False, 'True': True, 'None': False})
)

#fill alcohol with none, and strip the quoted / u'' prefixed spellings
business_df['Alcohol'] = (
    business_df['Alcohol']
    .fillna('none')
    .replace({
        "u'none'" : 'none',
        "u'full_bar'" : 'full_bar',
        "u'beer_and_wine'" : 'beer_and_wine',
        "'none'" : 'none',
        "'full_bar'" : 'full_bar',
        "'beer_and_wine'" : 'beer_and_wine',
    })
)

#fill price range with 2
# to_numeric(errors='coerce') turns any non-numeric value (for example the
# string 'None') into NaN so it also receives the default, instead of making
# astype(int) raise.
business_df['RestaurantsPriceRange2'] = (
    pd.to_numeric(business_df['RestaurantsPriceRange2'], errors='coerce')
    .fillna(2)
    .astype(int)
)

#fill hours with generic hours dict
# The previous fill value was a *string* that only looked like a dict. A real
# dict per row is used here so the column is not a mix of dicts and strings.
# NOTE(review): assumes present `hours` values are dicts, like `attributes` - confirm.
default_hours = {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}
business_df['hours'] = business_df['hours'].apply(
    lambda hours: hours if isinstance(hours, dict) else dict(default_hours)
)

In [10]:
def encode_top_categories(row, valid_types):
    """Build a 0/1 indicator list for a row's categories.

    Args:
        row: Mapping-like object whose 'categories' entry is an iterable of
            category labels.
        valid_types: Ordered labels to test for.

    Returns:
        A list as long as ``valid_types``, holding 1 where that label is among
        the row's categories and 0 otherwise.
    """
    present = frozenset(row['categories'])
    flags = []
    for candidate in valid_types:
        flags.append(1 if candidate in present else 0)
    return flags


# Turn the comma-separated category string into a list of trimmed labels.
split_categories = business_df['categories'].str.split(',')
business_df['categories'] = split_categories.apply(
    lambda labels: [str(label).strip() for label in labels]
)

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per entry of valid_types, in that order.
mlb = MultiLabelBinarizer(classes=valid_types)

encoded_array = mlb.fit_transform(business_df['categories'])
# Create a DataFrame from the encoded array, aligned to business_df's index
encoded_df = pd.DataFrame(encoded_array, columns=mlb.classes_, index=business_df.index)

# Concatenate the original DataFrame with the new encoded DataFrame.
# Indicator columns left over from an earlier run of this cell are dropped
# first, so re-running does not create duplicate column names.
business_df = pd.concat(
    [business_df.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)



In [12]:
# One-hot encode alcohol, outdoor seating, and price using pandas get_dummies.
# The three source columns are replaced by integer indicator columns.
dummy_source_columns = ['Alcohol', 'OutdoorSeating', 'RestaurantsPriceRange2']
business_df = pd.get_dummies(
    business_df,
    columns=dummy_source_columns,
    dtype=int,
)


In [13]:
#scale the review data
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the star ratings and store the scaled values.
stars_frame = business_df[['stars']]
scaler = MinMaxScaler().fit(stars_frame)
business_df['stars_scaled'] = scaler.transform(stars_frame)

____
### NLP Summarization of the Positive and Negative reviews for each restaurant

In [14]:
# reviews_df.head()

In [15]:
# def encode_review(row):
#     if row['stars'] < 3:
#         return 'negative'
#     elif row['stars'] == 3:
#         return 'neutral'
#     else:
#         return 'positive'

In [16]:
# reviews_df['sentiment'] = reviews_df.apply(encode_review, axis=1)

In [17]:
# import torch
# from torch.nn import DataParallel
# from torch.nn.parallel import DistributedDataParallel as DDP
# from transformers import BartForConditionalGeneration, BartTokenizer

# # Load pre-trained model and tokenizer
# model_name = 'facebook/bart-large-cnn'  # BART model fine-tuned for CNN/DailyMail summarization
# tokenizer = BartTokenizer.from_pretrained(model_name)
# text_model = BartForConditionalGeneration.from_pretrained(model_name)
# # text_model = DataParallel(text_model)

# # Enable GPU usage
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# text_model.to(device)
# # Example function to summarize text using the BART model
# def summarize_text(text):
#     inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
#     inputs = inputs.to(device)
#     summary_ids = text_model.generate(inputs['input_ids'], num_beams=4, max_length=90, early_stopping=True)
#     summary_text = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
#     print(summary_text)
#     return summary_text

# # Group and summarize reviews

# grouped_reviews = reviews_df.loc[reviews_df['sentiment'] != 'neutral'].groupby(['business_id', 'sentiment'])['text'].agg(lambda x: ' '.join(x)).reset_index()

# # Apply the summarization model to the aggregated reviews
# grouped_reviews['summary'] = grouped_reviews['text'].apply(summarize_text)

# grouped_reviews.to_csv('nashville_business_reviews_summary.csv',sep='|', index=False)

In [18]:
# import torch.multiprocessing as mp

# def split_dataframe(df, num_chunks):
#     chunk_size = len(df) // num_chunks + 1  # Ensuring all data is included
#     return [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

# def process_chunk(data_chunk):
#     # Example function that could be run in each subprocess
#     print("Processing", data_chunk.shape[0], "records")
#         # Load tokenizer and model inside the function to ensure it's loaded in the correct process
#     model_name = 'facebook/bart-large-cnn'
#     tokenizer = BartTokenizer.from_pretrained(model_name)
#     text_model = BartForConditionalGeneration.from_pretrained(model_name)
    
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     text_model.to(device)

#     def summarize_text(text):
#         inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)
#         inputs = inputs.to(device)
#         summary_ids = text_model.generate(inputs['input_ids'], num_beams=4, max_length=90, early_stopping=True)
#         summary_text = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
#         return summary_text

#     # Summarize each piece of text in the chunk
#     data_chunk['summary'] = data_chunk['text'].apply(summarize_text)
#     return data_chunk


# def main(rank, chunks):
#     process_chunk(chunks[rank])


# chunks = split_dataframe(reviews_df, torch.cuda.device_count())
# manager = mp.Manager()
# mp.spawn(main, args=(chunks,), nprocs=len(chunks), join=True)


In [19]:
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# strategy = tf.distribute.MirroredStrategy()

# with strategy.scope():
#     model = Sequential([
#         Embedding(input_dim=10000, output_dim=128, input_length=100),
#         LSTM(128),
#         Dense(1, activation='sigmoid')
#     ])
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # # Continue with model training


# test_reviews_df = reviews_df.copy()

# def clean_text(text):
#     text = re.sub(r'\W', ' ', str(text))
#     text = text.lower()
#     text = re.sub(r'\s+[a-z]\s+', ' ', text)
#     text = re.sub(r'^[a-z]\s+', ' ', text)
#     text = re.sub(r'\s+', ' ', text, flags=re.I)
#     return text

    
# test_reviews_df['review_clean'] = test_reviews_df['text'].apply(clean_text)

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# def process_text(text):
#     tokens = nltk.word_tokenize(text)
#     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
#     return ' '.join(tokens)

# test_reviews_df['review_final'] = test_reviews_df['review_clean'].apply(process_text)

# tokenizer = Tokenizer(num_words=10000)
# tokenizer.fit_on_texts(test_reviews_df['review_final'])
# sequences = tokenizer.texts_to_sequences(test_reviews_df['review_final'])

# max_length = max(len(x) for x in sequences)  # Or choose a fixed length
# review_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# # # Building the model
# # model = Sequential()
# # model.add(Embedding(input_dim=10000, output_dim=128, input_length=max_length))
# # model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# # model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

# # # Compile the model
# # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


____
### Sorting Valid Categories into more general categories

In [20]:
#ChatGPT was kind enough to sort my categories

# Each list groups specific Yelp category labels under one general heading.

american_cuisine = [
    "American (Traditional)",
    "American (New)",
    "Burgers",
    "Barbeque",
    "Southern",
    "Steakhouses",
    "Comfort Food",
    "Cajun/Creole",
    "Hot Dogs",
    "New Mexican Cuisine",
]

international_cuisine = [
    "Mexican", "Tex-Mex", "Italian",
    "Chinese", "Japanese", "Sushi Bars",
    "Asian Fusion", "Mediterranean", "Greek",
    "Thai", "Latin American", "Middle Eastern",
    "Indian", "Vietnamese", "French",
    "Korean", "Spanish", "Turkish",
    "Caribbean", "Ramen", "Salvadoran",
    "Poke", "Hawaiian", "Laotian",
    "Halal", "Ethiopian", "African",
]

fast_food_casual = [
    "Fast Food", "Sandwiches", "Pizza",
    "Chicken Wings", "Tacos", "Diners",
    "Food Trucks", "Hot Dogs", "Fish & Chips",
    "Donuts", "Waffles", "Acai Bowls",
    "Wraps", "Cheesesteaks", "Food Court",
]

bars_nightlife = [
    "Bars", "Cocktail Bars", "Sports Bars",
    "Pubs", "Lounges", "Dive Bars",
    "Wine Bars", "Beer Bar", "Tapas/Small Plates",
    "Gastropubs", "Breweries", "Brewpubs",
    "Beer Gardens", "Whiskey Bars", "Hookah Bars",
]

health_specialty_foods = [
    "Salad",
    "Vegetarian",
    "Vegan",
    "Gluten-Free",
    "Juice Bars & Smoothies",
    "Health Markets",
]

beverages = [
    "Coffee & Tea",
    "Specialty Food",
    "Wine & Spirits",
    "Beer",
    "Coffee Roasteries",
    "Bubble Tea",
]

desserts_bakeries = ["Desserts", "Ice Cream & Frozen Yogurt", "Bakeries", "Creperies"]

cultural_local_flavors = ["Local Flavor", "Soul Food"]


In [21]:
# Despite its name this is a dict: general category label -> list of the
# specific Yelp categories grouped under it. Insertion order is preserved.
list_of_cats = dict([
    ('American', american_cuisine),
    ('International', international_cuisine),
    ('Health Food', health_specialty_foods),
    ('Local and Cultural', cultural_local_flavors),
    ('Fast Food', fast_food_casual),
    ('Coffee and Beverages', beverages),
    ('Dessert', desserts_bakeries),
    ('Bars and Nightlife', bars_nightlife),
])

____
### A failed experiment in recommending restaurants

In [22]:
# from sklearn.model_selection import train_test_split


# X_list = business_df.columns[14:-1]
# X = business_df[X_list]
# y = business_df['name']

# X_train, X_test, y_train, y_test = train_test_split(X,y)

In [23]:
# with strategy.scope():
#     # Initialize the model
#     model = Sequential()

#     # Adding layers to the model
#     model.add(Dense(len(X_list), activation='relu', input_dim=X_train.shape[1]))  # Adjust input_dim to match the number of one-hot encoded features
#     model.add(Dropout(0.5))
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(32, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

#     # Compile the model
#     model.compile(optimizer='adam',
#                   loss='categorical',
#                   metrics=['mean_squared_error', 'mean_absolute_error'])


In [24]:
# history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)


In [25]:
# loss, mse, mae = model.evaluate(X_test, y_test)
# print(f"Test MSE: {mse}, Test MAE: {mae}")

In [26]:
# import matplotlib.pyplot as plt

# def plot_history(history):
#     # Plotting training & validation loss values
#     plt.figure(figsize=(12, 6))
#     plt.subplot(1, 2, 1)
#     plt.plot(history.history['loss'])
#     plt.plot(history.history['val_loss'])
#     plt.title('Model Loss')
#     plt.ylabel('Loss')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Test'], loc='upper left')

#     # Plotting training & validation accuracy values
#     plt.subplot(1, 2, 2)
#     plt.plot(history.history['mean_squared_error'])
#     plt.plot(history.history['val_mean_squared_error'])
#     plt.title('Model Mean Squared Error')
#     plt.ylabel('Mean Squared Error')
#     plt.xlabel('Epoch')
#     plt.legend(['Train', 'Test'], loc='upper left')

#     plt.show()

# plot_history(history)


In [27]:
# from tensorflow.keras.callbacks import EarlyStopping

# # Early stopping callback
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# # Include in the fit function
# history_e_s = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

# plot_history(history_e_s)

In [28]:
# loss, mse, mae = model.evaluate(X_test, y_test)
# print(f"Test MSE: {mse}, Test MAE: {mae}")

In [29]:
# user_dict = {}
# for item in X_list:
#     user_dict[item] = 0

___
### Making a UI

* From the Airbnb selection, the function receives a latitude/longitude pair, which is used to calculate the distance to every restaurant.

The user will select:

* distance they're willing to travel (slider)
* price preferences
* food category or specific food
* indoor/outdoor
* do they want drinks

In [33]:
from geopy import distance

In [32]:
import gradio as gr

In [34]:
def restaurant_distances(row,lat,long):
    """Return the distance in miles between (lat, long) and a restaurant row.

    Args:
        row: Mapping-like object with 'latitude' and 'longitude' entries.
        lat: Latitude of the reference point.
        long: Longitude of the reference point.

    Returns:
        The ``.miles`` value of geopy's ``distance.distance`` between the two points.
    """
    origin = (lat, long)
    destination = (row['latitude'], row['longitude'])
    return distance.distance(origin, destination).miles

In [35]:
def populate_range_df(airbnb_lat, airbnb_long):
    """Write each restaurant's distance from the given point into business_df.

    Mutates the module-level ``business_df`` by setting its 'airbnb_range'
    column to the result of ``restaurant_distances`` for every row. Returns
    nothing.

    Args:
        airbnb_lat: Latitude of the reference point.
        airbnb_long: Longitude of the reference point.
    """
    miles_to_airbnb = business_df.apply(
        restaurant_distances,
        axis=1,
        args=(airbnb_lat, airbnb_long),
    )
    business_df['airbnb_range'] = miles_to_airbnb

In [61]:
%%time
# Timed run with hard-coded (latitude, longitude) arguments; fills business_df['airbnb_range'].
populate_range_df(36.269593,-87.058943)

CPU times: user 323 ms, sys: 10.9 ms, total: 333 ms
Wall time: 332 ms


In [None]:
import gradio as gr

def closest_match(x):
    """Placeholder matcher: return ``x`` with a fixed suffix appended."""
    suffix = ": The Definitive Edition"
    return x + suffix

def Dropdown_list(x):
    """Return a Dropdown update that extends the placeholder options.

    The new choices are the module-level ``options`` followed by four variants
    of ``x`` with fixed suffixes.

    Args:
        x: Text taken from the search box.

    Returns:
        A ``gr.Dropdown`` carrying the new ``choices``. This is the same update
        style ``update_options`` uses elsewhere in this notebook;
        ``gr.Dropdown.update`` is not available in Gradio 4.
    """
    variant_suffixes = (
        " Remastered",
        ": The Remake",
        ": Game of the Year Edition",
        " Steelbook Edition",
    )
    new_options = [*options, *(x + suffix for suffix in variant_suffixes)]
    return gr.Dropdown(choices=new_options)


def Recommend_new(x):
  """Placeholder recommender: return ``x`` with a fixed suffix appended."""
  suffix = ": Highest Cosine Similarity"
  return x + suffix

# Placeholder choices shown in the dropdown; Dropdown_list also reads this list.
options = ['Placeholder A', 'Placeholder B', 'Placeholder C']

# Prototype layout: search box -> dropdown of options -> recommendation box,
# each with a button wired to one of the placeholder callbacks above.
with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Search bar")
    b1 = gr.Button("Match Closest Title")

    text_options = gr.Dropdown(options, label="Top 5 options")
    b2 = gr.Button("Provide Additional options")

    new_title = gr.Textbox(label="Here you go!")
    b3 = gr.Button("Recommend a new title")

    # (button, callback, input component, output component)
    for button, callback, source, target in (
        (b1, closest_match, text_input, text_options),
        (b2, Dropdown_list, text_input, text_options),
        (b3, Recommend_new, text_options, new_title),
    ):
        button.click(callback, inputs=source, outputs=target)
    # text_options.update(interactive=True)


demo.launch(debug=True)

In [100]:
def show_options(choice, option):
    """Describe the user's selection as a short message.

    Args:
        choice: Selection mode, "Specific Food" or "Category".
        option: The value picked for that mode.

    Returns:
        A message naming the chosen food or category, or None when ``choice``
        is neither of the two modes.
    """
    if choice == "Specific Food":
        message = f"You chose the specific food: {option}"
    elif choice == "Category":
        message = f"You are viewing the category: {option}"
    else:
        message = None
    return message
    
def update_options(choice):
    """Return a Dropdown update whose choices match the selected mode.

    Args:
        choice: Selection mode, "Specific Food" or "Category".

    Returns:
        A ``gr.Dropdown`` whose choices are ``valid_types`` for
        "Specific Food", the keys of ``list_of_cats`` for "Category", and an
        empty list for anything else.
    """
    if choice == "Specific Food":
        return gr.Dropdown(choices=valid_types)
    if choice == "Category":
        return gr.Dropdown(choices=list(list_of_cats.keys()))
    # No choice made: clear the dropdown's choices. Returning a bare [] would
    # be taken as the dropdown's *value*, not as an empty list of choices.
    return gr.Dropdown(choices=[])

def query_user(cuisine, category, price, in_out, drinks, distance):
    """Print the price, seating, drinks and distance selections.

    ``cuisine`` and ``category`` are accepted but not used. Returns None.
    """
    report = f"""Price: {price}
            In/Out: {in_out}
            drinks: {drinks}
            distance: {distance}
            """
    print(report)

# Main selection UI: pick a food/category mode, price range, seating, alcohol
# level and maximum distance. Only the mode and option are wired to callbacks.
with gr.Blocks(theme='default') as demo:
    with gr.Row():
        choice = gr.Radio(["Specific Food", "Category"], label="What would you like to do?")
        option = gr.Dropdown(['Select Specific Food or Category'],label="Choose an option", value='Select Specific Food or Category', scale=2)
    with gr.Row():
        price_range = gr.CheckboxGroup(['$','$$','$$$','$$$$'], label="Price Range", info="What Price Ranges are you feeling?")
        indoor_outdoor = gr.Radio(['Indoor Seating', 'Outdoor Seating'], label='Indoor or Outdoor seating?')
    with gr.Row():
        drinks = gr.Radio(["Doesn't Matter","Beer and Wine","Full Bar","None"], label="Alcohol Available?", info="Select what level of alcohol availability, Full Bar includes Beer and Wine")
        # Named distance_slider, not `distance`: a module-level `distance`
        # here would overwrite the geopy `distance` module that
        # restaurant_distances calls, breaking it once this cell has run.
        distance_slider = gr.Slider(value=1, minimum=0.1, maximum=20, label='Max distance from your AirBNB', interactive=True)

    submit_btn = gr.Button("Submit")
    output = gr.Label()

    # Changing the mode repopulates the dropdown; Submit shows the selection.
    choice.input(update_options, inputs=choice, outputs=option)
    submit_btn.click(fn=show_options, inputs=[choice, option], outputs=output)

demo.launch()

Running on local URL:  http://127.0.0.1:7896

To create a public link, set `share=True` in `launch()`.




In [43]:
# NOTE(review): create_user_profile is not defined in the visible part of this
# notebook - confirm it exists before running this cell on a fresh kernel.
input_interface = gr.Interface(
    fn=create_user_profile,
    inputs=[
        # category - Maybe do a select kinda thing
        # list_of_cats is a dict; pass its keys as an explicit list of
        # choices, as update_options does.
        gr.CheckboxGroup(list(list_of_cats.keys()), label="General Categories", info="What general category of food would you like?"),
        # price - need to change to checkboxes
        gr.CheckboxGroup(['$','$$','$$$','$$$$'], label="Price Range", info="What Price Ranges are you feeling?"),
        'slider',
        'slider',
        'slider',
        'slider',
    ],
    outputs="number",
)


In [44]:
# Start the local Gradio server for the Interface built in the previous cell.
input_interface.launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


