In [54]:
import time
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import emoji
import Preprocessing as preproc
import nltk
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [69]:
restaurants_raw = pd.read_csv(r"data_hyderabad/105_restaurants.csv")
reviews_raw = pd.read_csv(r"data_hyderabad/10k_reviews.csv")

restaurants_raw.head(5)

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"
1,Paradise,https://www.zomato.com/hyderabad/paradise-gach...,800,Hyderabad's Hottest,"Biryani, North Indian, Chinese",11 AM to 11 PM
2,Flechazo,https://www.zomato.com/hyderabad/flechazo-gach...,1300,"Great Buffets, Hyderabad's Hottest","Asian, Mediterranean, North Indian, Desserts","11:30 AM to 4:30 PM, 6:30 PM to 11 PM"
3,Shah Ghouse Hotel & Restaurant,https://www.zomato.com/hyderabad/shah-ghouse-h...,800,Late Night Restaurants,"Biryani, North Indian, Chinese, Seafood, Bever...",12 Noon to 2 AM
4,Over The Moon Brew Company,https://www.zomato.com/hyderabad/over-the-moon...,1200,"Best Bars & Pubs, Food Hygiene Rated Restauran...","Asian, Continental, North Indian, Chinese, Med...","12noon to 11pm (Mon, Tue, Wed, Thu, Sun), 12no..."


# Review Preprocessing

In [71]:
len(reviews_raw['Restaurant'].unique())
missing_restaurants = set(restaurants_raw['Name']) - set(reviews_raw['Restaurant'])
missing_restaurants

{'Angaara Counts 3',
 'IndiBlaze',
 'Republic Of Noodles - Lemon Tree Hotel',
 'Sweet Basket',
 'Wich Please'}

In [56]:
reviews_data = reviews_raw[reviews_raw["Rating"].notna() & reviews_raw["Review"].notna()]
reviews_data.isna().sum()

Restaurant    0
Reviewer      0
Review        0
Rating        0
Metadata      0
Time          0
Pictures      0
dtype: int64

In [57]:
# split UPPERCASE WORDS 
def splitting_words_process(word):
    # only upper case letters
    if word.isupper():
        return word
    
    # more than one upper case letter inside
    elif re.search(r'[A-Z][a-z]*[A-Z]', word):
        split_word = re.findall(r'[A-Z][a-z]*', word)
        return ' '.join(split_word)
    
    # <2 upper case letters
    else:
        return word

reviews_data['Review'] = reviews_data['Review'].apply(lambda x: ' '.join([splitting_words_process(word) for word in x.split()]))

In [58]:
# Function to replace 'gud', 'goo', 'gd' with the appropriate 'good'
def replace_gud_with_good(text):
    if isinstance(text, str):
        # Define the regex pattern to match 'gud', 'goo', 'gd' in various capitalizations
        pattern = re.compile(r'\b([Gg][Uu][Dd]|[Gg][Oo][Oo]|[Gg][Dd])\b')

        # Replacement function to check the case of the first letter
        def replacement(match):
            word = match.group()
            # Check if the first letter is uppercase, then return 'Good', else 'good'
            if word[0].isupper():
                return 'Good'
            else:
                return 'good'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'good'
reviews_data['Review'] = reviews_data['Review'].apply(replace_gud_with_good)

In [59]:
# Function to replace 'kk', 'Oke', 'k', 'Ok' with 'ok'
def replace_to_ok(text):
    if isinstance(text, str):
        # Define the regex pattern to match the variants of 'ok'
        pattern = re.compile(r'\b(k|kk|Ok|Oke)\b', re.IGNORECASE)

        # Replacement function to return 'ok' for all matched words
        def replacement(match):
            return 'ok'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'ok'
reviews_data['Review'] = reviews_data['Review'].apply(replace_to_ok)

In [60]:
# add space after ! | " | # | $ | % | & | ( | ) | * | + | , | . | : | ; followed immediately by a word
def add_space_after_punctuation(df):

    df['Review'] = df['Review'].apply(lambda text: re.sub(r'([\u0021-\u0026\u0028-\u002C\u002E\u003A-\u003F]+(?=\w))', r'\1 ', text) if isinstance(text, str) else text)
    return df

# Example usage:
reviews_data = add_space_after_punctuation(reviews_data)

In [61]:
# remove gibberish words like "ggggggggggd", "eshjdgue"
def remove_gibberish(text):
    cleaned_text = re.sub(r'\b\w{15,}\b', '', text)  # removes 15+ words
    cleaned_text = re.sub(r'\b\w*(\w)\1{2,}\w*\b', '', cleaned_text)  # removes words that contain 3+ repeating letters

    return cleaned_text

reviews_data['Review'] = reviews_data['Review'].apply(remove_gibberish)

In [62]:
# replace numbers with blank string
reviews_data['Review'] = reviews_data['Review'].replace(r'\d+(\.\d+)?', '', regex=True)

In [None]:
preproc = reviews_data['Review'].apply(lambda x: preproc.main_pipeline(
    x, 
    print_output=False, 
    no_stopwords=False,
    custom_stopwords=[],
    convert_diacritics=True, 
    no_punctuation=False,
    remove_contractions = True,
    lowercase=False,
    lemmatized=False,
    list_pos=["n","v","a","r","s"],
    stemmed=False, 
    pos_tags_list='pos_tuples',
    tokenized_output=False
))

In [64]:
# remove stopwords
def remove_stopwords(tokens):
    punctuation_pattern = "[\u0021-\u0026\u0028-\u002C\u002E-\u002F\u003A-\u003F\u005B-\u005F\u2010-\u2028\ufeff`]+"
    stopwords = set(nltk.corpus.stopwords.words("english"))

    return [(token.lower(), pos) for token, pos in tokens if token.lower() not in stopwords and not re.match(punctuation_pattern, token)]

reviews_data['Review_Preprocessed'] = preproc.apply(remove_stopwords)

In [65]:
data_joined = pd.merge(reviews_data, restaurants_raw, left_on='Restaurant',right_on='Name', how='left')
data_joined.head(1)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,Review_Preprocessed,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,"[(ambience, NN), (good, JJ), (food, NN), (quit...",Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"


In [66]:
len(data_joined['Restaurant'].unique())

100

In [15]:
data_joined = data_joined[['Restaurant', 'Review', 'Review_Preprocessed', 'Cuisines']]
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I..."
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I..."
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I..."


In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer_roberta = AutoTokenizer.from_pretrained("Dizex/InstaFoodRoBERTa-NER")
model_roberta = AutoModelForTokenClassification.from_pretrained("Dizex/InstaFoodRoBERTa-NER")

In [17]:
pipe = pipeline("ner", model=model_roberta, tokenizer=tokenizer_roberta)

In [18]:
from tqdm import tqdm

ner_entity_results = pipe(list(data_joined['Review']), aggregation_strategy="simple")

KeyboardInterrupt: 

In [None]:
def convert_entities_to_list(df: pd.DataFrame, entities: list[list[dict]]) -> pd.DataFrame:
    def extract_entities(text, entity_list):
        ents = []
        for ent in entity_list:
            e = {"start": ent["start"], "end": ent["end"], "label": ent["entity_group"]}
            if ents and (-1 <= ent["start"] - ents[-1]["end"] <= 1) and (ents[-1]["label"] == e["label"]):
                ents[-1]["end"] = e["end"]
                continue
            ents.append(e)
        return [text[e["start"]:e["end"]] for e in ents]

    df['meals'] = [extract_entities(text, entity_list) for text, entity_list in zip(df['Review'], entities)]
    return df

In [None]:
data_joined = convert_entities_to_list(data_joined, ner_entity_results)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I...",[]
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I...",[]
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I...",[Penne Alfredo Pasta]
3,Beyond Flavours,Soumen das and Arun was a great guy. Only beca...,"[(soumen, NNP), (das, NNS), (arun, NNP), (grea...","Chinese, Continental, Kebab, European, South I...",[]
4,Beyond Flavours,Food is good. we ordered Kodi drumsticks and b...,"[(food, NN), (good, JJ), (ordered, VBD), (kodi...","Chinese, Continental, Kebab, European, South I...","[Kodi drumsticks, basket mutton biryani]"
...,...,...,...,...,...
95,Beyond Flavours,Was there for office lunch outing. Rating woul...,"[(office, NN), (lunch, NN), (outing, VBG), (ra...","Chinese, Continental, Kebab, European, South I...",[]
96,Beyond Flavours,"I really enjoyed the follows.... The entrance,...","[(really, RB), (enjoyed, VBD), (entrance, NN),...","Chinese, Continental, Kebab, European, South I...",[]
97,Beyond Flavours,I came first time in this restaurant. The entr...,"[(came, VBD), (first, JJ), (time, NN), (restau...","Chinese, Continental, Kebab, European, South I...",[]
98,Beyond Flavours,Pathetic and horrible experience Ambience and ...,"[(pathetic, JJ), (horrible, JJ), (experience, ...","Chinese, Continental, Kebab, European, South I...",[]


# Find out all the possible labels

In [145]:
from sklearn.preprocessing import MultiLabelBinarizer

def split_list_into_onehot_labels(dataframe, column_name):
    """
    Splits a list of comma-separated values in a specified column of a DataFrame into one-hot encoded labels.
    Args:
        dataframe (pd.DataFrame): The input DataFrame containing the data.
        column_name (str): The name of the column containing comma-separated values to be one-hot encoded.
    Returns:
        pd.DataFrame: The DataFrame with the original column split into a single column containing one-hot encoded vectors.

    """
    mlb = MultiLabelBinarizer()
    dataframe[column_name] = dataframe[column_name].apply(lambda x: x.split(", "))

    cuisine_encoded = mlb.fit_transform(dataframe[column_name])

    dataframe['Cuisine_Vector'] = list(cuisine_encoded)
    return dataframe

In [None]:
import pandas as pd
data_joined = pd.read_csv(r"data_hyderabad/data_preprocessed_classification.pkl")

#data_joined['meals'] = data_joined['meals'].apply(lambda x: [meal.lower() for meal in eval(x)])


data_joined = split_list_into_onehot_labels(data_joined, 'Cuisines')
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals,Cuisine_Vector
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[('ambience', 'NN'), ('good', 'JJ'), ('food', ...","[Chinese, Continental, Kebab, European, South ...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[('ambience', 'NN'), ('good', 'JJ'), ('pleasan...","[Chinese, Continental, Kebab, European, South ...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[('must', 'MD'), ('try', 'VB'), ('great', 'JJ'...","[Chinese, Continental, Kebab, European, South ...",['Penne Alfredo Pasta'],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."


In [183]:
# Filter out rows where 'meals' column is empty
data_joined = data_joined[data_joined['meals'].apply(lambda x: x != "[]")]

# Convert 'Review_Preprocessed' from string to list of tuples
data_joined['Review_Preprocessed'] = data_joined['Review_Preprocessed'].apply(eval)

# Extract tokens from 'Review_Preprocessed' and store in a new column
data_joined['Review_Preprocessed_No_Pos'] = data_joined['Review_Preprocessed'].apply(lambda x: [token for token, pos in x])

# Reorder columns to place 'Review_Preprocessed_No_Pos' after 'Review_Preprocessed'
cols = list(data_joined.columns)
review_index = cols.index('Review_Preprocessed')
cols.insert(review_index + 1, cols.pop(cols.index('Review_Preprocessed_No_Pos')))
data_joined = data_joined[cols]

# Create a copy of the dataframe for further processing
data_preprocessed_many_rows = data_joined.copy()

# Group by 'Restaurant' and aggregate reviews and meals
data_preprocessed_groupedby_restaurant = data_preprocessed_many_rows.groupby('Restaurant').agg({
    'Review': lambda x: ', '.join(x),
    'Review_Preprocessed_No_Pos': lambda x: ', '.join([', '.join(tokens) for tokens in x]),
    'Review_Preprocessed': lambda x: ', '.join([', '.join([f"({token}, {pos})" for token, pos in tokens]) for tokens in x]),
    'meals': lambda x: ', '.join([meal for sublist in x for meal in eval(sublist)])
}).reset_index().rename(columns={"Review_Preprocessed": "Review_Preprocessed_Pos"})

data_preprocessed_groupedby_restaurant

Unnamed: 0,Restaurant,Review,Review_Preprocessed_No_Pos,Review_Preprocessed_Pos,meals
0,10 Downing Street,I've been to this place about two times and i ...,"'ve, place, two, times, really, liked, ambienc...","('ve, VBP), (place, NN), (two, CD), (times, NN...","lasagna, veg Platter, lasagna rolls, beers, ve..."
1,13 Dhaba,I didn't go and eat at the Dhaba. I had ordere...,"go, eat, dhaba, ordered, taste, amazing, te, i...","(go, VB), (eat, VB), (dhaba, NNP), (ordered, V...","lassi, Chole bhature, Lassi, chole bhature pan..."
2,"3B's - Buddies, Bar & Barbecue",Gobind Passionate in serving Polite in nature ...,"gobind, passionate, serving, polite, nature, s...","(gobind, NNP), (passionate, NNP), (serving, VB...","Polite, Pan ice cream, pan ice cream, pan ice ..."
3,AB's - Absolute Barbecues,Excellent service by nandan and rahmat and rip...,"excellent, service, nandan, rahmat, ripan, fee...","(excellent, JJ), (service, NN), (nandan, NN), ...","ripan, politley sarvice, fish, pankaj, cake, b..."
4,Absolute Sizzlers,Service was pathetic. Ordered a sizzler with l...,"service, pathetic, ordered, sizzler, lamb, tol...","(service, NNP), (pathetic, JJ), (ordered, VBD)...","ler, lamb, lamb, Noodles, rice, noodle, chilli..."
...,...,...,...,...,...
95,Urban Asia - Kitchen & Bar,This place is highly recommended. It is workin...,"place, highly, recommended, working, eat, indi...","(place, NN), (highly, RB), (recommended, JJ), ...","noodles, Sanghai Fried Rice, Fish, sauce, nood..."
96,Yum Yum Tree - The Arabian Food Court,It is at th floor of Act Boutique building tha...,"th, floor, act, boutique, building, entrance, ...","(th, JJ), (floor, NN), (act, NNP), (boutique, ...","mutton Haleem, Chicken Fahm Mandi, chicken hal..."
97,Zega - Sheraton Hyderabad Hotel,"My husband and I, visited Zega for their dimsu...","husband, visited, zega, dimsum, festival, disa...","(husband, NN), (visited, VBD), (zega, NNP), (d...","thukpa, spice, dimsums, chicken Gyoza, dimsums..."
98,Zing's Northeast Kitchen,After so many of goody goody excellent reviews...,"many, goody, goody, excellent, reviews, n, exc...","(many, JJ), (goody, NN), (goody, NN), (excelle...","chalega, Pork, beef, meat, meat, veg momo, veg..."


In [184]:
# Merge 'Cuisine_Vector' from 'data_joined' into 'data_preprocessed_groupedby_restaurant'
data_preprocessed_groupedby_restaurant = data_preprocessed_groupedby_restaurant.merge(
    data_joined[['Restaurant', 'Cuisine_Vector']],
    on='Restaurant',
    how='left'
)

# Display the updated DataFrame
data_preprocessed_groupedby_restaurant.drop_duplicates(subset=['Restaurant'], inplace=True)
data_preprocessed_groupedby_restaurant.reset_index(drop=True)

Unnamed: 0,Restaurant,Review,Review_Preprocessed_No_Pos,Review_Preprocessed_Pos,meals,Cuisine_Vector
0,10 Downing Street,I've been to this place about two times and i ...,"'ve, place, two, times, really, liked, ambienc...","('ve, VBP), (place, NN), (two, CD), (times, NN...","lasagna, veg Platter, lasagna rolls, beers, ve...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,13 Dhaba,I didn't go and eat at the Dhaba. I had ordere...,"go, eat, dhaba, ordered, taste, amazing, te, i...","(go, VB), (eat, VB), (dhaba, NNP), (ordered, V...","lassi, Chole bhature, Lassi, chole bhature pan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"3B's - Buddies, Bar & Barbecue",Gobind Passionate in serving Polite in nature ...,"gobind, passionate, serving, polite, nature, s...","(gobind, NNP), (passionate, NNP), (serving, VB...","Polite, Pan ice cream, pan ice cream, pan ice ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
3,AB's - Absolute Barbecues,Excellent service by nandan and rahmat and rip...,"excellent, service, nandan, rahmat, ripan, fee...","(excellent, JJ), (service, NN), (nandan, NN), ...","ripan, politley sarvice, fish, pankaj, cake, b...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,Absolute Sizzlers,Service was pathetic. Ordered a sizzler with l...,"service, pathetic, ordered, sizzler, lamb, tol...","(service, NNP), (pathetic, JJ), (ordered, VBD)...","ler, lamb, lamb, Noodles, rice, noodle, chilli...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...
95,Urban Asia - Kitchen & Bar,This place is highly recommended. It is workin...,"place, highly, recommended, working, eat, indi...","(place, NN), (highly, RB), (recommended, JJ), ...","noodles, Sanghai Fried Rice, Fish, sauce, nood...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
96,Yum Yum Tree - The Arabian Food Court,It is at th floor of Act Boutique building tha...,"th, floor, act, boutique, building, entrance, ...","(th, JJ), (floor, NN), (act, NNP), (boutique, ...","mutton Haleem, Chicken Fahm Mandi, chicken hal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
97,Zega - Sheraton Hyderabad Hotel,"My husband and I, visited Zega for their dimsu...","husband, visited, zega, dimsum, festival, disa...","(husband, NN), (visited, VBD), (zega, NNP), (d...","thukpa, spice, dimsums, chicken Gyoza, dimsums...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
98,Zing's Northeast Kitchen,After so many of goody goody excellent reviews...,"many, goody, goody, excellent, reviews, n, exc...","(many, JJ), (goody, NN), (goody, NN), (excelle...","chalega, Pork, beef, meat, meat, veg momo, veg...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
data_preprocessed_groupedby_restaurant['meals'] = data_preprocessed_groupedby_restaurant['meals'].apply(lambda x: x.split(", "))
data_preproc_grouped = data_preprocessed_groupedby_restaurant.copy()

data_joined['meals'] = data_joined['meals'].apply(lambda x: [meal.lower() for meal in eval(x)])
data_preproc_review_per_row = data_joined.copy()

In [250]:
data_preproc_grouped.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed_No_Pos,Review_Preprocessed_Pos,meals,Cuisine_Vector
0,10 Downing Street,I've been to this place about two times and i ...,"'ve, place, two, times, really, liked, ambienc...","('ve, VBP), (place, NN), (two, CD), (times, NN...","[lasagna,, veg, Platter,, lasagna, rolls,, bee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
61,13 Dhaba,I didn't go and eat at the Dhaba. I had ordere...,"go, eat, dhaba, ordered, taste, amazing, te, i...","(go, VB), (eat, VB), (dhaba, NNP), (ordered, V...","[lassi,, Chole, bhature,, Lassi,, chole, bhatu...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
132,"3B's - Buddies, Bar & Barbecue",Gobind Passionate in serving Polite in nature ...,"gobind, passionate, serving, polite, nature, s...","(gobind, NNP), (passionate, NNP), (serving, VB...","[Polite,, Pan, ice, cream,, pan, ice, cream,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."


In [249]:
data_preproc_review_per_row.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Review_Preprocessed_No_Pos,Cuisines,meals,Cuisine_Vector,Word2Vec_Vector
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","[must, try, great, food, great, ambience, thnx...","[Chinese, Continental, Kebab, European, South ...",[penne alfredo pasta],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.10267433, 0.1892341, 0.15835893, -0.5145784..."
4,Beyond Flavours,Food is good. we ordered Kodi drumsticks and b...,"[(food, NN), (good, JJ), (ordered, VBD), (kodi...","[food, good, ordered, kodi, drumsticks, basket...","[Chinese, Continental, Kebab, European, South ...","[kodi drumsticks, basket mutton biryani]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.03495726, 0.06261446, 0.059172727, -0.38602..."
7,Beyond Flavours,Well after reading so many reviews finally vis...,"[(well, RB), (reading, VBG), (many, JJ), (revi...","[well, reading, many, reviews, finally, visite...","[Chinese, Continental, Kebab, European, South ...","[corn, tawa fish, basket biryani, biryani]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.009220402, 0.05624652, 0.11652868, -0.53158..."


# **Futher steps**

## Final preprocessing steps:
1. Create two datasets: 
2. 1st with 9k+ rows, so every review has a label vector
3. 2nd with 105 rows, so every restaurant has a joint review vector of food names


## Modeling
I see two possible approaches to solve this problem. 

1. We treat the dishes as categorical instances and build a model that takes counts
It will be a categorical to categorical model so Decision Tree or Naive Bayes should do

2. Encoding the words with (no tf-idf not appropriate for this appraoch coz no documents), Word2Vec and Glove and categorizing the labels based on those. Here we can think about adding/multiplying the vectors to get the overall or dot product and using this numerical representation for classification

# Word2Vec

In [244]:
from gensim.models import Word2Vec

# Train the Word2Vec model on the 'meals' column
reviews_skipgram_model = Word2Vec(
    sentences=data_preproc_review_per_row["Review_Preprocessed_No_Pos"].tolist(),
    vector_size=25,
    window=5,
    min_count=1,
    workers=4,
    sg=1  # Skip-gram model
)

#Generate vectors for each review
def get_review_vector(review):
    words = review
    vector = sum(reviews_skipgram_model.wv[word] for word in words if word in reviews_skipgram_model.wv)
    return vector / len(words) if words else [0] * 25

# Apply the function to each review and create a new column
data_preproc_review_per_row['Word2Vec_Vector'] = data_preproc_review_per_row['Review_Preprocessed_No_Pos'].apply(get_review_vector)

# Display the DataFrame with the new column
data_preproc_review_per_row.head()

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Review_Preprocessed_No_Pos,Cuisines,meals,Cuisine_Vector,Word2Vec_Vector
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","[must, try, great, food, great, ambience, thnx...","[Chinese, Continental, Kebab, European, South ...",[penne alfredo pasta],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.10267433, 0.1892341, 0.15835893, -0.5145784..."
4,Beyond Flavours,Food is good. we ordered Kodi drumsticks and b...,"[(food, NN), (good, JJ), (ordered, VBD), (kodi...","[food, good, ordered, kodi, drumsticks, basket...","[Chinese, Continental, Kebab, European, South ...","[kodi drumsticks, basket mutton biryani]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.03495726, 0.06261446, 0.059172727, -0.38602..."
7,Beyond Flavours,Well after reading so many reviews finally vis...,"[(well, RB), (reading, VBG), (many, JJ), (revi...","[well, reading, many, reviews, finally, visite...","[Chinese, Continental, Kebab, European, South ...","[corn, tawa fish, basket biryani, biryani]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.009220402, 0.05624652, 0.11652868, -0.53158..."
9,Beyond Flavours,Came for the birthday treat of a close friend....,"[(came, NN), (birthday, JJ), (treat, NN), (clo...","[came, birthday, treat, close, friend, perfect...","[Chinese, Continental, Kebab, European, South ...",[chili honey lotus stem],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.015329912, 0.08770925, 0.11428657, -0.49731..."
12,Beyond Flavours,Food was very good. Soup was as expected. In s...,"[(food, NN), (good, JJ), (soup, NNP), (expecte...","[food, good, soup, expected, starters, ordered...","[Chinese, Continental, Kebab, European, South ...","[soup, honey chilli lotus]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...","[0.08414163, 0.08062348, 0.10732855, -0.596129..."


# Doc2Vec

In [253]:
# Lets try Doc2Vec using the aggregated reviews

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Prepare the data for Doc2Vec
tagged_data = [TaggedDocument(words=review, tags=[str(i)]) for i, review in enumerate(data_preproc_grouped["Review_Preprocessed_No_Pos"].tolist())]

# Train the Doc2Vec model on the 'Review_Preprocessed_No_Pos' column
doc2vec_model = Doc2Vec(
    documents=tagged_data,
    vector_size=25,
    window=5,
    min_count=1,
    workers=4,
    epochs=100
)

# Generate vectors for each review
def get_review_vector(doc_id):
    return doc2vec_model.dv[doc_id]

# Apply the function to each review and create a new column
data_preproc_grouped['Doc2Vec_Vector'] = [get_review_vector(str(i)) for i in range(len(data_preproc_grouped))]

# Display the DataFrame with the new column
data_preproc_grouped

Unnamed: 0,Restaurant,Review,Review_Preprocessed_No_Pos,Review_Preprocessed_Pos,meals,Cuisine_Vector,Doc2Vec_Vector
0,10 Downing Street,I've been to this place about two times and i ...,"'ve, place, two, times, really, liked, ambienc...","('ve, VBP), (place, NN), (two, CD), (times, NN...","[lasagna,, veg, Platter,, lasagna, rolls,, bee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[-0.73028654, -0.94981015, 0.8981075, -0.95641..."
61,13 Dhaba,I didn't go and eat at the Dhaba. I had ordere...,"go, eat, dhaba, ordered, taste, amazing, te, i...","(go, VB), (eat, VB), (dhaba, NNP), (ordered, V...","[lassi,, Chole, bhature,, Lassi,, chole, bhatu...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0386596, 1.25421, 1.8065908, -0.5012805, 0...."
132,"3B's - Buddies, Bar & Barbecue",Gobind Passionate in serving Polite in nature ...,"gobind, passionate, serving, polite, nature, s...","(gobind, NNP), (passionate, NNP), (serving, VB...","[Polite,, Pan, ice, cream,, pan, ice, cream,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[-0.28862014, -1.186906, -0.28161365, -1.70050..."
159,AB's - Absolute Barbecues,Excellent service by nandan and rahmat and rip...,"excellent, service, nandan, rahmat, ripan, fee...","(excellent, JJ), (service, NN), (nandan, NN), ...","[ripan,, politley, sarvice,, fish,, pankaj,, c...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0.8380315, -1.8014015, 0.018671855, -2.029692..."
193,Absolute Sizzlers,Service was pathetic. Ordered a sizzler with l...,"service, pathetic, ordered, sizzler, lamb, tol...","(service, NNP), (pathetic, JJ), (ordered, VBD)...","[ler,, lamb,, lamb,, Noodles,, rice,, noodle,,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0.3182445, 0.8543685, -0.23095621, -0.8876323..."
...,...,...,...,...,...,...,...
5561,Urban Asia - Kitchen & Bar,This place is highly recommended. It is workin...,"place, highly, recommended, working, eat, indi...","(place, NN), (highly, RB), (recommended, JJ), ...","[noodles,, Sanghai, Fried, Rice,, Fish,, sauce...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.59476155, -1.0905102, -0.45617875, -0.41898..."
5627,Yum Yum Tree - The Arabian Food Court,It is at th floor of Act Boutique building tha...,"th, floor, act, boutique, building, entrance, ...","(th, JJ), (floor, NN), (act, NNP), (boutique, ...","[mutton, Haleem,, Chicken, Fahm, Mandi,, chick...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.75107616, -1.4869246, -0.94342816, -0.03576..."
5700,Zega - Sheraton Hyderabad Hotel,"My husband and I, visited Zega for their dimsu...","husband, visited, zega, dimsum, festival, disa...","(husband, NN), (visited, VBD), (zega, NNP), (d...","[thukpa,, spice,, dimsums,, chicken, Gyoza,, d...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.26957774, -0.03986285, -0.6722499, -0.47506..."
5753,Zing's Northeast Kitchen,After so many of goody goody excellent reviews...,"many, goody, goody, excellent, reviews, n, exc...","(many, JJ), (goody, NN), (goody, NN), (excelle...","[chalega,, Pork,, beef,, meat,, meat,, veg, mo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.06245933, -0.1281365, -0.4542947, 0.640352..."


In [286]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm

def cooccurrence_matrix_sentence_generator(preproc_sentences):
    # Flatten the list of sentences to get all unique words
    unique_words = list(set(word for sentence in preproc_sentences for word in sentence))
    word_index = {word: idx for idx, word in enumerate(unique_words)}
    vocab_size = len(unique_words)

    # Initialize the co-occurrence matrix
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=int)

    # Compute co-occurrences
    for sentence in tqdm(preproc_sentences):
        word_indices = [word_index[word] for word in sentence if word in word_index]
        for i in range(len(word_indices)):
            co_matrix[word_indices[i], word_indices[i:]] += 1
            co_matrix[word_indices[i:], word_indices[i]] += 1

    # Create a DataFrame for better readability
    co_matrix_df = pd.DataFrame(co_matrix, index=unique_words, columns=unique_words)

    co_matrix_df = co_matrix_df.reindex(co_matrix_df.sum().sort_values(ascending=False).index, axis=1)
    co_matrix_df = co_matrix_df.reindex(co_matrix_df.sum().sort_values(ascending=False).index, axis=0)

    # Return the co-occurrence matrix
    return co_matrix_df

In [None]:
# CuPy GPU version:

import cupy as cp
import pandas as pd
from tqdm import tqdm

def cooccurrence_matrix_sentence_generator(preproc_sentences):
    # Flatten the list of sentences to get all unique words
    unique_words = list(set(word for sentence in preproc_sentences for word in sentence))
    word_index = {word: idx for idx, word in enumerate(unique_words)}
    vocab_size = len(unique_words)

    # Initialize the co-occurrence matrix
    co_matrix = cp.zeros((vocab_size, vocab_size), dtype=cp.int32)

    # Compute co-occurrences
    for sentence in tqdm(preproc_sentences):
        word_indices = [word_index[word] for word in sentence if word in word_index]
        for i in range(len(word_indices)):
            co_matrix[word_indices[i], word_indices[i:]] += 1
            co_matrix[word_indices[i:], word_indices[i]] += 1

    # Convert the co-occurrence matrix to a NumPy array for compatibility with pandas
    co_matrix_np = cp.asnumpy(co_matrix)

    # Create a DataFrame for better readability
    co_matrix_df = pd.DataFrame(co_matrix_np, index=unique_words, columns=unique_words)

    co_matrix_df = co_matrix_df.reindex(co_matrix_df.sum().sort_values(ascending=False).index, axis=1)
    co_matrix_df = co_matrix_df.reindex(co_matrix_df.sum().sort_values(ascending=False).index, axis=0)

    # Return the co-occurrence matrix
    return co_matrix_df

In [285]:
co_occr_matrix = cooccurrence_matrix_sentence_generator(data_preproc_grouped['meals'])

 27%|██▋       | 27/100 [00:03<00:08,  8.17it/s]


KeyboardInterrupt: 

In [None]:
from mittens import GloVe
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train GloVe model on the co-occurrence matrix
glove_model = GloVe(n=200, max_iter=150, display_progress=10)  # 25 is the embedding dimension
embeddings = glove_model.fit(co_occr_matrix)

# Generate vectors for each review using GloVe embeddings
def get_review_vector_glove(review, embeddings, vocab, vector_size=25):
    words = review.split()
    vector = np.sum([embeddings[vocab[word]] for word in words if word in vocab], axis=0)
    return vector / len(words) if words else np.zeros(vector_size)

# Apply the function to each review and create a new column
data_preproc_review_per_row['Glove_Vector'] = data_preproc_review_per_row['Review_Preprocessed_No_Pos'].apply(lambda x: get_review_vector_glove(x, embeddings, vocab))

# Assuming 'cuisine_vector' is the target column
X = np.array(data_preproc_review_per_row['Glove_Vector'].tolist())
y = np.array(data_preproc_review_per_row['cuisine_vector'].tolist())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Predict the cuisine vector
y_pred = nb_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

AttributeError: 'Series' object has no attribute 'reshape'

# **Count Vectorizer of Meals Column**

In [291]:
from sklearn.feature_extraction.text import CountVectorizer


def count_vectorizer(df, column_name):
    # Initialize the CountVectorizer
    cv = CountVectorizer()

    # Fit and transform the specified column
    cv.fit(df[column_name])

    # Transform the specified column
    count_vector = cv.transform(df[column_name])

    # Create a DataFrame for better readability
    count_vector_df = pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names_out())

    return count_vector_df

# Apply the function to the 'Review_Preprocessed' column
data_preproc_grouped = count_vectorizer(data_preproc_grouped, 'meals')
data_preproc_grouped = count_vectorizer(data_preproc_grouped, 'Review_Preprocessed_No_Pos')

data_preprocessed_many_rows = count_vectorizer(data_preprocessed_many_rows, 'meals')
data_preprocessed_many_rows = count_vectorizer(data_preprocessed_many_rows, 'Review_Preprocessed_No_Pos')

AttributeError: 'list' object has no attribute 'lower'

# **Split the data**

In [288]:
X = data_preproc_grouped['Word2Vec_Vector'].to_list()
y = data_preproc_grouped['Cuisine_Vector'].to_list()

KeyError: 'Word2Vec_Vector'

In [243]:
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()

X_one_hot = data_preproc_review_per_row['reviews'].copy()
X_one_hot = onehot.fit_transform(X).toarray()
X_one_hot.shape

(5904, 123326)

# Fitting the model

In [259]:
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [287]:
#moc = MultiOutputClassifier(GaussianNB())                                                #f1_score: 0.34
moc = MultiOutputClassifier(LogisticRegression(solver='lbfgs', class_weight="balanced"))  #f1_score: 0.45 on reviews_rows / f1_score: 0.40
#moc = MultiOutputClassifier(RandomForestClassifier())                                    #f1_score: 0.34


In [283]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming y_train is a list of arrays, convert it to a numpy array for easier manipulation
y_train = np.array(y_train)

# Check unique labels in y_train for each class
for i in range(y_train.shape[1]):
    unique_labels = np.unique(y_train[:, i])
    print(f"Class {i} unique labels: {unique_labels}")

# Remove classes with only one unique label
valid_classes = [i for i in range(y_train.shape[1]) if len(np.unique(y_train[:, i])) > 1]
y_train_filtered = y_train[:, valid_classes]

# Train the model with filtered y_train
moc = MultiOutputClassifier(LogisticRegression(solver='lbfgs', class_weight="balanced"))
moc.fit(X_train, y_train_filtered)

# Convert y_test to a numpy array
y_test = np.array(y_test)

# Predict and evaluate
y_pred = moc.predict(X_test)
print(classification_report(y_test[:, valid_classes], y_pred))

Class 0 unique labels: [0 1]
Class 1 unique labels: [0 1]
Class 2 unique labels: [0 1]
Class 3 unique labels: [0 1]
Class 4 unique labels: [0 1]
Class 5 unique labels: [0 1]
Class 6 unique labels: [0 1]
Class 7 unique labels: [0 1]
Class 8 unique labels: [0 1]
Class 9 unique labels: [0 1]
Class 10 unique labels: [0 1]
Class 11 unique labels: [0 1]
Class 12 unique labels: [0 1]
Class 13 unique labels: [0 1]
Class 14 unique labels: [0 1]
Class 15 unique labels: [0 1]
Class 16 unique labels: [0 1]
Class 17 unique labels: [0 1]
Class 18 unique labels: [0 1]
Class 19 unique labels: [0 1]
Class 20 unique labels: [0]
Class 21 unique labels: [0 1]
Class 22 unique labels: [0 1]
Class 23 unique labels: [0 1]
Class 24 unique labels: [0 1]
Class 25 unique labels: [0]
Class 26 unique labels: [0 1]
Class 27 unique labels: [0 1]
Class 28 unique labels: [0 1]
Class 29 unique labels: [0 1]
Class 30 unique labels: [0 1]
Class 31 unique labels: [0 1]
Class 32 unique labels: [0 1]
Class 33 unique labels: 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
