In [46]:
import time
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import emoji
import Preprocessing as preproc
import nltk
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [47]:
restaurants_raw = pd.read_csv(r"data_hyderabad/105_restaurants.csv")
reviews_raw = pd.read_csv(r"data_hyderabad/10k_reviews.csv")

restaurants_raw.head(5)

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"
1,Paradise,https://www.zomato.com/hyderabad/paradise-gach...,800,Hyderabad's Hottest,"Biryani, North Indian, Chinese",11 AM to 11 PM
2,Flechazo,https://www.zomato.com/hyderabad/flechazo-gach...,1300,"Great Buffets, Hyderabad's Hottest","Asian, Mediterranean, North Indian, Desserts","11:30 AM to 4:30 PM, 6:30 PM to 11 PM"
3,Shah Ghouse Hotel & Restaurant,https://www.zomato.com/hyderabad/shah-ghouse-h...,800,Late Night Restaurants,"Biryani, North Indian, Chinese, Seafood, Bever...",12 Noon to 2 AM
4,Over The Moon Brew Company,https://www.zomato.com/hyderabad/over-the-moon...,1200,"Best Bars & Pubs, Food Hygiene Rated Restauran...","Asian, Continental, North Indian, Chinese, Med...","12noon to 11pm (Mon, Tue, Wed, Thu, Sun), 12no..."


# Review Preprocessing

In [48]:
reviews_data = reviews_raw[reviews_raw["Rating"].notna() & reviews_raw["Review"].notna()]
reviews_data.isna().sum()

Restaurant    0
Reviewer      0
Review        0
Rating        0
Metadata      0
Time          0
Pictures      0
dtype: int64

In [49]:
# split UPPERCASE WORDS 
def splitting_words_process(word):
    # only upper case letters
    if word.isupper():
        return word
    
    # more than one upper case letter inside
    elif re.search(r'[A-Z][a-z]*[A-Z]', word):
        split_word = re.findall(r'[A-Z][a-z]*', word)
        return ' '.join(split_word)
    
    # <2 upper case letters
    else:
        return word

reviews_data['Review'] = reviews_data['Review'].apply(lambda x: ' '.join([splitting_words_process(word) for word in x.split()]))

In [50]:
# Function to replace 'gud', 'goo', 'gd' with the appropriate 'good'
def replace_gud_with_good(text):
    if isinstance(text, str):
        # Define the regex pattern to match 'gud', 'goo', 'gd' in various capitalizations
        pattern = re.compile(r'\b([Gg][Uu][Dd]|[Gg][Oo][Oo]|[Gg][Dd])\b')

        # Replacement function to check the case of the first letter
        def replacement(match):
            word = match.group()
            # Check if the first letter is uppercase, then return 'Good', else 'good'
            if word[0].isupper():
                return 'Good'
            else:
                return 'good'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'good'
reviews_data['Review'] = reviews_data['Review'].apply(replace_gud_with_good)

In [51]:
# Function to replace 'kk', 'Oke', 'k', 'Ok' with 'ok'
def replace_to_ok(text):
    if isinstance(text, str):
        # Define the regex pattern to match the variants of 'ok'
        pattern = re.compile(r'\b(k|kk|Ok|Oke)\b', re.IGNORECASE)

        # Replacement function to return 'ok' for all matched words
        def replacement(match):
            return 'ok'
        
        # Use re.sub to apply the replacement function
        return pattern.sub(replacement, text)
    
    return text

# Apply the function to the 'Review' column to replace the variants of 'ok'
reviews_data['Review'] = reviews_data['Review'].apply(replace_to_ok)

In [52]:
# add space after ! | " | # | $ | % | & | ( | ) | * | + | , | . | : | ; followed immediately by a word
def add_space_after_punctuation(df):

    df['Review'] = df['Review'].apply(lambda text: re.sub(r'([\u0021-\u0026\u0028-\u002C\u002E\u003A-\u003F]+(?=\w))', r'\1 ', text) if isinstance(text, str) else text)
    return df

# Example usage:
reviews_data = add_space_after_punctuation(reviews_data)

In [53]:
# remove gibberish words like "ggggggggggd", "eshjdgue"
def remove_gibberish(text):
    cleaned_text = re.sub(r'\b\w{15,}\b', '', text)  # removes 15+ words
    cleaned_text = re.sub(r'\b\w*(\w)\1{2,}\w*\b', '', cleaned_text)  # removes words that contain 3+ repeating letters

    return cleaned_text

reviews_data['Review'] = reviews_data['Review'].apply(remove_gibberish)

In [54]:
# replace numbers with blank string
reviews_data['Review'] = reviews_data['Review'].replace(r'\d+(\.\d+)?', '', regex=True)

In [55]:
preproc = reviews_data['Review'].apply(lambda x: preproc.main_pipeline(
    x, 
    print_output=False, 
    no_stopwords=False,
    custom_stopwords=[],
    convert_diacritics=True, 
    no_punctuation=False,
    lowercase=False,
    lemmatized=False,
    list_pos=["n","v","a","r","s"],
    stemmed=False, 
    pos_tags_list='pos_tuples',
    tokenized_output=False
))

In [56]:
# remove stopwords
def remove_stopwords(tokens):
    punctuation_pattern = "[\u0021-\u0026\u0028-\u002C\u002E-\u002F\u003A-\u003F\u005B-\u005F\u2010-\u2028\ufeff`]+"
    stopwords = set(nltk.corpus.stopwords.words("english"))

    return [(token.lower(), pos) for token, pos in tokens if token.lower() not in stopwords and not re.match(punctuation_pattern, token)]

reviews_data['Review_Preprocessed'] = preproc.apply(remove_stopwords)

In [57]:
data_joined = pd.merge(reviews_data, restaurants_raw, left_on='Restaurant',right_on='Name', how='left')
data_joined.head(1)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,Review_Preprocessed,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,"[(ambience, NN), (good, JJ), (food, NN), (quit...",Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"


In [58]:
data_joined = data_joined[['Restaurant', 'Review', 'Review_Preprocessed', 'Cuisines']]
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I..."
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I..."
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I..."


In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer_roberta = AutoTokenizer.from_pretrained("Dizex/InstaFoodRoBERTa-NER")
model_roberta = AutoModelForTokenClassification.from_pretrained("Dizex/InstaFoodRoBERTa-NER")

In [20]:
pipe = pipeline("ner", model=model_roberta, tokenizer=tokenizer_roberta)

In [None]:
from tqdm import tqdm

ner_entity_results = pipe(list(data_joined['Review']), aggregation_strategy="simple")

In [26]:
def convert_entities_to_list(df: pd.DataFrame, entities: list[list[dict]]) -> pd.DataFrame:
    def extract_entities(text, entity_list):
        ents = []
        for ent in entity_list:
            e = {"start": ent["start"], "end": ent["end"], "label": ent["entity_group"]}
            if ents and (-1 <= ent["start"] - ents[-1]["end"] <= 1) and (ents[-1]["label"] == e["label"]):
                ents[-1]["end"] = e["end"]
                continue
            ents.append(e)
        return [text[e["start"]:e["end"]] for e in ents]

    df['meals'] = [extract_entities(text, entity_list) for text, entity_list in zip(df['Review'], entities)]
    return df

In [None]:
data_joined = convert_entities_to_list(data_joined, ner_entity_results)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I...",[]
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I...",[]
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I...",[Penne Alfredo Pasta]
3,Beyond Flavours,Soumen das and Arun was a great guy. Only beca...,"[(soumen, NNP), (das, NNS), (arun, NNP), (grea...","Chinese, Continental, Kebab, European, South I...",[]
4,Beyond Flavours,Food is good. we ordered Kodi drumsticks and b...,"[(food, NN), (good, JJ), (ordered, VBD), (kodi...","Chinese, Continental, Kebab, European, South I...","[Kodi drumsticks, basket mutton biryani]"
...,...,...,...,...,...
95,Beyond Flavours,Was there for office lunch outing. Rating woul...,"[(office, NN), (lunch, NN), (outing, VBG), (ra...","Chinese, Continental, Kebab, European, South I...",[]
96,Beyond Flavours,"I really enjoyed the follows.... The entrance,...","[(really, RB), (enjoyed, VBD), (entrance, NN),...","Chinese, Continental, Kebab, European, South I...",[]
97,Beyond Flavours,I came first time in this restaurant. The entr...,"[(came, VBD), (first, JJ), (time, NN), (restau...","Chinese, Continental, Kebab, European, South I...",[]
98,Beyond Flavours,Pathetic and horrible experience Ambience and ...,"[(pathetic, JJ), (horrible, JJ), (experience, ...","Chinese, Continental, Kebab, European, South I...",[]


# Find out all the possible labels

In [87]:
data_joined = pd.read_csv(r"data_hyderabad/data_preprocessed_classification.csv")

In [88]:
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[('ambience', 'NN'), ('good', 'JJ'), ('food', ...","Chinese, Continental, Kebab, European, South I...",[]
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[('ambience', 'NN'), ('good', 'JJ'), ('pleasan...","Chinese, Continental, Kebab, European, South I...",[]
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[('must', 'MD'), ('try', 'VB'), ('great', 'JJ'...","Chinese, Continental, Kebab, European, South I...",['Penne Alfredo Pasta']


In [83]:
# delete rows with empty 'meals' column

data_joined = data_joined[data_joined['meals'].apply(lambda x: x != [])]

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def split_list_into_onehot_labels(dataframe, column_name):
    """
    Splits a list of comma-separated values in a specified column of a DataFrame into one-hot encoded labels.
    Args:
        dataframe (pd.DataFrame): The input DataFrame containing the data.
        column_name (str): The name of the column containing comma-separated values to be one-hot encoded.
    Returns:
        pd.DataFrame: The DataFrame with the original column split into one-hot encoded labels.
    Example:
        >>> df = pd.DataFrame({'cuisine': ['Italian, Mexican', 'Chinese, Thai', 'Mexican']})
        >>> split_list_into_onehot_labels(df, 'cuisine')
           cuisine  Chinese  Italian  Mexican  Thai
        0  [Italian, Mexican]        0        1        1     0
        1  [Chinese, Thai]            1        0        0     1
        2  [Mexican]                  0        0        1     0
    """
    mlb = MultiLabelBinarizer()
    dataframe[column_name] = dataframe[column_name].apply(lambda x: x.split(", "))

    cuisine_encoded = mlb.fit_transform(dataframe[column_name])

    dataframe = pd.concat([dataframe, pd.DataFrame(cuisine_encoded, columns=mlb.classes_)], axis=1)
    return dataframe

data_joined = split_list_into_onehot_labels(data_joined, 'Cuisines')
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals,American,Andhra,Arabian,Asian,BBQ,...,North Indian,Pizza,Salad,Seafood,South Indian,Spanish,Street Food,Sushi,Thai,Wraps
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[('ambience', 'NN'), ('good', 'JJ'), ('food', ...","[Chinese, Continental, Kebab, European, South ...",[],0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[('ambience', 'NN'), ('good', 'JJ'), ('pleasan...","[Chinese, Continental, Kebab, European, South ...",[],0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[('must', 'MD'), ('try', 'VB'), ('great', 'JJ'...","[Chinese, Continental, Kebab, European, South ...",['Penne Alfredo Pasta'],0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [91]:
data_joined.shape

(9955, 47)