In [6]:
import time
import numpy as np
import pandas as pd
import tqdm
import re
import matplotlib.pyplot as plt
import emoji
import Preprocessing as preproc
import nltk
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session
# (pandas and library deprecation warnings included) — consider a narrower filter.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources named below; NLTK logs "already up-to-date" for
# packages that are present locally. The value of the last call is what the
# cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\wojci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [4]:
import pickle

# Read the pickled object from a path relative to the notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load files produced by a trusted step of this project.
with open('data_hyderabad/data_preprocessed_classification.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

In [5]:
# Preview the first three rows of the loaded frame.
loaded_data.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","Chinese, Continental, Kebab, European, South I...",[]
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","Chinese, Continental, Kebab, European, South I...",[]
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","Chinese, Continental, Kebab, European, South I...",[Penne Alfredo Pasta]


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

def split_list_into_onehot_labels(dataframe, column_name):
    """
    Split a comma-separated label column into lists and attach a multi-hot vector per row.

    The work is done on a copy, so the caller's frame is left untouched and the
    call can be repeated safely. (Overwriting ``column_name`` on the input made a
    second run fail with ``'list' object has no attribute 'split'``.)

    Args:
        dataframe (pd.DataFrame): The input DataFrame containing the data.
        column_name (str): Column whose values are either ", "-separated strings
            or already-split iterables of labels.
    Returns:
        pd.DataFrame: A copy of ``dataframe`` in which ``column_name`` holds a list
        of labels per row, plus a new ``'Cuisine_Vector'`` column holding that
        row's binary indicator vector as produced by ``MultiLabelBinarizer``.
    """
    encoded = dataframe.copy()

    # Strings are split on ", "; anything else is assumed to be an iterable of
    # labels already (e.g. the output of a previous call) and is kept as a list.
    encoded[column_name] = encoded[column_name].apply(
        lambda value: value.split(", ") if isinstance(value, str) else list(value)
    )

    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(encoded[column_name])

    # One row of the indicator matrix per record, stored as a single object column.
    encoded['Cuisine_Vector'] = list(label_matrix)
    return encoded

# Encode the 'Cuisines' column and keep the result as `data_joined`.
data_joined = split_list_into_onehot_labels(loaded_data, 'Cuisines')

In [10]:
# Preview the first three rows, now including the 'Cuisine_Vector' column.
data_joined.head(3)

Unnamed: 0,Restaurant,Review,Review_Preprocessed,Cuisines,meals,Cuisine_Vector
0,Beyond Flavours,"The ambience was good, food was quite good . h...","[(ambience, NN), (good, JJ), (food, NN), (quit...","[Chinese, Continental, Kebab, European, South ...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,"[(ambience, NN), (good, JJ), (pleasant, JJ), (...","[Chinese, Continental, Kebab, European, South ...",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,"[(must, MD), (try, VB), (great, JJ), (food, NN...","[Chinese, Continental, Kebab, European, South ...",[Penne Alfredo Pasta],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ..."


In [13]:
# Inspect the raw 'meals' value stored under index 0.
data_joined['meals'][0]

[]

In [None]:
# NOTE(review): `data_preprocessed_many_rows` is not defined anywhere in the visible
# notebook and this cell has no execution count — it looks like a leftover draft of
# the grouping done in transform_and_group_dataframe. Confirm before running, or delete.
# NOTE(review): 'meals' goes through eval(), so each value is presumably a string
# holding a Python list literal. eval executes arbitrary code; prefer
# ast.literal_eval if this cell is kept.
data_preprocessed_groupedby_restaurant = data_preprocessed_many_rows.groupby('Restaurant').agg({
    'Review': lambda x: ', '.join(x),
    'Review_Preprocessed_No_Pos': lambda x: ', '.join([', '.join(tokens) for tokens in x]),
    'Review_Preprocessed': lambda x: ', '.join([', '.join([f"({token}, {pos})" for token, pos in tokens]) for tokens in x]),
    'meals': lambda x: ', '.join([meal for sublist in x for meal in eval(sublist)])
}).reset_index().rename(columns={"Review_Preprocessed": "Review_Preprocessed_Pos"})

In [40]:
import spacy

# Load the spaCy pipeline once at module level; lemmatize_text reads it via `nlp`.
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Run `text` through the module-level spaCy pipeline and return its lemmas joined by single spaces."""
    lemmas = (tok.lemma_ for tok in nlp(text))
    return " ".join(lemmas)


def transform_and_group_dataframe(df):
    """
    Keep reviews that mention at least one meal and collapse them to one row per restaurant.

    Steps:
        1. Drop rows whose ``meals`` entry is empty.
        2. Derive ``Review_Preprocessed_No_Pos`` by taking the token out of each
           ``(token, pos)`` pair in ``Review_Preprocessed``.
        3. Group by ``Restaurant`` and join each column's values into one
           ", "-separated string (POS pairs are rendered as ``"(token, pos)"``).
        4. Pass the token string and the meals string through ``lemmatize_text``.

    Args:
        df (pd.DataFrame): Frame with 'Restaurant', 'Review', 'Review_Preprocessed'
            (iterables of (token, pos) pairs) and 'meals' (iterables of strings).
            It is not modified.
    Returns:
        pd.DataFrame: One row per restaurant with the columns 'Restaurant', 'Review',
        'Review_Preprocessed_No_Pos', 'Review_Preprocessed_Pos' and 'meals'.
    """
    # Take an explicit copy of the filtered rows: adding a column to a filtered
    # slice raises pandas' SettingWithCopyWarning (silenced in this notebook by the
    # global warnings filter) and leaves it unclear whether the caller's frame is touched.
    with_meals = df[df['meals'].apply(lambda x: len(x) > 0)].copy()

    # Strip the POS tag from every (token, pos) pair.
    with_meals['Review_Preprocessed_No_Pos'] = with_meals['Review_Preprocessed'].apply(
        lambda pairs: [token for token, pos in pairs]
    )

    # Grouping: every aggregated column becomes a single ", "-joined string.
    grouped = with_meals.groupby('Restaurant').agg({
        'Review': lambda x: ', '.join(x),
        'Review_Preprocessed_No_Pos': lambda x: ', '.join([', '.join(tokens) for tokens in x]),
        'Review_Preprocessed': lambda x: ', '.join([', '.join([f"({token}, {pos})" for token, pos in tokens]) for tokens in x]),
        'meals': lambda x: ', '.join([meal for sublist in x for meal in sublist])
    }).reset_index().rename(columns={"Review_Preprocessed": "Review_Preprocessed_Pos"})

    # Only lemmatisation is applied here — nothing is lowercased.
    # NOTE(review): lemmatize_text re-joins tokens with spaces, so the ", "
    # separators come back as " , " in these two columns.
    columns_to_lemmatize = ['Review_Preprocessed_No_Pos', 'meals']
    grouped[columns_to_lemmatize] = grouped[columns_to_lemmatize].apply(
        lambda column: column.apply(lemmatize_text)
    )

    return grouped


# Build the per-restaurant frame from `data_joined` and display it.
test = transform_and_group_dataframe(data_joined)
test

Unnamed: 0,Restaurant,Review,Review_Preprocessed_No_Pos,Review_Preprocessed_Pos,meals
0,10 Downing Street,I've been to this place about two times and i ...,"' ve , place , two , time , really , like , am...","('ve, VBP), (place, NN), (two, CD), (times, NN...","lasagna , veg Platter , lasagna roll , beer , ..."
1,13 Dhaba,I didn't go and eat at the Dhaba. I had ordere...,"go , eat , dhaba , ordered , taste , amazing ,...","(go, VB), (eat, VB), (dhaba, NNP), (ordered, V...","lassi , Chole bhature , Lassi , chole bhature ..."
2,"3B's - Buddies, Bar & Barbecue",Gobind Passionate in serving Polite in nature ...,"gobind , passionate , serving , polite , natur...","(gobind, NNP), (passionate, NNP), (serving, VB...","Polite , Pan ice cream , pan ice cream , pan i..."
3,AB's - Absolute Barbecues,Excellent service by nandan and rahmat and rip...,"excellent , service , nandan , rahmat , ripan ...","(excellent, JJ), (service, NN), (nandan, NN), ...","ripan , politley sarvice , fish , pankaj , cak..."
4,Absolute Sizzlers,Service was pathetic. Ordered a sizzler with l...,"service , pathetic , order , sizzler , lamb , ...","(service, NNP), (pathetic, JJ), (ordered, VBD)...","ler , lamb , lamb , Noodles , rice , noodle , ..."
...,...,...,...,...,...
95,Urban Asia - Kitchen & Bar,This place is highly recommended. It is workin...,"place , highly , recommend , work , eat , indi...","(place, NN), (highly, RB), (recommended, JJ), ...","noodle , Sanghai Fried Rice , fish , sauce , n..."
96,Yum Yum Tree - The Arabian Food Court,It is at th floor of Act Boutique building tha...,"th , floor , act , boutique , building , entra...","(th, JJ), (floor, NN), (act, NNP), (boutique, ...","mutton Haleem , Chicken Fahm Mandi , chicken h..."
97,Zega - Sheraton Hyderabad Hotel,"My husband and I, visited Zega for their dimsu...","husband , visit , zega , dimsum , festival , d...","(husband, NN), (visited, VBD), (zega, NNP), (d...","thukpa , spice , dimsums , chicken Gyoza , dim..."
98,Zing's Northeast Kitchen,After so many of goody goody excellent reviews...,"many , goody , goody , excellent , review , n ...","(many, JJ), (goody, NN), (goody, NN), (excelle...","chalega , pork , beef , meat , meat , veg momo..."
