### Import Libraries

In [None]:
import os
import json
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from langdetect import detect
import regex as re

import spacy
myspacy = spacy.load('en_core_web_sm')

import warnings
warnings.filterwarnings('ignore')

### Import Dataset

#### Converting JSON to DataFrame

In [None]:
# Column accumulators for the three Yelp JSON dumps. Each dict maps a field
# name to its own empty list; convert2df() appends one value per record.
reviews = {
    column: []
    for column in (
        'review_id', 'business_id', 'user_id',
        'stars', 'text', 'date',
        'useful', 'funny', 'cool',
    )
}

business = {
    column: []
    for column in (
        'business_id', 'name', 'address',
        'city', 'state', 'postal_code',
        'stars', 'review_count', 'is_open', 'categories',
    )
}

users = {
    column: []
    for column in (
        'user_id', 'name', 'review_count',
        'yelping_since', 'useful', 'funny',
        'cool', 'elite', 'fans', 'friends',
    )
}

def convert2df(filename,df_structure):
    """Read a JSON-lines file into a dict of column lists.

    Each non-blank line of *filename* is parsed as one JSON object, and for
    every key of *df_structure* the matching value of that object is appended
    to the key's list.

    Args:
        filename: Path of the JSON-lines file (one JSON object per line).
        df_structure: Dict mapping column names to lists. It is mutated in
            place, so passing the same dict twice appends the records twice.

    Returns:
        The same *df_structure* object, now filled.

    Raises:
        KeyError: If a record lacks one of the keys of *df_structure*.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so the platform default (e.g. cp1252 on Windows)
    # cannot break on non-ASCII review text.
    with open(filename, encoding="utf-8") as f:
        for line in tqdm(f):
            # A trailing or stray blank line is not a record.
            if not line.strip():
                continue
            row = json.loads(line)
            for column in df_structure:
                df_structure[column].append(row[column])
    return df_structure

In [None]:
# Fill the `business` accumulator from the JSON-lines dump, then turn the
# column lists into a dataframe and preview the first rows.
business_dict = convert2df("yelp_academic_dataset_business.json", business)
business_df = pd.DataFrame(business_dict)
business_df.head()

In [None]:
# Fill the `reviews` accumulator from the JSON-lines dump, then turn the
# column lists into a dataframe and preview the first rows.
# NOTE(review): the official Yelp dump names this file
# yelp_academic_dataset_review.json (singular) — confirm the local file name.
reviews_dict = convert2df("yelp_academic_dataset_reviews.json", reviews)
reviews_df = pd.DataFrame(reviews_dict)
reviews_df.head()

In [None]:
# Fill the `users` accumulator from the JSON-lines dump, then turn the
# column lists into a dataframe and preview the first rows.
# NOTE(review): the official Yelp dump names this file
# yelp_academic_dataset_user.json (singular) — confirm the local file name.
users_dict = convert2df("yelp_academic_dataset_users.json", users)
users_df = pd.DataFrame(users_dict)
users_df.head()

### Data Filtering

#### Filter 1 (tip: save the dataframes to .csv after each step so that earlier steps need not be repeated if the system hangs or runs out of memory)
- Keep only restaurant businesses that are open and have at least 50 reviews.
- Retain the reviews of the filtered restaurants
- Retain the users who have given more than 50 reviews

In [None]:
# Keep only open restaurants with at least 50 reviews.
# na=False makes rows with a missing `categories` value count as
# "not a restaurant" instead of propagating NaN into the boolean mask.
is_restaurant = business_df['categories'].str.contains('Restaurants', na=False)
is_open = business_df["is_open"] == 1
has_enough_reviews = business_df["review_count"] >= 50
business_df = business_df[is_restaurant & is_open & has_enough_reviews]

In [None]:
# Keep only the reviews whose business is still present in business_df.
restaurant_ids = business_df["business_id"].unique()
reviews_df = reviews_df.loc[reviews_df["business_id"].isin(restaurant_ids)]

In [None]:
# Keep only users with more than 50 reviews, then show the remaining shape.
users_df = users_df.loc[users_df["review_count"] > 50]
users_df.shape

#### Filter 2 (tip: save the dataframes to .csv after each step so that earlier steps need not be repeated if the system hangs or runs out of memory)
- Group the restaurants by city and state, and sum their review counts to find a city/state with on the order of 100,000 reviews
- Retain the reviews of filtered restaurants
- Retain the users whose reviews are retained in reviews_df

In [None]:
# Total review count per (state, city), largest first.
(
    business_df
    .groupby(['state', 'city'])[['review_count']]
    .sum()
    .sort_values('review_count', ascending=False)
)

In [None]:
# Restrict the businesses to Cambridge, MA.
business_df = business_df.loc[
    business_df['state'].eq('MA') & business_df['city'].eq('Cambridge')
]

In [None]:
# Keep only the reviews whose business is still present in business_df.
restaurant_ids = business_df["business_id"].unique()
reviews_df = reviews_df.loc[reviews_df["business_id"].isin(restaurant_ids)]

In [None]:
# Distinct authors of the reviews that remain in reviews_df.
# (The original referenced `cambridge_reviews`, which is never defined in
# this notebook; reviews_df is the filtered review table at this point.)
unique_users = pd.DataFrame(reviews_df["user_id"].unique(),columns=['user_id'])

In [None]:
# Keep only the users listed in unique_users.
filterd_user_ids = unique_users['user_id'].tolist()
users_df = users_df.loc[users_df['user_id'].isin(filterd_user_ids)]

#### Saving the dataframes into .csv to avoid repeating the steps again

In [None]:
# Write each filtered dataframe to its own CSV file, without the index column.
for frame, csv_path in (
    (business_df, 'filtered_business.csv'),
    (reviews_df, 'filtered_reviews.csv'),
    (users_df, 'filtered_users.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Reload the filtered dataframes from disk. Each file goes back into its own
# variable; the original assigned all three reads to business_df, which left
# reviews_df / users_df un-reloaded and business_df holding the users table.
business_df = pd.read_csv('filtered_business.csv')
reviews_df = pd.read_csv('filtered_reviews.csv')
users_df = pd.read_csv('filtered_users.csv')

### Data Cleaning

#### Remove non-English reviews from the dataset (and update reviews_df and users_df)

In [None]:
# Tag every review with its detected language and keep the non-English rows
# in reviews_df_1.
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; seeding makes reruns of this
# cell flag the same reviews.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return langdetect's language code for *text*, or 'unknown'.

    'unknown' is returned when *text* is not a string (e.g. NaN after a CSV
    round trip) or when langdetect cannot extract any features from it
    (empty, whitespace-only, digits/punctuation only), instead of letting
    the exception abort the whole `apply`.
    """
    if not isinstance(text, str):
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


non_eng_review_ids = []
# reviews_df_1 is the same object as reviews_df at this point, so the 'lang'
# column is added to reviews_df as well.
reviews_df_1 = reviews_df
reviews_df_1['lang'] = reviews_df_1['text'].apply(_detect_language)
# Everything not detected as English, including 'unknown' rows, is treated as
# non-English from here on.
reviews_df_1 = reviews_df_1[reviews_df_1.lang!='en']

In [None]:
# Collect the ids of the non-English reviews into one flat list, report how
# many there are, and persist them (with the default index column) to CSV.
non_eng_review_ids.append(reviews_df_1.review_id)
non_enlish_review_id_alone = [
    review_id
    for id_series in non_eng_review_ids
    for review_id in id_series
]

total_non_english_review = len(non_enlish_review_id_alone)
print("total non english reviews :",total_non_english_review)

df = pd.DataFrame(non_enlish_review_id_alone)
df.to_csv('non_eng_review.csv')

In [None]:
# Reload the saved ids as a 2-D array: one row per id, with the CSV's index
# column first and the review id second.
df = pd.read_csv('non_eng_review.csv')
non_enlish_review_id_alone = df.to_numpy()

In [None]:
# Drop every review whose id was flagged as non-English.
# A single isin() mask replaces the per-id scan of reviews_df, and it cannot
# produce a None label (the original's first_valid_index() returned None for
# a missing id or NaN text, which made drop() raise).
flagged_ids = np.asarray(non_enlish_review_id_alone, dtype=object)
if flagged_ids.ndim == 2:
    # Reloaded from non_eng_review.csv: column 0 is the CSV index written by
    # to_csv, column 1 holds the review ids.
    flagged_ids = flagged_ids[:, 1]
is_non_english = reviews_df['review_id'].isin(flagged_ids)
# Kept as a list of index labels, as in the original cell.
index_reviews_to_be_dropped = reviews_df.index[is_non_english].tolist()
reviews_df = reviews_df[~is_non_english]

In [None]:
# Recompute the distinct review authors and keep only those users.
unique_users = pd.DataFrame({'user_id': reviews_df["user_id"].unique()})
filterd_user_ids = unique_users['user_id'].tolist()
users_df = users_df.loc[users_df['user_id'].isin(filterd_user_ids)]

### Exploratory Data Analysis