## 1. Load Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

## 2. Load Data

In [3]:
# Directory that holds the original Yelp JSON dumps. The /content/drive prefix
# looks like a mounted Google Drive in Colab -- adjust this single constant when
# running elsewhere.
DATA_DIR = "/content/drive/MyDrive/DAB322_Capstone 1_Group 9/Yelp_Datasets/Original_JSON_files"

# Dataset name -> full path of its JSON-lines file.
files = {
    "business": f"{DATA_DIR}/yelp_academic_dataset_business.json",
    "checkin": f"{DATA_DIR}/yelp_academic_dataset_checkin.json",
    "photos": f"{DATA_DIR}/photos.json",
    "review": f"{DATA_DIR}/yelp_academic_dataset_review.json",
    "tip": f"{DATA_DIR}/yelp_academic_dataset_tip.json",
    "user": f"{DATA_DIR}/yelp_academic_dataset_user.json"
}

# Maximum number of rows read from each file.
# NOTE: nrows takes the FIRST `limit` records of each file, not a random sample,
# so the six frames are not guaranteed to share business_id / user_id values.
limit = 80000

# Read every file as JSON lines (one record per line); nrows is only valid
# together with lines=True.
datasets = {}
for name, path in files.items():
    df = pd.read_json(path, lines=True, nrows=limit)
    datasets[name] = df
    print(f"{name}: {len(df)} rows read")

# Convenience aliases so later cells can refer to each frame by a short name.
business = datasets["business"]
checkin = datasets["checkin"]
photos = datasets["photos"]
review = datasets["review"]
tip = datasets["tip"]
user = datasets["user"]

business: 80000 rows read
checkin: 80000 rows read
photos: 80000 rows read
review: 80000 rows read
tip: 80000 rows read
user: 80000 rows read


## 3. Data Overview

### Business Data

In [4]:
# Overview of the business frame.
# A notebook cell only renders its LAST bare expression, so every intermediate
# result is passed to display() explicitly; otherwise it is computed and dropped.
business.info()                      # prints schema / non-null counts to stdout
display(business.head())             # first rows
display(business.isnull().sum())     # missing values per column
display(business.describe())         # summary statistics of numeric columns
# Counts exact `categories` strings, so the same tags in a different order
# (e.g. "Restaurants, Pizza" vs "Pizza, Restaurants") are tallied separately.
business['categories'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   80000 non-null  object 
 1   name          80000 non-null  object 
 2   address       80000 non-null  object 
 3   city          80000 non-null  object 
 4   state         80000 non-null  object 
 5   postal_code   80000 non-null  object 
 6   latitude      80000 non-null  float64
 7   longitude     80000 non-null  float64
 8   stars         80000 non-null  float64
 9   review_count  80000 non-null  int64  
 10  is_open       80000 non-null  int64  
 11  attributes    72775 non-null  object 
 12  categories    79947 non-null  object 
 13  hours         67684 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 8.5+ MB


Unnamed: 0_level_0,count
categories,Unnamed: 1_level_1
"Beauty & Spas, Nail Salons",513
"Restaurants, Pizza",469
"Nail Salons, Beauty & Spas",469
"Pizza, Restaurants",455
"Restaurants, Chinese",395
...,...
"Appliances & Repair, Shopping, Home & Garden, Hardware Stores, Local Services, Home Services, Building Supplies, Contractors, Appliances, Nurseries & Gardening",1
"Restaurants, Sandwiches, Breakfast & Brunch, Steakhouses, Comfort Food, Burgers",1
"Cocktail Bars, Bars, Beer Bar, Nightlife, Music Venues, Arts & Entertainment",1
"Beauty & Spas, Reflexology, Reiki, Massage, Event Planning & Services, Party & Event Planning, Professional Services, Health & Medical",1


### Checkin Data

In [5]:
# Overview of the check-in frame.
# Only the last bare expression of a cell is rendered, so head() needs an
# explicit display() to be visible.
display(checkin.head())     # first rows
checkin.info()              # prints schema / non-null counts to stdout
checkin.isnull().sum()      # missing values per column (rendered as cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  80000 non-null  object
 1   date         80000 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


Unnamed: 0,0
business_id,0
date,0


### User Data

In [6]:
# Overview of the user frame.
# Only the last bare expression of a cell is rendered, so the intermediate
# results are passed to display() explicitly.
user.info()                      # prints schema / non-null counts to stdout
display(user.head(10))           # first ten rows
display(user.isnull().sum())     # missing values per column
user.describe()                  # summary statistics of numeric columns (cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             80000 non-null  object 
 1   name                80000 non-null  object 
 2   review_count        80000 non-null  int64  
 3   yelping_since       80000 non-null  object 
 4   useful              80000 non-null  int64  
 5   funny               80000 non-null  int64  
 6   cool                80000 non-null  int64  
 7   elite               80000 non-null  object 
 8   friends             80000 non-null  object 
 9   fans                80000 non-null  int64  
 10  average_stars       80000 non-null  float64
 11  compliment_hot      80000 non-null  int64  
 12  compliment_more     80000 non-null  int64  
 13  compliment_profile  80000 non-null  int64  
 14  compliment_cute     80000 non-null  int64  
 15  compliment_list     80000 non-null  int64  
 16  comp

Unnamed: 0,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,105.25235,258.949988,110.985362,158.059025,8.9044,3.858785,11.320863,1.844013,1.213763,0.838387,0.498075,8.859288,20.06255,17.9295,17.9295,7.273038,6.185812
std,258.167414,1821.486859,1180.987783,1561.391419,64.13221,0.643144,137.593105,24.275875,35.209503,14.030645,14.016817,77.750325,264.741345,192.494866,192.494866,78.529912,116.468333
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.0,9.0,1.0,2.0,0.0,3.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31.0,32.0,7.0,9.0,1.0,3.9,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,94.0,119.0,31.0,40.0,4.0,4.25,1.0,1.0,0.0,0.0,0.0,3.0,3.0,3.0,3.0,2.0,1.0
max,17473.0,206296.0,185823.0,195814.0,12497.0,5.0,12391.0,4347.0,7039.0,1744.0,2607.0,8616.0,28974.0,13280.0,13280.0,9821.0,14045.0


### Tip Data

In [7]:
# Overview of the tip frame.
# The cell ends with tip.info(), which prints and returns None, so none of the
# preceding expressions would be rendered without an explicit display().
display(tip.head())              # first rows
display(tip.isnull().sum())      # missing values per column
display(tip.describe())          # summary statistics
tip.info()                       # prints schema / non-null counts to stdout

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   user_id           80000 non-null  object        
 1   business_id       80000 non-null  object        
 2   text              80000 non-null  object        
 3   date              80000 non-null  datetime64[ns]
 4   compliment_count  80000 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 3.1+ MB


### Review Data

In [8]:
# Overview of the review frame.
# Only the last bare expression of a cell is rendered, so the intermediate
# results are passed to display() explicitly.
review.info()                      # prints schema / non-null counts to stdout
display(review.head(10))           # first ten rows
display(review.isnull().sum())     # missing values per column
review.describe()                  # summary statistics (cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    80000 non-null  object        
 1   user_id      80000 non-null  object        
 2   business_id  80000 non-null  object        
 3   stars        80000 non-null  int64         
 4   useful       80000 non-null  int64         
 5   funny        80000 non-null  int64         
 6   cool         80000 non-null  int64         
 7   text         80000 non-null  object        
 8   date         80000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 5.5+ MB


Unnamed: 0,stars,useful,funny,cool,date
count,80000.0,80000.0,80000.0,80000.0,80000
mean,3.844512,0.89655,0.257675,0.347212,2015-05-03 02:10:53.944837632
min,1.0,0.0,0.0,0.0,2005-03-01 17:47:15
25%,3.0,0.0,0.0,0.0,2013-12-08 04:12:03.500000
50%,4.0,0.0,0.0,0.0,2015-09-27 03:43:38
75%,5.0,1.0,0.0,0.0,2017-04-08 14:10:36.500000
max,5.0,171.0,98.0,49.0,2018-10-04 18:22:35
std,1.353924,1.98193,1.022994,1.06127,


### Photos Data

In [9]:
# Overview of the photos frame.
# Only the last bare expression of a cell is rendered, so the intermediate
# results are passed to display() explicitly.
photos.info()                      # prints schema / non-null counts to stdout
display(photos.head(10))           # first ten rows
# isnull() reports 0 for `caption`, yet describe() shows the empty string as its
# most frequent value -- blank captions are not counted as missing here.
display(photos.isnull().sum())     # missing values per column
photos.describe()                  # count / unique / top / freq per column (cell output)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   photo_id     80000 non-null  object
 1   business_id  80000 non-null  object
 2   caption      80000 non-null  object
 3   label        80000 non-null  object
dtypes: object(4)
memory usage: 2.4+ MB


Unnamed: 0,photo_id,business_id,caption,label
count,80000,80000,80000.0,80000
unique,80000,26366,32301.0,5
top,m5svD4kUQ3wWAwSTb-ydkw,FEXhWNCMkv22qG04E83Qjg,,food
freq,1,200,41510.0,43083
