In [1]:
import pandas as pd
import numpy as np
import json
import sys
sys.path.append('..')

In [17]:
def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


yelp_academic_info = _read_json('../assets/yelp_academic.json')
yelp_business_info = _read_json('../assets/yelp_business.json')

# Build the dtype dictionaries: one entry per top-level key of each JSON
# document, mapped to that entry's 'type' field.
yelp_academic_dtype = {
    column: spec['type'] for column, spec in yelp_academic_info.items()
}
yelp_business_dtype = {
    column: spec['type'] for column, spec in yelp_business_info.items()
}

In [18]:
# Read the review and business CSV exports as UTF-8, forcing the column
# dtypes collected in yelp_academic_dtype / yelp_business_dtype.
review_csv_path = '../data/yelp_academic_dataset_review.csv'
business_csv_path = '../data/yelp_academic_dataset_business.csv'

yelp_academic = pd.read_csv(
    review_csv_path, dtype=yelp_academic_dtype, encoding='utf-8')
yelp_business = pd.read_csv(
    business_csv_path, dtype=yelp_business_dtype, encoding='utf-8')

下面先查看每个城市的商家数量。

In [None]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Use the ggplot style sheet for every figure in this notebook.
plt.style.use('ggplot')
# Font loaded from the bundled PingFang.ttc file; the plotting cells pass it
# as `fontproperties` for their Chinese axis labels and titles.
PingFang = FontProperties(fname='../assets/PingFang.ttc')

In [None]:
# Select the colour palette used by all charts below.
# Palette.json is read as a mapping: colour code -> {palette name -> colours}.
color_code = '#E5E5E5'
palette_key = 'Twisted Spot Palette'
palette_path = '../assets/Palette.json'

# Fail loudly if the palette file is missing or malformed. The previous code
# only *constructed* exception objects without raising them, so errors were
# silently ignored and surfaced later as a confusing NameError on main_color.
try:
    with open(palette_path, 'r', encoding='utf-8') as palette_file:
        palettes = json.load(palette_file)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"找不到调色板文件：{palette_path}") from exc
except json.JSONDecodeError as exc:
    raise ValueError("JSON文件格式不正确。") from exc

# main_color must be a non-empty sequence: later cells index main_color[0].
main_color = palettes.get(color_code, {}).get(palette_key, [])
if not main_color:
    raise KeyError(
        f"调色板文件中找不到 {color_code!r} / {palette_key!r} 对应的配色。")
print("Main Color Palette:", main_color)

In [None]:
# Group by city and count businesses (non-null 'name' entries per city),
# then rank the cities from most to fewest businesses.
yelp_business_groups = yelp_business.groupby('city').count()
yelp_business_city = yelp_business_groups['name']
yelp_business_city_sorted = yelp_business_city.sort_values(ascending=False)

# Bar chart of the 30 cities with the most businesses.
top_cities = yelp_business_city_sorted.iloc[:30]
fig, ax = plt.subplots(figsize=(12, 8))
top_cities.plot.bar(ax=ax, color=main_color[0])
ax.set_xticklabels(
    top_cities.index, rotation=45, ha='right', fontproperties=PingFang)
ax.set_xlabel('城市', fontproperties=PingFang)
ax.set_ylabel('商家数量', fontproperties=PingFang)

plt.tight_layout()
# The PNG is written before the title is set, so the exported image has no
# title; only the figure displayed in the notebook carries it.
plt.savefig('../images/01_每个城市的商家数量.png', dpi=600)
ax.set_title('每个城市的商家数量（前30名）', fontproperties=PingFang, fontsize=16)
plt.show()

查看不同类型的企业及其数量

In [None]:
# Count how often each category label occurs across all businesses.
# Rows with a missing `categories` value are skipped (str(NaN) would otherwise
# be tallied as a bogus 'nan' category), and each label is stripped so that
# e.g. ' Food' and 'Food' are counted as the same category.
categories_dict = {}
for raw_categories in yelp_business.categories.dropna():
    for label in str(raw_categories).split(','):
        label = label.strip()
        if label:
            categories_dict[label] = categories_dict.get(label, 0) + 1

# Rank categories from most to least frequent.
categories_series = pd.Series(categories_dict).sort_values(ascending=False)

# Bar chart of the 30 most frequent categories.
top_categories = categories_series.iloc[:30]
fig, ax = plt.subplots(figsize=(12, 8))
top_categories.plot(kind='bar', color=main_color[0], ax=ax)
ax.set_xticklabels(
    top_categories.index, fontproperties=PingFang,
    rotation=45, ha='right', wrap=True,
    fontsize=8)

ax.set_xlabel('类别', fontproperties=PingFang)
ax.set_ylabel('数量', fontproperties=PingFang)
plt.tight_layout()
# The PNG is written before the title is set, so the exported image has no
# title; only the figure displayed in the notebook carries it.
plt.savefig('../images/02_最受欢迎的商家类别.png', dpi=600)
ax.set_title('最受欢迎的商家类别（前30名）', fontproperties=PingFang, fontsize=16)
plt.show()

## 1.3. 商家类别与城市的有序对及其计数

In [None]:
# Build business_city: a dict mapping (category, city) -> number of businesses.
# Rows missing either the categories or the city are skipped: str(NaN) would
# otherwise produce a bogus 'nan' category, and NaN is not a reliable dict key
# because NaN != NaN. Labels are stripped so that ' Food' and 'Food' count as
# the same category.
business_city = {}
category_city = yelp_business[['categories', 'city']].dropna()

# Iterate over the two columns directly instead of calling .iloc once per row.
for raw_categories, city in zip(category_city['categories'],
                                category_city['city']):
    for label in str(raw_categories).split(','):
        label = label.strip()
        if label:
            pair = (label, city)
            business_city[pair] = business_city.get(pair, 0) + 1

# Convert the dict to a Series and rank pairs from most to least frequent.
business_city_series = pd.Series(business_city).sort_values(ascending=False)

# Bar chart of the 30 most frequent (category, city) pairs.
top_pairs = business_city_series.iloc[:30]
fig, ax = plt.subplots(figsize=(12, 8))
top_pairs.plot(kind='bar', color=main_color[0], ax=ax)

ax.set_xticklabels(
    top_pairs.index,
    fontproperties=PingFang,
    rotation=45, ha='right',
    wrap=True, fontsize=8)

ax.set_xlabel('城市-分类', fontproperties=PingFang)
ax.set_ylabel('商家数量', fontproperties=PingFang)
plt.tight_layout()
# The PNG is written before the title is set, so the exported image has no
# title; only the figure displayed in the notebook carries it.
plt.savefig('../images/03_城市和商家类别组合数量.png', dpi=600)
ax.set_title('城市和商家类别组合数量（前30名）', fontproperties=PingFang, fontsize=16)
plt.show()

可以看出，Yelp 上列出的多数商家是餐馆、购物场所和娱乐中心。对于这组数据（来自 Yelp 网站），拥有最多商家的主要城市是 `Las Vegas`、`Phoenix` 和 `Scottsdale`。

# 2. 清理评论数据

## 2.1 删除不需要的列
- `text`: 评论文本,
- `votes.useful`、`votes.cool`、`votes.funny`: 认为该评论有用 / 很酷 / 有趣的投票数,
- `date`: 日期

剩下的列是 `user_id`, `business_id` 和 `stars`.

In [None]:
# Discard the review text, the three vote counters and the date, then preview
# the first rows of the reduced table.
columns_to_drop = ['text', 'votes.useful', 'votes.cool', 'date', 'votes.funny']
yelp_cleaned = yelp_academic.drop(columns=columns_to_drop)
yelp_cleaned.head()

## 2.2 合并数据
合并业务数据`yelp_academic_dataset_business.csv`与评论数据`yelp_academic_dataset_review.csv`

将业务数据和评论数据利用 'business_id' 为主键合并。像business的 'city' 和 'categories' 这样的信息是从业务数据文件中获取的。

In [None]:
# Attach each review's city and categories by inner-joining the reviews with
# the business table on business_id.
business_lookup = yelp_business[['city', 'categories', 'business_id']]
yelp_full = pd.merge(
    yelp_cleaned, business_lookup, how='inner', on='business_id')

# Drop every row that contains a missing value in any column.
df = yelp_full.dropna()

## 2.3. 获取评论数量最多的前30名业务类别和城市

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
# NOTE(review): get_top_m_num_reviews_for_city_and_business is neither defined
# nor imported in the visible cells — confirm where it comes from, otherwise
# this cell raises NameError on a fresh kernel.
# The second argument (10) is presumably a top-m cutoff — TODO confirm.
review_business_city = get_top_m_num_reviews_for_city_and_business(df, 10)

# Plot the first 30 entries of the helper's result as a bar chart.
top_review_pairs = review_business_city[:30]
top_review_pairs.plot(kind='bar', ax=ax, color=main_color[0])

ax.set_xticklabels(
    top_review_pairs.index,
    rotation=45, ha='right',
    fontproperties=PingFang,
    wrap=True, fontsize=8,
)
ax.set_xlabel('城市-业务类别', fontproperties=PingFang)
ax.set_ylabel('评论数量', fontproperties=PingFang)

plt.tight_layout()
# The PNG is written before the title is set, so the exported image has no
# title; only the figure displayed in the notebook carries it.
plt.savefig('../images/04_获得评论最多的城市和商家类别组合.png', dpi=600)
ax.set_title('获得评论最多的城市和商家类别组合（前30名）', fontproperties=PingFang, fontsize=16)
plt.show()

In [None]:
# Write the merged table (rows with missing values already dropped) to disk,
# omitting the index column.
final_csv_path = '../data/Yelp_final.csv'
df.to_csv(final_csv_path, index=False)