In [1]:
import pandas as pd
import numpy as np
import json
import sys
import pyecharts
# Put the parent directory on the module search path. A later cell runs
# `from src import *`, which presumably resolves to a package located there.
sys.path.append('..')

In [2]:
# Star-import of the project package; later cells call helpers that it provides.
from src import *

# Locations of the Yelp review and business CSV files.
yelp_path = '../data/yelp_academic_dataset_review.csv'
yelp_business_path = '../data/yelp_academic_dataset_business.csv'

# dtype specifications stored as JSON; a later cell hands them to
# pd.read_csv(dtype=...), so they are presumably column -> dtype mappings.
with open('../assets/yelp_dtype.json') as json_file:
    yelp_dtype = json.load(json_file)
with open('../assets/yelp_business_dtype.json') as json_file:
    yelp_business_dtype = json.load(json_file)

In [3]:
# Same paths as assigned in the previous cell; repeated so this cell names its inputs.
yelp_business_path = '../data/yelp_academic_dataset_business.csv'
yelp_path = '../data/yelp_academic_dataset_review.csv'

# Read the review table, then the business table, each with its dtype spec.
yelp = pd.read_csv(yelp_path, dtype=yelp_dtype, encoding='utf-8')
yelp_business = pd.read_csv(yelp_business_path, dtype=yelp_business_dtype, encoding='utf-8')

下面查看每个城市的商家数量（取商家数量最多的前 30 个城市）。

In [4]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font handle built from the font file at ../assets/PingFang.ttc.
# NOTE(review): the pyecharts cells visible in this notebook select the font by
# family name ("PingFang") and do not reference this object.
PingFang = FontProperties(fname='../assets/PingFang.ttc')
# Apply matplotlib's 'ggplot' style sheet to any matplotlib figures drawn later.
plt.style.use('ggplot')

In [6]:
# Kept for reference only (commented out): these calls appear to be how the
# entry for a HEX color is fetched and appended to ../assets/Palette.json.
# from src.color_generator import fetch_colors, append_to_json_file
# hex_color = 'E5E5E5'  # HEX color code to look up
# result = fetch_colors(hex_color)  # assumed to be a function in color_generator
# append_to_json_file(result, '../assets/Palette.json')  # assumed to be a function in color_generator

palette_path = '../assets/Palette.json'
color_code = '#E5E5E5'
palette_key = 'Twisted Spot Palette'

# Load the palette file and fail loudly if it is missing or malformed.
# Without an explicit `raise`, a failure here would only surface later as a
# confusing NameError/IndexError on `main_color[0]` in the chart cells.
try:
    with open(palette_path, 'r') as file:
        data = json.load(file)
except FileNotFoundError as err:
    raise FileNotFoundError(f"找不到调色板文件：{palette_path}") from err
except json.JSONDecodeError as err:
    # JSONDecodeError is a ValueError subclass; re-raise with a readable message.
    raise ValueError("JSON文件格式不正确。") from err

# Look up the palette for the chosen base color. Both lookups must succeed,
# because the chart cells below index `main_color[0]`.
if color_code not in data:
    raise KeyError(f"调色板文件中没有颜色 {color_code} 的条目")
main_color = data[color_code].get(palette_key, [])
if not main_color:
    raise KeyError(f"颜色 {color_code} 下没有名为 {palette_key} 的调色板")
print("Main Color Palette:", main_color)

Main Color Palette: ['#00754B', '#C8FCEA', '#6A9A8B']


In [8]:
from pyecharts import options as opts
from pyecharts.charts import Bar
import pandas as pd

# Group by city and take the per-city count of the `name` column,
# then order the cities from most to fewest businesses.
yelp_business_groups = yelp_business.groupby('city').count()
yelp_business_city = yelp_business_groups['name']
yelp_business_city_sorted = yelp_business_city.sort_values(ascending=False)

# Keep the first 30 entries of the sorted counts.
top_30_cities = yelp_business_city_sorted.iloc[:30]

# Build the pyecharts bar chart one call at a time (each call configures `bar`).
bar = Bar()
bar.add_xaxis(top_30_cities.index.tolist())
bar.add_yaxis("商家数量", top_30_cities.tolist(), color=main_color[0])
bar.set_global_opts(
    title_opts=opts.TitleOpts(
        title="每个城市的商家数量（前30名）",
        subtitle="",
        title_textstyle_opts=opts.TextStyleOpts(font_family="PingFang", font_size=16),
    ),
    xaxis_opts=opts.AxisOpts(
        name="城市",
        axislabel_opts=opts.LabelOpts(rotate=45, font_family="PingFang"),
    ),
    yaxis_opts=opts.AxisOpts(
        name="商家数量",
        axislabel_opts=opts.LabelOpts(font_family="PingFang"),
    ),
    datazoom_opts=[opts.DataZoomOpts()],
)

# Write the chart to an HTML file and show it inline as the cell output.
bar.render("../images/01_每个城市的商家数量.html")
bar.render_notebook()


查看不同类型的企业及其数量

In [13]:
# Count how often each category label occurs across all businesses.
# - Rows with a missing `categories` value are skipped, so NaN is not counted
#   as a category literally named "nan".
# - Each label is stripped of surrounding whitespace (as the category/city
#   pair count in section 1.3 does), so "X" and " X" are one category.
categories_dict = {}
for raw_categories in yelp_business.categories.dropna():
    for label in str(raw_categories).split(','):
        label = label.strip()
        categories_dict[label] = categories_dict.get(label, 0) + 1

# Most frequent categories first; sort_values returns a new Series, no in-place mutation.
categories_series = pd.Series(categories_dict).sort_values(ascending=False)

# Keep the 30 most frequent categories.
top_30_categories = categories_series.iloc[:30]

# Draw the bar chart with pyecharts.
bar = (
    Bar()
    .add_xaxis(top_30_categories.index.tolist())
    .add_yaxis("数量", top_30_categories.values.tolist(), color=main_color[0])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="最受欢迎的商家类别（前30名）", subtitle="", title_textstyle_opts=opts.TextStyleOpts(font_family="PingFang", font_size=16)),
        xaxis_opts=opts.AxisOpts(name="类别", axislabel_opts=opts.LabelOpts(rotate=45, font_family="PingFang", font_size=8)),
        yaxis_opts=opts.AxisOpts(name="数量", axislabel_opts=opts.LabelOpts(font_family="PingFang")),
        datazoom_opts=[opts.DataZoomOpts()],
    )
)

# Write the chart to an HTML file and show it inline as the cell output.
bar.render("../images/02_最受欢迎的商家类别.html")
bar.render_notebook()

## 1.3. 商家类别与城市的有序对及其计数

In [12]:
# business_city maps (category, city) -> number of occurrences of that pair.
# Rows missing either `categories` or `city` are skipped, so the chart does not
# show pairs built from the placeholder text "nan".
business_city = {}
category_city_rows = yelp_business[['categories', 'city']].dropna()

# Iterate the two columns together once instead of calling .iloc per row,
# which is far slower on a large frame and yields the same values.
for raw_categories, city in zip(category_city_rows['categories'], category_city_rows['city']):
    for category in str(raw_categories).split(','):
        key = (category.strip(), city)  # strip whitespace around the category label
        business_city[key] = business_city.get(key, 0) + 1

# Convert the dict to a Series ordered from the most to the least frequent pair;
# sort_values returns a new Series, no in-place mutation.
business_city_series = pd.Series(business_city).sort_values(ascending=False)

# Keep the 30 most frequent (category, city) pairs.
top_30_business_city = business_city_series.iloc[:30]

# Draw the bar chart with pyecharts; x labels read "city - category".
bar = (
    Bar()
    .add_xaxis([f"{k[1]} - {k[0]}" for k in top_30_business_city.index])
    .add_yaxis("商家数量", top_30_business_city.values.tolist(), color=main_color[0])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="城市和商家类别组合数量（前30名）", subtitle="", title_textstyle_opts=opts.TextStyleOpts(font_family="PingFang", font_size=16)),
        xaxis_opts=opts.AxisOpts(name="城市-分类", axislabel_opts=opts.LabelOpts(rotate=45, font_family="PingFang", font_size=8)),
        yaxis_opts=opts.AxisOpts(name="商家数量", axislabel_opts=opts.LabelOpts(font_family="PingFang")),
        datazoom_opts=[opts.DataZoomOpts()],
    )
)

# Write the chart to an HTML file and show it inline as the cell output.
bar.render("../images/03_城市和商家类别组合数量.html")
bar.render_notebook()

可以看出，Yelp 上列出的多数商家是餐馆、购物场所和娱乐中心。在这份来自 Yelp 网站的数据中，商家数量最多的城市是 `Las Vegas`、`Phoenix` 和 `Scottsdale`。

# 2. 清理评论数据

## 2.1 删除不需要的列
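- `text`: 评论文本
- `votes.useful`、`votes.funny`、`votes.cool`: 评论获得的“有用”“有趣”“酷”投票数
- `date`: 评论日期

保留的列中，后续分析主要使用 `user_id`、`business_id` 和 `stars`；`review_id`、`type` 以及读入 CSV 时带入的 `Unnamed: 0` 也仍然保留在表中。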

In [16]:
# Remove the review text, the date and the three `votes.*` columns from the review table.
columns_to_drop = ['text', 'votes.useful', 'votes.cool', 'date', 'votes.funny']
yelp_cleaned = yelp.drop(columns=columns_to_drop)

# Show the first rows of the trimmed table as the cell output.
yelp_cleaned.head()

Unnamed: 0,user_id,review_id,business_id,stars,type
0,Xqd0DzHaiyRqVH3WRG7hzg,15SdjuK7DmYqUAj6rjGowg,vcNAWiLM4dR7D2nwwJ7nCA,5,review
1,H1kH6QZV7Le4zqTRNxoZow,RF6UnRTtG7tWMcrO2GEoAg,vcNAWiLM4dR7D2nwwJ7nCA,2,review
2,zvJCcrpm2yOZrxKffwGQLA,-TsVN230RCkLYKBeLsuz7A,vcNAWiLM4dR7D2nwwJ7nCA,4,review
3,KBLW4wJA_fwoWmMhiHRVOA,dNocEAyUucjT371NNND41Q,vcNAWiLM4dR7D2nwwJ7nCA,4,review
4,zvJCcrpm2yOZrxKffwGQLA,ebcN2aqmNUuYNoyvQErgnA,vcNAWiLM4dR7D2nwwJ7nCA,4,review


## 2.2 合并数据
合并业务数据`yelp_academic_dataset_business.csv`与评论数据`yelp_academic_dataset_review.csv`

将业务数据和评论数据利用 'business_id' 为主键合并。像business的 'city' 和 'categories' 这样的信息是从业务数据文件中获取的。

In [17]:
# Attach each review's city and categories by inner-joining on business_id.
business_columns = yelp_business[['city', 'categories', 'business_id']]
yelp_full = pd.merge(yelp_cleaned, business_columns, how='inner', on='business_id')

# Keep only the rows that have no missing value in any column.
df = yelp_full.dropna()

## 2.3. 获取评论数量最多的前30名业务类别和城市

In [18]:
# Review counts for the top 30 city / business-category combinations,
# computed by a helper that comes from the `src` star-import.
review_business_city = get_top_m_num_reviews_for_city_and_business(df, 30)

# Axis data: labels combine element 1 and element 0 of each index entry.
x_labels = [f"{k[1]} - {k[0]}" for k in review_business_city.index]
review_counts = review_business_city.values.tolist()

# Build the pyecharts bar chart one call at a time (each call configures `bar`).
bar = Bar()
bar.add_xaxis(x_labels)
bar.add_yaxis("评论数量", review_counts, color=main_color[0])
bar.set_global_opts(
    title_opts=opts.TitleOpts(
        title="获得评论最多的城市和商家类别组合（前30名）",
        subtitle="",
        title_textstyle_opts=opts.TextStyleOpts(font_family="PingFang", font_size=16),
    ),
    xaxis_opts=opts.AxisOpts(
        name="城市-业务类别",
        axislabel_opts=opts.LabelOpts(rotate=45, font_family="PingFang", font_size=8),
    ),
    yaxis_opts=opts.AxisOpts(
        name="评论数量",
        axislabel_opts=opts.LabelOpts(font_family="PingFang"),
    ),
    datazoom_opts=[opts.DataZoomOpts()],
)

# Write the chart to an HTML file and show it inline as the cell output.
bar.render("../images/04_获得评论最多的城市和商家类别组合.html")
bar.render_notebook()

In [None]:
# Save the merged review table (rows with missing values already dropped) as CSV;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('../data/Yelp_final.csv', index=False)