In [9]:
def web_crawling_for_mal(start_page, end_page): # Scrape data from myanimelist and save it
    """Scrape the anime pages linked from the myanimelist top-anime listing.

    Listing pages are requested with a ``limit`` offset that runs from
    ``(start_page - 1) * 50`` up to, but not including, ``end_page * 50`` in
    steps of 50. Up to 50 matching links per listing page are followed and one
    dict is built per anime page. The collected dicts are written to
    ``ani_list.json`` in the current working directory, and
    "download is ok!" is printed at the end.

    Args:
        start_page: Number of the first listing page to read (page 1 maps to
            offset 0).
        end_page: Number of the last listing page to read (inclusive).

    Raises:
        requests.RequestException: if a request times out or the server
            answers with an error status (via ``raise_for_status``).
    """
    import json
    import re
    import requests
    from bs4 import BeautifulSoup as bsp

    url = "https://myanimelist.net/topanime.php"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"}
    timeout = 30 # seconds; without a timeout a stalled connection would block the crawl forever

    def fetch(target, **kwargs): # download one page and parse it
        response = requests.get(target, headers=headers, timeout=timeout, **kwargs)
        response.raise_for_status() # fail loudly instead of parsing an error page
        return bsp(response.text, "lxml")

    ani_list = []
    for n in range((start_page - 1) * 50, end_page * 50, 50): # outer loop: one listing page per iteration
        listing = fetch(url, params = {"limit":n})
        a_tag_list = listing.find_all("a", class_ = "hoverinfo_trigger fl-l fs14 fw-b", limit = 50)
        for link in a_tag_list: # inner loop: visit each of the (up to 50) links on the page
            html = fetch(link.attrs["href"])
            ani = {}
            title_tag = html.find("title")
            title_text = title_tag.string if title_tag is not None and title_tag.string else ""
            name = re.match(r"\n(.*)\s-\s", title_text, re.S) # anime name
            # fall back to the whole title when it does not have the expected layout
            ani["Name"] = name.group(1) if name else title_text.strip()
            ani["Score"] = html.find("span", itemprop = "ratingValue").string # rating score
            ani["Users"] = html.find("span", itemprop = "ratingCount").string # number of rating users
            ani["Rank"] = html.find("span", class_ = "numbers ranked").strong.string[1:] # rank, first character dropped
            information = html.find("h2", string = "Information") # locate <h2>Information</h2>
            node = information.next_sibling if information is not None else None
            # walk the siblings until the next <h2> or until the siblings run out
            while node is not None and getattr(node, "name", None) != "h2":
                if getattr(node, "name", None) == "div": # anything that is not a <div> is skipped
                    s = list(node.stripped_strings) # every string inside the <div>
                    if len(s) > 2: # build the key-value pair
                        ani[s[0]] = s[1:]
                    elif len(s) == 2:
                        ani[s[0]] = s[1]
                    # a <div> with fewer than two strings has no key/value pair to record
                node = node.next_sibling
            ani_list.append(ani)
    with open("ani_list.json", "w") as fid: # export the scraped records as json
        fid.write(json.dumps(ani_list))
    print("download is ok!")
    
# Run the crawler.
# NOTE(review): start_page and end_page are not defined in the visible cells - confirm they are set earlier in the notebook.
web_crawling_for_mal(start_page, end_page)

download is ok!


In [21]:
def data_processing(): # Clean the scraped data and save the results
    """Clean ``ani_list.json`` and save the cleaned table plus summary dicts.

    Reads ``ani_list.json`` from the current working directory, indexes the
    records by "Rank", cleans them, and writes three files:

    * ``anime_data.json`` - the cleaned records keyed by rank,
    * ``num_list.json`` - for each analysed column, how often each value occurs,
    * ``index_list.json`` - for each multi-valued column, the ranks of the
      anime carrying each value.

    When a file already exists the new results are merged into it (records are
    updated by rank, counts are added, index lists are extended), so running
    the function twice on the same input adds the same counts twice.
    Prints "file has been saved!" when done.

    Raises:
        ValueError: if an "Aired" value matches none of the known date layouts.
    """
    # Steps marked "(*)" change the raw data; steps marked "(**)" add new columns
    import calendar
    import os
    import re
    import json
    import pandas as pd

    df = pd.read_json("ani_list.json", orient = "records") # load the json file
    df = df.set_index("Rank")

    def str_to_date(s): # convert the air date from str to a pandas timestamp
        date = re.match(r"\w+\s\d+,\s\d+", s, re.S)
        if date:
            return pd.to_datetime(date.group(), format = "%b %d, %Y")
        date = re.match(r"\w+,\s\d+", s, re.S) # handles the few dates written without a day
        if date:
            return pd.to_datetime(date.group(), format = "%b, %Y")
        raise ValueError("unrecognized air date: {!r}".format(s))

    def count_and_clean(df_col):
        # For a categorical column, count how often each value occurs and record the
        # index of every anime carrying it; also drop the "," items from list cells.
        # Returns (cleaned column, counts dict, index dict).
        dic_num = {}
        dic_index = {}
        new_col = []

        def tally(value, index): # register one occurrence of value at index
            dic_num[value] = dic_num.get(value, 0) + 1
            dic_index.setdefault(value, []).append(index)

        for index, each in df_col.items():
            if isinstance(each, str): # single value: counted and kept unchanged
                tally(each, index)
                new_col.append(each)
            elif each == ['None found,', 'add some']: # (*) missing value: replaced by "Unknown", not counted
                new_col.append("Unknown")
            else: # (*) multiple values (list): drop the "," items and count the rest
                temp = [string for string in each if string != ","]
                for string in temp:
                    tally(string, index)
                new_col.append(temp)
        return (new_col, dic_num, dic_index)

    def data_cleaning(df): # data cleaning; modifies df in place
        df.columns = [x.rstrip(":") for x in df.columns] # (*) strip the trailing ":" from column names
        # (*) drop "," from "Users" and convert to int; str() covers values that
        # read_json already parsed as numbers because they contained no comma
        df["Users"] = [int(str(i).replace(",", "")) for i in df["Users"]]

        # (**) derive time columns from the air date for later analysis
        df["Datetime"] = [str_to_date(i) for i in df["Aired"]]
        df["Year"] = [i.year for i in df["Datetime"]]
        df["Month"] = [i.month for i in df["Datetime"]]

        # (*)(**) derive a TV-type column from the episode count, converting episodes to int on the way
        df["TV_type"] = df["Type"].copy()
        for index, row in df.iterrows():
            if row["Episodes"] != "Unknown":
                df.loc[index, "Episodes"] = int(row["Episodes"]) # df is updated here, row still holds the old value
            if row["Type"] == "TV":
                if row["Episodes"] == "Unknown":
                    df.loc[index, "TV_type"] = "serials"
                else:
                    i = round(int(row["Episodes"]) / 12.0) # row may still hold a str, so convert again
                    if i > 4:
                        df.loc[index, "TV_type"] = "serials"
                    elif i < 1:
                        df.loc[index, "TV_type"] = "1 season"
                    else:
                        df.loc[index, "TV_type"] = "{} season".format(i)

        # (*) drop "," from list cells and collect the count / index dictionaries
        col_list = ["Producers", "Licensors", "Studios", "Genres"]
        num_list = {}
        index_list = {}
        for name in col_list:
            df[name], dic_num, dic_index = count_and_clean(df[name])
            num_list[name] = dic_num
            index_list[name] = dic_index
        other_list = ["Source", "Rating", "Year", "TV_type"]
        for name in other_list:
            new_dict = df[name].value_counts().to_dict()
            if "Unknown" in new_dict:
                del new_dict["Unknown"]
            num_list[name] = new_dict
        return num_list, index_list

    def merge_nested(stored, fresh):
        # Fold fresh ({column: {value: count-or-index-list}}) into stored, which was loaded from json
        for column, values in fresh.items():
            bucket = stored.setdefault(column, {})
            for key, value in values.items():
                key = str(key) # json object keys are always str; int keys (years) must match them
                if key in bucket:
                    bucket[key] += value
                else:
                    bucket[key] = value
        return stored

    def save_merged(path, fresh):
        # Write fresh to path, merging into the existing content if the file is already there
        if os.path.exists(path): # append to an existing file
            with open(path, "r") as fid:
                fresh = merge_nested(json.load(fid), fresh)
        payload = json.dumps(fresh) # serialize first so a failure cannot leave a truncated file
        with open(path, "w") as fid:
            fid.write(payload)

    num_list, index_list = data_cleaning(df)
    # json cannot store timestamps, so keep whole seconds since the epoch instead.
    # calendar.timegm reads the naive date as UTC, so the stored value does not
    # depend on the timezone of the machine running the notebook.
    df["Datetime"] = [calendar.timegm(t.timetuple()) for t in df["Datetime"]]

    records_json = df.to_json(orient = "index")
    if os.path.exists("anime_data.json"): # append: records with the same rank are replaced
        with open("anime_data.json", "r") as fid:
            main_dict = json.load(fid)
        main_dict.update(json.loads(records_json))
        records_json = json.dumps(main_dict)
    with open("anime_data.json", "w") as fid:
        fid.write(records_json)
    save_merged("num_list.json", num_list)
    save_merged("index_list.json", index_list)
    print("file has been saved!")

# Clean ani_list.json and write anime_data.json, num_list.json and index_list.json.
data_processing()

file has been saved!


In [None]:
def boxplot_class(df_col): # Draw a box plot for a numeric column
    """Show a box plot of the values in ``df_col``.

    String items are converted with ``float`` so that decimal text such as
    "9.23" is accepted; all other items are passed through unchanged.

    NOTE(review): relies on ``plt`` being available in the notebook namespace;
    it is not imported in the visible cells - confirm.

    Args:
        df_col: Iterable of numbers and/or numeric strings.
    """
    y = [float(i) if isinstance(i, str) else i for i in df_col]
    fig, ax = plt.subplots()
    ax.boxplot(y)
    plt.show()
#boxplot_class(df["Score"])

def bar_class(dic_t): # Bar chart of how often each value of a categorical column occurs
    """Show a bar chart of the (at most) ten most frequent values in ``dic_t``.

    NOTE(review): relies on ``plt`` being available in the notebook namespace;
    it is not imported in the visible cells - confirm.

    Args:
        dic_t: Mapping from attribute value to its number of occurrences.
    """
    ranked = sorted(dic_t.items(), key = lambda d: d[1], reverse = True) # sort entries by count, largest first
    top = ranked[:10] # only the top ten; slicing also covers shorter inputs
    # Labels are turned into str so the axis is categorical and the bars stay in
    # frequency order even when the keys are numbers (e.g. years)
    x = [str(label) for label, _ in top]
    y = [count for _, count in top]
    fig, ax = plt.subplots()
    ax.bar(x, y)
    # rotate the labels without replacing them, so they cannot get out of step with the ticks
    ax.tick_params(axis = "x", labelrotation = 90)
    ax.set_ylabel("Anime number")
    plt.show()

def plot_date_score(): # Plot score and anime count against air time
    """Show two stacked panels sharing the x axis.

    The upper panel is a scatter plot of ``df["Score"]`` against
    ``df["Datetime"]``; the lower panel is an 80-bin histogram of
    ``df["Datetime"]``.

    NOTE(review): reads ``df`` and ``plt`` from the notebook namespace; neither
    is defined at module level in the visible cells - confirm.
    """
    aired = df["Datetime"]
    scores = df["Score"]
    fig, (score_ax, count_ax) = plt.subplots(2, 1, sharex = True)

    # upper panel: one dot per anime
    score_ax.scatter(aired, scores, marker = ".")
    score_ax.set_ylabel("Ranking score")

    # lower panel: number of anime per time bin
    count_ax.hist(aired, bins = 80)
    count_ax.set_xlabel("Aired time")
    count_ax.set_ylabel("Anime number")

    plt.show()
#plot_date_score()

def pie_class(dic_t, total = 600, min_count = 20): # Pie chart of value shares for a single-valued categorical column
    """Show a pie chart of the values in ``dic_t``, largest first.

    Values are added as wedges in descending order of count until the first
    one whose count is below ``min_count``. From that point on everything is
    lumped into one "others" wedge whose size is ``total`` minus the counts
    already drawn. If every value reaches ``min_count`` no "others" wedge is
    added.

    NOTE(review): relies on ``plt`` being available in the notebook namespace;
    it is not imported in the visible cells - confirm.

    Args:
        dic_t: Mapping from attribute value to its number of occurrences.
        total: Total number of anime the shares refer to (default 600).
        min_count: Smallest count that still gets its own wedge (default 20).
    """
    sort_list = sorted(dic_t.items(), key = lambda d: d[1], reverse = True)
    remaining = total # part of the total not yet assigned to a wedge
    y = []
    labels = []
    truncated = False
    for label, count in sort_list:
        if count < min_count: # this and all smaller values go into "others"
            truncated = True
            break
        y.append(count)
        labels.append(label)
        remaining -= count
    if truncated and remaining > 0: # a zero or negative wedge cannot be drawn meaningfully
        y.append(remaining)
        labels.append("others")
    fig, ax = plt.subplots()
    ax.pie(y, labels = labels, labeldistance = 1.1, autopct = "%.1f%%", pctdistance = 0.8)
    plt.show()
#pie_class(rat_dict)

def acc_bar_type_year(): # Stacked bar chart of anime type per year
    """Show a stacked bar chart of ``TV_type`` counts for the years 2000-2019.

    One bar segment is drawn per year for each of the types "1 season",
    "2 season", "3 season", "4 season" and "serials"; segments are stacked in
    that order. Rows of ``df`` with a "Year" outside 2000-2019 are ignored.

    NOTE(review): reads ``df``, ``pd`` and ``plt`` from the notebook namespace;
    none is defined at module level in the visible cells - confirm.
    """
    fig, ax = plt.subplots()
    years = list(range(2000, 2020))
    x = [str(i) for i in years]
    bottom = pd.Series(0, index = years) # running height of each stack
    df_type = df[df["Year"] >= 2000]
    labels = ["1 season", "2 season", "3 season", "4 season", "serials"]
    for t in labels:
        counts = df_type.loc[df_type["TV_type"] == t, "Year"].value_counts()
        # align to the plotted years: missing years become 0 and years past
        # the range are dropped, so the series always matches the x axis
        counts = counts.reindex(years, fill_value = 0)
        ax.bar(x, counts.values, bottom = bottom.values)
        bottom = bottom + counts
    ax.tick_params(axis = "x", labelrotation = 90)
    ax.set_ylim(0, 30)
    ax.set_ylabel('Anime number')
    ax.legend(labels)
    plt.show()
#acc_bar_type_year()