In [4]:
import csv
import sys
import os.path
import json
import numpy as np
import matplotlib.pyplot as plt
# Use the SimHei font so the Chinese titles/labels render instead of empty boxes.
# Accessed through `plt` because only `matplotlib.pyplot` is imported above; the
# bare name `matplotlib` is not defined on a fresh kernel at this point.
plt.rcParams['font.sans-serif'] = ['SimHei']
%pylab

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Base directory: first entry of sys.path, or the current working directory
# when that entry is the empty string.
path = sys.path[0] or os.getcwd()

# Data lives in the "datafile" folder located one level above `path`.
FILEPATH = os.path.join(os.path.dirname(path), 'datafile')

# Extension of the data files and the exact header row each one must have.
SUFFIX = '.csv'
TABLEHEADER = [
    'user_url_token',
    'user_data_json',
    'user_following_list',
]

def datajsons():
    """Generator over the parsed ``user_data_json`` value of every stored row.

    Looks in ``FILEPATH`` for regular files whose extension equals ``SUFFIX``
    and whose CSV header row equals ``TABLEHEADER``.  Those files are then
    read in sorted path order and, for each data row, the ``user_data_json``
    column (``TABLEHEADER[1]``) is decoded with ``json.loads`` and yielded.

    Yields:
        The decoded JSON value of each row's ``user_data_json`` cell.

    Raises:
        ValueError: (``json.JSONDecodeError``) if a cell is not valid JSON.

    If ``FILEPATH`` is not an existing directory the generator yields nothing.
    """
    # No data directory -> nothing to iterate over.
    if not os.path.isdir(FILEPATH):
        return

    # Collect the full paths of all CSV files that carry the expected header.
    csvfilelist = []
    for entry in os.listdir(FILEPATH):
        filename = os.path.join(FILEPATH, entry)
        # Skip sub-directories and files with a different extension.
        if not os.path.isfile(filename):
            continue
        if os.path.splitext(filename)[1] != SUFFIX:
            continue
        # newline='' is what the csv module expects so that it can handle
        # line endings (including ones inside quoted fields) by itself.
        with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
            if csv.DictReader(csvfile).fieldnames == TABLEHEADER:
                csvfilelist.append(filename)
    csvfilelist.sort()

    # Walk the matching files in order and yield one decoded record per row.
    for filename in csvfilelist:
        with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                yield json.loads(row[TABLEHEADER[1]])

In [3]:
# 
# Collect the 'voteupCount' field of every user record produced by datajsons().
voteupCountList = list()
jsons = datajsons()
for user in jsons:
    try:
        voteupCountList.append(user['voteupCount'])
    except (KeyError, TypeError):
        # The record has no 'voteupCount' key (KeyError) or is not something
        # that can be indexed by a string key (TypeError): skip it.  Anything
        # else, e.g. KeyboardInterrupt, is deliberately allowed to propagate.
        continue

In [69]:
# Colour palette (hex RGB strings) for the figures in this notebook.

# Neutral tones.
black, gray = '#212121', '#727272'

# Reds, darker first.
red1, red2 = '#D32F2F', '#F44336'

# Oranges, from the most saturated (orange1) to the palest (orange4).
orange1, orange2, orange3, orange4 = '#FF9500', '#FFb44A', '#ffd191', '#FFF1DE'

In [118]:
def _plot_voteup_hist(ax, votes, lower, upper, width, title, include_lower=False):
    """Draw one histogram panel of upvote counts on ``ax``.

    Args:
        ax: matplotlib Axes to draw on.
        votes: 1-D numpy array with every user's upvote count.
        lower, upper: the panel shows counts in ``(lower, upper]``
            (``[lower, upper]`` when ``include_lower`` is true).
        width: bin width; bin edges are ``lower, lower + width, ..., upper``.
        title: panel title.
        include_lower: whether counts equal to ``lower`` belong to this panel.
    """
    above = votes >= lower if include_lower else votes > lower
    selected = votes[above & (votes <= upper)]
    # Explicit edges guarantee the bin width announced in the title; a plain
    # bin count would instead split the [min, max] range of the data.
    edges = np.arange(lower, upper + width, width)
    ax.hist(selected, bins=edges, color=red2, alpha=0.7)
    ax.set_xlim(lower, upper)
    ax.set_title(title, color=red1)


# One entry per panel, in row-major order of the 3x2 grid:
# (lower bound, upper bound, bin width, title).
_PANELS = [
    (0, 100, 1, '100赞同以下用户分布（组距：1）'),
    (100, 1000, 5, '100-1000赞同用户分布（组距：5）'),
    (1000, 10000, 50, '1000-10000赞同用户分布（组距：50）'),
    (10000, 100000, 500, '1万-10万赞同用户分布（组距：500）'),
    (100000, 1000000, 5000, '10万-100万赞同用户分布（组距：5000）'),
    # Counts above 4,000,000 fall outside the x-range shown for this panel.
    (1000000, 4000000, 50000, '100万以上赞同用户分布（组距：50000）'),
]

fig, axes = plt.subplots(3, 2)
fig.set_size_inches(16, 9)
fig.suptitle('72万知乎用户获得赞同数分布直方图', fontsize=16, color=red1)
# Called on the figure itself so the cell does not rely on names injected by %pylab.
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

votes = np.asarray(voteupCountList)

for index, (ax, (lower, upper, width, title)) in enumerate(zip(axes.flat, _PANELS)):
    # Only the first panel includes its lower bound, so users with 0 upvotes are counted.
    _plot_voteup_hist(ax, votes, lower, upper, width, title, include_lower=(index == 0))

# The first panel uses a fixed y-range.
axes[0, 0].set_ylim(0, 50000)

# Shared cosmetics for every panel.
for ax in axes.flat:
    ax.set_xlabel('获得赞同数（次）', color=red1)
    ax.set_ylabel('用户数量（人）', color=red1)
    ax.set_facecolor(orange4)
    ax.grid(True, linestyle='--')