In [3]:
import csv
import sys
import os.path
import json
import matplotlib
matplotlib.use('TkAgg')
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

In [4]:
def datajsons():
    """Generator yielding the parsed profile JSON of every crawled user.

    Data files are looked up in the ``datafile`` folder located in the parent
    directory of ``sys.path[0]`` (the current working directory is used in
    place of ``sys.path[0]`` when that entry is the empty string).  Only
    ``.csv`` files whose header row is exactly
    ``user_url_token, user_data_json, user_following_list`` are read, in
    sorted path order.

    Yields:
        The result of ``json.loads`` applied to the ``user_data_json`` column
        of each row.  Nothing is yielded when the data folder does not exist.

    Raises:
        ValueError: if a ``user_data_json`` cell is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    path = sys.path[0]
    if path == '':
        path = os.getcwd()
    # The ``datafile`` folder one level above ``path``.
    FILEPATH = os.path.join(os.path.dirname(path), 'datafile')
    SUFFIX = '.csv'
    TABLEHEADER = ['user_url_token', 'user_data_json', 'user_following_list']

    # No data folder: the generator simply finishes without yielding.
    if not os.path.isdir(FILEPATH):
        return

    # Collect the full path of every csv file that carries the expected header.
    csvfilelist = []
    for filename in os.listdir(FILEPATH):
        if os.path.splitext(filename)[1] != SUFFIX:
            continue
        # Join exactly once; joining FILEPATH onto an already-joined relative
        # path would produce a non-existent "datafile/datafile/..." name.
        fullname = os.path.join(FILEPATH, filename)
        if not os.path.isfile(fullname):
            continue
        # newline='' is what the csv module asks for so that quoted fields
        # containing line breaks are parsed correctly.
        with open(fullname, 'r', encoding='utf-8', newline='') as csvfile:
            if csv.DictReader(csvfile).fieldnames == TABLEHEADER:
                csvfilelist.append(fullname)
    csvfilelist.sort()

    # Stream the rows of each accepted file and hand back the decoded profile.
    for fullname in csvfilelist:
        with open(fullname, 'r', encoding='utf-8', newline='') as csvfile:
            for row in csv.DictReader(csvfile):
                yield json.loads(row[TABLEHEADER[1]])

In [3]:
# Walk over every crawled user and collect the counters needed for plotting.
voteupCountList = list()  # upvotes received
thankedCountList = list()  # thanks received
followingCountList = list()  # number of users this user follows
followerCountList = list()  # number of users following this user
favoriteCountList = list()  # favorites made
favoritedCountList = list()  # times favorited by others
answerCountList = list()  # answers written
articlesCountList = list()  # articles written
questionCountList = list()  # questions asked
followingColumnsCountList = list()  # columns followed
followingFavlistsCountList = list()  # favorite lists followed
followingTopicCountList = list()  # topics followed
followingQuestionCountList = list()  # questions followed

# (JSON key, destination list) pairs, kept in one place so that a record is
# either appended to every list or to none of them.
countTargets = (
    ('voteupCount', voteupCountList),
    ('thankedCount', thankedCountList),
    ('followingCount', followingCountList),
    ('followerCount', followerCountList),
    ('favoriteCount', favoriteCountList),
    ('favoritedCount', favoritedCountList),
    ('answerCount', answerCountList),
    ('articlesCount', articlesCountList),
    ('questionCount', questionCountList),
    ('followingColumnsCount', followingColumnsCountList),
    ('followingFavlistsCount', followingFavlistsCountList),
    ('followingTopicCount', followingTopicCountList),
    ('followingQuestionCount', followingQuestionCountList),
)
countKeys = [key for key, countList in countTargets]
skippedUserCount = 0  # records dropped because a counter was missing

jsons = datajsons()
for user in jsons:
    try:
        # Read every counter before touching any list: a record missing a late
        # key must not leave the earlier lists one element longer.
        counts = [user[key] for key in countKeys]
    except (KeyError, TypeError):
        # KeyError: counter absent; TypeError: record is not a mapping
        # (e.g. JSON null).  Anything else is a real error and propagates.
        skippedUserCount += 1
        continue
    for (key, countList), count in zip(countTargets, counts):
        countList.append(count)

In [44]:
# Colour palette shared by the figures in this notebook (hex RGB strings).
# Neutral tones.
black, gray = '#212121', '#727272'
# Reds, darker first.
red1, red2 = '#D32F2F', '#F44336'
# Oranges, from the most saturated down to the palest.
orange1, orange2, orange3, orange4 = '#FF9500', '#FFb44A', '#ffd191', '#FFF1DE'

In [1]:
# Histograms of upvotes received by the 720k Zhihu users, one panel per
# magnitude range; thanks received are stacked on top in black.
fig, axes = plt.subplots(3,2)
fig.set_size_inches(18,10)
fig.suptitle('72万知乎用户获得赞同数分布直方图', fontsize=16, color=red1)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)

# Flatten the 2-D grid of axes row by row.
subplotlist = list(axes.flat)

# Upvote range [low, high) covered by each panel.
edge = [[0,100],
       [100,1000],
       [1000,10000],
       [10000,100000],
       [100000,1000000],
       [1000000,4000000]
       ]
# Bin width used in each panel.
binslist = [1, 5, 50, 500, 5000, 50000]

# Convert once so every panel can filter with a boolean mask.
voteupAll = np.asarray(voteupCountList)
thankedAll = np.asarray(thankedCountList)

for panel, (low, high), width in zip(subplotlist, edge, binslist):
    voteupCountArray = voteupAll[(voteupAll >= low) & (voteupAll < high)]
    thankedCountArray = thankedAll[(thankedAll >= low) & (thankedAll < high)]
    # range= pins the bin edges to [low, high]; without it the edges follow the
    # data min/max and the bin width no longer matches the title.
    # The old normed=0 argument is dropped: un-normalised counts are the
    # default and the keyword was removed in matplotlib 3.1.
    panel.hist([voteupCountArray, thankedCountArray], histtype='barstacked',
               bins=(high - low) // width, range=(low, high),
               color=[red1, 'k'], alpha=0.5)
    panel.set_xlim(low, high)
    panel.set_title('%d-%d赞用户分布（组距：%d）'%(low, high, width), color=red1)
    panel.set_xlabel('获得赞同（次）',color=red1)
    panel.set_ylabel('用户数量（人）',color=red1)
    panel.set_facecolor(orange4)
    panel.grid(True, linestyle='--')

# Fine-tuning: cap the y axis of the first panel.
axes[0,0].set_ylim(0,300000)
fig.show()

NameError: name 'plt' is not defined

In [51]:
"""
==============================
Create 3D histogram of 2D data
==============================

Demo of a histogram for 2 dimensional data as a bar graph in 3D.
"""

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Seeded generator so that re-running the cell reproduces the same figure.
rng = np.random.RandomState(0)
x, y = rng.rand(2, 100) * 4

hist, xedges, yedges = np.histogram2d(x, y, bins=4, range=[[0, 4], [0, 4]])

# Bar footprint, and the margin that centres a bar inside its bin so that
# neighbouring bars do not overlap.
barwidth = 0.9
xmargin = ((xedges[1] - xedges[0]) - barwidth) / 2
ymargin = ((yedges[1] - yedges[0]) - barwidth) / 2

# Anchor positions of the 16 bars.  indexing='ij' makes the grids follow the
# (nx, ny) layout of `hist`, so a plain row-major flatten keeps xpos, ypos and
# dz aligned element by element.
xpos, ypos = np.meshgrid(xedges[:-1] + xmargin, yedges[:-1] + ymargin, indexing='ij')
xpos = xpos.flatten()
ypos = ypos.flatten()
zpos = np.zeros_like(xpos)
# Dimensions of the 16 bars.
dx = barwidth * np.ones_like(zpos)
dy = dx.copy()
dz = hist.flatten()

ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color='b', alpha=0.2)

plt.show()

KeyboardInterrupt: 

In [None]:
# Bar heights: the 2-D histogram counts as a flat array.  flatten must be
# called; without the parentheses dz would be bound to the method object.
dz = hist.flatten()