### 初始化(导入Spark)

In [1]:
# 显示cell运行时长
%load_ext klab-autotime

In [2]:
import findspark
findspark.init() #找到并激活Spark

import pyspark
from pyspark.sql import SparkSession #SparkSession

from pyspark import SparkContext  # SparkContext class
# getOrCreate() returns the already-running context when this cell is executed
# again; a bare SparkContext() raises
# "ValueError: Cannot run multiple SparkContexts at once" on a re-run.
# No master is passed here, so it comes from the environment configuration.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

from pyspark.sql.functions import when,udf
from pyspark.sql.types import StringType, DateType, IntegerType

import numpy as np
import pandas as pd
import os,psutil 
from pyspark.sql.functions import *

time: 3.91 s


In [3]:
# Report the Spark version, the Python version seen by Spark, and the master URL.
print("Spark版本:", sc.version, "\nPython版本:", sc.pythonVer, "\nmaster", sc.master)

Spark版本: 2.4.2 
Python版本: 3.6 
master local[*]
time: 2.48 ms


### 读取数据集

In [4]:
# Load the ticket CSV: the first line is consumed as the header and the
# column types are inferred from the data.
ticket_reader = spark.read.option("header", True).option("inferSchema", True)
df = ticket_reader.csv("/home/kesci/input/parallel_zwl015100/ticket.csv")

time: 1min 6s


In [5]:
# Preview the first ten raw rows.
# NOTE(review): the values in _c4 look like personal ID numbers; consider
# dropping or masking that column before sharing the rendered output.
df.show(10)

+----+--------+------+---+------------------+----------+----+---+
| _c0|     _c1|   _c2|_c3|               _c4|       _c5| _c6|_c7|
+----+--------+------+---+------------------+----------+----+---+
|null|    null|  null|  9|              null|      null|null|  9|
|null|    null|  null|  9|              null|      null|null|  9|
|8009|20200222|110000|  1|140225199002140040|大同汽车站|浑源|  1|
|8009|20200222|110000|  1|140225199002140040|大同汽车站|浑源|  1|
|8006|20200224| 92000|  1|140225198309220016|大同汽车站|浑源|  1|
|8006|20200224| 92000|  1|140225198309220016|大同汽车站|浑源|  1|
|8010|20200225|104000|  1|140225199206064318|大同汽车站|浑源|  1|
|8010|20200225|104000|  1|140225199306034319|大同汽车站|浑源|  1|
|8010|20200225|104000|  1|140225199206064318|大同汽车站|浑源|  1|
|8010|20200225|104000|  1|140225199306034319|大同汽车站|浑源|  1|
+----+--------+------+---+------------------+----------+----+---+
only showing top 10 rows

time: 252 ms


In [6]:
# Tally tickets per departure station (column _c5).
SSN = df.select(df['_c5'])
group = SSN.groupBy('_c5').count()
group.show(n=10)  # preview ten stations with their ticket counts
group.count()  # number of distinct _c5 values (4797 in the recorded run)

+------------------------+------+
|                     _c5| count|
+------------------------+------+
|            太原汽车总站| 10646|
|                  锦州站|221000|
|      白城市中心客运总站| 20257|
|          松原公路客运站| 17082|
|               231123002|     8|
|                万达广场|    91|
|        建州汽车城公交站|     1|
|                    黄石| 37072|
|              芷江汽车站| 10660|
|北海市西区汽车客运服务站|  9184|
+------------------------+------+
only showing top 10 rows



4797

time: 2min 7s


### 数据整体预处理

In [7]:
# Keep only the departure-date column (_c1) under the name `departdate`.
# Earlier drafts of this cell (de-duplicating rows, dropping _c3/_c4 and
# renaming the remaining columns to schedulecode, departdate, Departtime,
# startstationname, Reachstationname, seattype) are not applied.
df1 = df.select(df['_c1'].alias('departdate'))
# Discard the 'N' placeholder; rows whose departdate is NULL are dropped too,
# because the comparison evaluates to NULL rather than true for them.
df1 = df1.where(df1['departdate'] != 'N')
# Preview the first ten rows.
df1.show(n=10)

+----------+
|departdate|
+----------+
|  20200222|
|  20200222|
|  20200224|
|  20200224|
|  20200225|
|  20200225|
|  20200225|
|  20200225|
|  20200226|
|  20200226|
+----------+
only showing top 10 rows

time: 251 ms


In [8]:
# Tally tickets per raw departdate value.
GRD = df1
group2 = GRD.groupBy('departdate').count()
group2.show(n=10)  # preview ten values with their counts
group2.count()  # number of distinct departdate values (27361 in the recorded run)

+----------+-------+
|departdate|  count|
+----------+-------+
|  20200413|1372752|
|  20200218|  21452|
|    刘文东|      1|
|    田思雨|      1|
|    王乐天|      1|
|    丁国军|      1|
|    林淑强|      1|
|2020-02-26|   2076|
|      邢瑞|      1|
|    郑儒喆|      1|
+----------+-------+
only showing top 10 rows



27361

time: 2min


### 日期数据预处理

In [9]:
#规范日期数据
#将异常值补全为八位年月日的格式：
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import lit

def fix(str):
    """Normalise a raw departure-date string to exactly eight characters.

    Values shorter than eight characters are right-padded with ``'0'``,
    values longer than eight characters are replaced by the sentinel ``'x'``,
    and eight-character values are returned unchanged.  A missing value
    (``None``, i.e. a SQL NULL reaching the UDF) is mapped to ``'x'`` as well
    instead of raising ``TypeError`` from ``len(None)``.

    Args:
        str: Raw date text, or ``None``.  The name shadows the builtin and is
            kept only so existing callers keep working.

    Returns:
        An eight-character string, or ``'x'`` for over-long or missing input.
    """
    if str is None:
        return 'x'
    if len(str) > 8:
        return 'x'
    # ljust is a no-op for values that are already eight characters long.
    return str.ljust(8, '0')

# Wrap fix() as a string-returning Spark UDF and store its result in date_new.
udffix = F.udf(fix, StringType())
df1 = df1.withColumn('date_new', udffix(df1['departdate']))
df1.show(n=10)

+----------+--------+
|departdate|date_new|
+----------+--------+
|  20200222|20200222|
|  20200222|20200222|
|  20200224|20200224|
|  20200224|20200224|
|  20200225|20200225|
|  20200225|20200225|
|  20200225|20200225|
|  20200225|20200225|
|  20200226|20200226|
|  20200226|20200226|
+----------+--------+
only showing top 10 rows

time: 952 ms


In [10]:
# Tally tickets per padded date value (date_new).
GRD2 = df1
group3 = GRD2.groupBy('date_new').count()
group3.show(n=10)  # preview ten values with their counts
group3.count()  # number of distinct date_new values (26990 in the recorded run)

+-----------+-------+
|   date_new|  count|
+-----------+-------+
|   20200413|1372752|
|   20200218|  21452|
|   21300000|    359|
|王海华00000|      1|
|兰长宾00000|      1|
|于绿林00000|      2|
| 王宁000000|      7|
|黄燕芬00000|      1|
| 徐芳000000|      2|
|吴晓红00000|      1|
+-----------+-------+
only showing top 10 rows



26990

time: 5min 54s


In [11]:
#删除非数字形式
#df1 = df1.filter(isdigit(df1.data_new))
#df1 = df1.filter((df1.date_new).isdigit())

#将非数字格式删除：
def dele(str):
    """Replace any value that is not a plain decimal number with ``'-1'``.

    ``str.isdecimal()`` is used instead of ``str.isdigit()``: ``isdigit()``
    also accepts characters such as superscript digits (``'²'``) that the
    later ``int(...)`` conversions reject with ``ValueError``.  ``None`` and
    the empty string are mapped to the sentinel instead of raising.

    Args:
        str: Candidate date text, or ``None``.  The name shadows the builtin
            and is kept only so existing callers keep working.

    Returns:
        The input unchanged when it consists solely of decimal digits,
        otherwise the sentinel ``'-1'``.
    """
    if str is None or not str.isdecimal():
        return '-1'
    return str

# Wrap dele() as a string-returning Spark UDF and store its result in date_new2.
udfdele = F.udf(dele, StringType())
df1 = df1.withColumn('date_new2', udfdele(df1['date_new']))
df1.show(n=10)

+----------+--------+---------+
|departdate|date_new|date_new2|
+----------+--------+---------+
|  20200222|20200222| 20200222|
|  20200222|20200222| 20200222|
|  20200224|20200224| 20200224|
|  20200224|20200224| 20200224|
|  20200225|20200225| 20200225|
|  20200225|20200225| 20200225|
|  20200225|20200225| 20200225|
|  20200225|20200225| 20200225|
|  20200226|20200226| 20200226|
|  20200226|20200226| 20200226|
+----------+--------+---------+
only showing top 10 rows

time: 203 ms


In [12]:
# Drop the rows that dele() replaced with the '-1' sentinel (non-numeric dates).
df1 = df1.filter(df1.date_new2 != '-1')

time: 71.1 ms


In [13]:
# Tally tickets per cleaned date value (date_new2).
GRD3 = df1
group4 = GRD3.groupBy('date_new2').count()
group4.show(n=10)  # preview ten values with their counts
group4.count()  # number of distinct date_new2 values (591 in the recorded run)

+---------+-------+
|date_new2|  count|
+---------+-------+
| 20200413|1372752|
| 20200218|  21452|
| 21300000|    359|
| 20200314| 662941|
| 54354300|      1|
| 20200311| 516890|
| 20200902|   1230|
| 36000000|      1|
| 15100000|     25|
| 20200202|  23252|
+---------+-------+
only showing top 10 rows



591

time: 14min 22s


### 年份分离和统计

In [14]:
#切分日期数据
#添加新列年
def splityear(x):
    """Return the first four characters of ``x`` converted to ``int``."""
    year_digits = x[:4]
    return int(year_digits)
# The UDF is declared as StringType while splityear() returns an int; convert
# explicitly so the Python return value matches the declared type instead of
# relying on Spark to coerce it.  The resulting column values are unchanged.
splityear_udf_str = udf(lambda z: str(splityear(z)), StringType())
df1 = df1.withColumn('year', splityear_udf_str(df1['date_new2']))
df1.show(5)

+----------+--------+---------+----+
|departdate|date_new|date_new2|year|
+----------+--------+---------+----+
|  20200222|20200222| 20200222|2020|
|  20200222|20200222| 20200222|2020|
|  20200224|20200224| 20200224|2020|
|  20200224|20200224| 20200224|2020|
|  20200225|20200225| 20200225|2020|
+----------+--------+---------+----+
only showing top 5 rows

time: 280 ms


In [15]:
# Tally tickets per date_new2 once more, now that df1 also carries `year`.
# This is the same aggregation as the GRD3/group4 cell, stored under new names.
GRD4 = df1
group5 = GRD4.groupBy('date_new2').count()
group5.show(n=10)  # preview ten values with their counts
group5.count()  # number of distinct date_new2 values (591 in the recorded run)

+---------+-------+
|date_new2|  count|
+---------+-------+
| 20200413|1372752|
| 20200218|  21452|
| 21300000|    359|
| 20200314| 662941|
| 54354300|      1|
| 20200311| 516890|
| 20200902|   1230|
| 36000000|      1|
| 15100000|     25|
| 20200202|  23252|
+---------+-------+
only showing top 10 rows



591

time: 14min 21s


In [17]:
# Keep only the tickets whose year is 2020 or 2021.
df2 = df1.where((df1['year'] >= 2020) & (df1['year'] <= 2021))

time: 397 ms


In [18]:
# Tally tickets per date after restricting the data to 2020-2021.
GRD5 = df2
group6 = GRD5.groupBy('date_new2').count()
group6.show(n=10)  # preview ten dates with their counts
group6.count()  # number of distinct dates left (284 in the recorded run)

+---------+-------+
|date_new2|  count|
+---------+-------+
| 20200413|1372752|
| 20200218|  21452|
| 20200314| 662941|
| 20200311| 516890|
| 20200902|   1230|
| 20200202|  23252|
| 20210501|   3867|
| 20200509| 942379|
| 20200608| 643704|
| 20200217|  18085|
+---------+-------+
only showing top 10 rows



284

time: 26min 22s


In [19]:
# Tally tickets per year; GRD5 and group6 from the previous cell are overwritten.
GRD5 = df2.select(df2['year'])
group6 = GRD5.groupBy('year').count()
group6.show(n=10)  # at most ten years with their counts
group6.count()  # number of distinct years (2 in the recorded run)

+----+--------+
|year|   count|
+----+--------+
|2020|84570125|
|2021|14491155|
+----+--------+



2

time: 21min 56s


### 月份分离和统计

In [20]:
#添加新列月份
def splitmonth(x):
    """Return characters 5-6 of ``x`` (the month digits) converted to ``int``."""
    month_digits = x[4:6]
    return int(month_digits)
# The UDF is declared as StringType while splitmonth() returns an int; convert
# explicitly so the Python return value matches the declared type instead of
# relying on Spark to coerce it.  The resulting column values are unchanged.
splitmonth_udf_str = udf(lambda z: str(splitmonth(z)), StringType())
df2 = df2.withColumn('month', splitmonth_udf_str(df2['date_new2']))
df2.show(5)

+----------+--------+---------+----+-----+
|departdate|date_new|date_new2|year|month|
+----------+--------+---------+----+-----+
|  20200222|20200222| 20200222|2020|    2|
|  20200222|20200222| 20200222|2020|    2|
|  20200224|20200224| 20200224|2020|    2|
|  20200224|20200224| 20200224|2020|    2|
|  20200225|20200225| 20200225|2020|    2|
+----------+--------+---------+----+-----+
only showing top 5 rows

time: 397 ms


In [26]:
# Preview df2 with show()'s default row limit.
df2.show()

+----------+--------+---------+----+-----+
|departdate|date_new|date_new2|year|month|
+----------+--------+---------+----+-----+
|  20200222|20200222| 20200222|2020|    2|
|  20200222|20200222| 20200222|2020|    2|
|  20200224|20200224| 20200224|2020|    2|
|  20200224|20200224| 20200224|2020|    2|
|  20200225|20200225| 20200225|2020|    2|
|  20200225|20200225| 20200225|2020|    2|
|  20200225|20200225| 20200225|2020|    2|
|  20200225|20200225| 20200225|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200226|20200226| 20200226|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
|  20200227|20200227| 20200227|2020|    2|
+----------

In [27]:
# Tally tickets per month.
GRD6 = df2.select(df2['month'])
group7 = GRD6.groupBy('month').count()
group7.show()  # every month present with its count
group7.count()  # number of distinct months (9 in the recorded run; flagged as unexpected by the author)

+-----+--------+
|month|   count|
+-----+--------+
|    7|      21|
|    3|28561809|
|    8|  264480|
|    5|21581559|
|    6| 7123079|
|    9|   13770|
|    1|  883254|
|    4|39238298|
|    2| 1395010|
+-----+--------+



9

time: 21min 35s


In [28]:
#添加新列天
def splitday(x):
    """Return characters 7-8 of ``x`` (the day digits) converted to ``int``."""
    day_digits = x[6:8]
    return int(day_digits)
# The UDF is declared as StringType while splitday() returns an int; convert
# explicitly so the Python return value matches the declared type instead of
# relying on Spark to coerce it.  The resulting column values are unchanged.
splitday_udf_str = udf(lambda z: str(splitday(z)), StringType())
df2 = df2.withColumn('day', splitday_udf_str(df2['date_new2']))
df2.show(5)

+----------+--------+---------+----+-----+---+
|departdate|date_new|date_new2|year|month|day|
+----------+--------+---------+----+-----+---+
|  20200222|20200222| 20200222|2020|    2| 22|
|  20200222|20200222| 20200222|2020|    2| 22|
|  20200224|20200224| 20200224|2020|    2| 24|
|  20200224|20200224| 20200224|2020|    2| 24|
|  20200225|20200225| 20200225|2020|    2| 25|
+----------+--------+---------+----+-----+---+
only showing top 5 rows

time: 325 ms


In [29]:
# Tally tickets per day of month.
GRD7 = df2.select(df2['day'])
group8 = GRD7.groupBy('day').count()
group8.show()  # day values with their counts (first 20 rows)
group8.count()  # number of distinct day values (32 in the recorded run, including the invalid day 0)

+---+-------+
|day|  count|
+---+-------+
|  7|3144809|
| 15|3234454|
| 11|3096112|
| 29|3325031|
|  3|4070062|
| 30|3521677|
|  8|3252661|
| 22|3193929|
| 28|2840730|
| 16|2638128|
|  0|      1|
|  5|3989712|
| 31|2231384|
| 18|2896508|
| 27|3219889|
| 17|3209052|
| 26|3112279|
|  6|3484577|
| 19|3182275|
| 23|2660684|
+---+-------+
only showing top 20 rows



32

time: 21min 43s


### 绘图（未完成）

In [15]:
# matplotlib is used below but was never imported anywhere in the notebook.
import matplotlib.pyplot as plt

# The `month` column lives on df2 (the raw `df` has no such column), so the
# monthly aggregation has to start from df2.
month_counts = df2.groupBy('month').count()
month_counts.show(3)

# Collect the small aggregate to pandas for plotting; `month` is produced by a
# string-typed UDF, so cast it to int and sort to get a chronological x axis.
x = month_counts.toPandas()
x['month'] = x['month'].astype(int)
x = x.sort_values('month').reset_index(drop=True)

plt.style.use('fast')

# Enable rendering of Chinese glyphs.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used for Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# Create a 12 x 4 inch figure.
fig = plt.figure(figsize=(12, 4))
# Plot the collected pandas frame (not the Spark DataFrame) as an area chart.
plt.stackplot(x['month'],
              x['count'],
              labels=['ticket records'],  # gives plt.legend() something to show
              alpha=0.8)
plt.xlabel('month')  # x-axis label
plt.ylabel('traffic flowing')  # y-axis label
plt.title('The Traffic Flowing of Records per month')  # figure title
plt.legend()
ax = plt.gca()
# Show one tick per month instead of removing the ticks from the month axis.
ax.set_xticks(x['month'])
plt.show()

In [None]:
# `x` is a pandas DataFrame here (the preceding cell ends with toPandas()), so
# use the pandas API: orderBy()/show() only exist on Spark DataFrames.
x = x.sort_values('count', ascending=False)
x.head(5)

### 日期-热力日历图

In [18]:
pip install pyecharts

In [24]:
import seaborn as sns
#np.random.seed(0)
sns.set()

'''
heatmap = HeatMap("车流量出行日历热力图", "xxxx", width=1100)
heatmap.add(
    "",
    group_month,  # 列表数据包含日期以及对应数据
    is_visualmap=True,  # 是否启用视觉映射组件
    visual_range=[0, 5],     # 组件所允许的最大值与最小值
    visual_text_color="#000",   # 组件两端文本颜色
    visual_range_text=["神隐", "微商"],  # 组件两端文本
    visual_range_color=["#D8BFD8", "#FFB6C1", "#EE82EE"],   # 过度颜色，Thistle，LightPink、Violet
    visual_orient="horizontal",     # 组件条的方向，vertical和horizontal可选
    visual_pos="center",    # 组件条距左侧的位置，left、center、right可选，也可用百分数或整数
    visual_top="80%",   # 组件条距离顶部的位置，有top、center、bottom可选，也可用百分数或整数
    # visual_split_number=4,  # 分段型中分段的个数
    # is_piecewise=True,  # 将组件转换为分段型，默认为连续型
    is_calendar_heatmap=True,   # 使用日历热力图
    calendar_cell_size=["auto", 30],    # 单元格大小，默认["auto", 20]
    calendar_date_range=["2018-3-9", "2019-3-10"],   # 日期跨度，可以为str/list
)
heatmap.render('moments.gif')
'''

# Placeholder heatmap: np.random.rand draws samples uniformly from [0, 1).
# The seed call earlier in this cell is commented out, so values differ per run.
uniform_data = np.random.rand(10,12) # 10 rows x 12 columns of random values
sns.heatmap(uniform_data)

In [20]:
# `df` is a Spark DataFrame with no `month` column, and Spark columns have no
# pandas-style value_counts(); compute the per-month frequencies from df2
# (which carries `month`) and list them in descending order, as
# value_counts() would.
df2.groupBy('month').count().orderBy('count', ascending=False).show()

### 发车日期（按月份）时序图

In [None]:
# TODO: the monthly time-series plot is not implemented yet.  The line below
# is R, not Python, and raises NameError in this Python kernel, so it is kept
# only as a commented-out reminder.
# library(lubridate)  #R