In [1]:
import pandas as pd
from pyspark import SparkContext,SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
# Reuse the active SparkContext when this cell is executed again; a bare
# SparkContext() raises ValueError if a context is already running in the kernel.
sc = SparkContext.getOrCreate()

### 从 Spark 2.0 开始，SparkSession 已经实现了 SQLContext 的所有方法，但 Spark 仍向后兼容 SQLContext

In [2]:
# Build (or fetch) the SparkSession used by the DataFrame / SQL cells below.
# NOTE(review): a SparkContext was already created in the first cell, so the
# master and spark.driver.memory settings here may not affect the running
# context — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "6G")
    .getOrCreate()
)

In [3]:
# Small in-memory sample: three people, one column for names and one for ages.
df = pd.DataFrame(
    {
        "name": ["laozhang", "laoli", "laowang"],
        "age": [60, 57, 66],
    }
)

In [4]:
# Display the pandas DataFrame built in the previous cell.
df

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


In [5]:
# Legacy entry point wrapping the existing SparkContext. It is used here only to
# show backward compatibility; the SparkSession created above offers the same methods.
sqlcontext = SQLContext(sc)

In [6]:
# Convert the pandas DataFrame into a Spark DataFrame through the legacy SQLContext.
spark_df = sqlcontext.createDataFrame(df)

In [7]:
# Project only the "name" column and print it as a text table.
spark_df.select("name").show()

+--------+
|    name|
+--------+
|laozhang|
|   laoli|
| laowang|
+--------+



In [8]:
# Display the pandas DataFrame again; the conversion above left it unchanged.
df

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


In [9]:
# Same conversion, this time through the SparkSession; rebinds spark_df.
spark_df = spark.createDataFrame(df)

In [10]:
# Print the "name" column of the SparkSession-built DataFrame.
spark_df.select("name").show()

+--------+
|    name|
+--------+
|laozhang|
|   laoli|
| laowang|
+--------+



### 以下为类SQL语句的查询

In [11]:
# Register spark_df under the view name 'people' so the SQL queries below can refer to it.
spark_df.createOrReplaceTempView('people')

In [12]:
# Run a SQL query returning every row and column of the 'people' view.
spark.sql("select * from people").show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
|   laoli| 57|
| laowang| 66|
+--------+---+



In [13]:
# SQL filter: keep only the rows whose age is at least 60.
spark.sql("select * from people where age >= 60").show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
| laowang| 66|
+--------+---+



### 将 Spark DataFrame 简单地转成 pandas 的 DataFrame，之后可以进行 pandas 的一些数据处理

In [14]:
# Convert the Spark DataFrame back to pandas; the result is displayed, not stored.
spark_df.toPandas()

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


### 也可以将pandas的DataFrame转化成Spark DataFrame

In [15]:
# pandas -> Spark conversion via the SparkSession (repeats the earlier conversion cell;
# spark_df is rebound to a DataFrame built from the same pandas data).
spark_df = spark.createDataFrame(df)

In [16]:
# Print the "name" column of the freshly converted Spark DataFrame.
spark_df.select("name").show()

+--------+
|    name|
+--------+
|laozhang|
|   laoli|
| laowang|
+--------+



In [17]:
# Display the pandas DataFrame once more (df is still the pandas object at this point).
df

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


### 读取外部文件　像json等结构化的数据　很容易处理

In [18]:
# Load a JSON file into a Spark DataFrame.
# NOTE(review): this rebinds `df`, which until now held the pandas DataFrame; every
# later cell that uses `df` operates on this Spark DataFrame instead.
df = spark.read.json("data/people.json")

### 可将DataFrame再转回json文件

In [19]:
# Disabled (commented out): nothing is written when the notebook runs.
# Uncomment to export df in JSON format under the "people" path:
# df.write.save("people", format="json")

### 也可将DataFrame转化成csv文件

In [20]:
# Disabled (commented out): nothing is written when the notebook runs.
# NOTE(review): this targets the same 'people' path as the JSON export above, so
# enabling both would likely need distinct paths or mode="overwrite" — confirm.
# df.write.save('people', format='csv', sep=';', header="true")

In [21]:
# Print the rows loaded from the JSON file.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [22]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### 可以进行一些类数据库的操作

In [23]:
# Select the name plus a computed column age + 1; a null age stays null in the result.
df.select(df.name, df.age+1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [24]:
# Keep rows with age > 20; the row whose age is null does not pass the predicate.
df.filter(df.age > 20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [25]:
# Count rows per distinct age; the null age forms its own group.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [26]:
# Sort by age in descending order; the null age is listed last in the output.
df.sort(df.age.desc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [27]:
# Sort by age descending, breaking ties by name ascending.
df.sort(df.age.desc(), df.name.asc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [28]:
# Select name under the output column name "username", alongside age.
df.select(df.name.alias("username"),df.age).show()

+--------+----+
|username| age|
+--------+----+
| Michael|null|
|    Andy|  30|
|  Justin|  19|
+--------+----+



In [29]:
# spark_df is still the three-row frame converted from pandas; rebinding df did not affect it.
spark_df.show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
|   laoli| 57|
| laowang| 66|
+--------+---+



### when 方法类似于if else 语句

In [30]:
# when/otherwise acts like if/else per row: keep the age when it equals 30, otherwise
# substitute a placeholder text. The age is cast to string explicitly so both branches
# share one type instead of relying on implicit long/string coercion.
# Rows with a null age fall through to the otherwise() branch.
df.select(
    when(df.age == 30, df.age.cast("string")).otherwise("pia pia pia").alias("age"),
    df.name.alias("username"),
).show()

+-----------+--------+
|        age|username|
+-----------+--------+
|pia pia pia| Michael|
|         30|    Andy|
|pia pia pia|  Justin|
+-----------+--------+



### 还可以使用并行方法　简单地生成RDD

In [31]:
# Distribute a local Python list as an RDD.
rdd = sc.parallelize([2,3,4])

### flatMap函数与map函数的区别在于flatMap可能会生成元素个数与原数据集不一样的RDD

In [32]:
# Expand each element x into range(1, x) and flatten the pieces into one RDD, so the
# result has a different number of elements than the input; sorted() is applied to the
# collected list on the driver.
sorted(rdd.flatMap(lambda x: range(1,x)).collect())

[1, 1, 1, 2, 2, 3]

### take函数与takeOrdered函数的区别在于后者会先排序之后再取前面的n个数据

In [33]:
# Return the first two elements of the RDD as a list.
rdd.take(2)

[2, 3]

In [34]:
# Return the two smallest elements in ascending order; this matches take(2) here
# only because the RDD was created from an already sorted list.
rdd.takeOrdered(2)

[2, 3]

In [35]:
# With unsorted input, takeOrdered(3) returns the three smallest values in ascending order.
sc.parallelize([9,7,3,2,6,4]).takeOrdered(3)

[2, 3, 4]

In [36]:
# Square each element on the executors, then bring the results back and print them on
# the driver. Printing inside foreach() happens in the worker processes, so nothing
# would be shown in the notebook cell.
for squared in sc.parallelize([1, 2, 3, 4, 5]).map(lambda x: x ** 2).collect():
    print(squared)

### pairRDD 可以帮助我们处理很多类似于字典的数据

In [37]:
# Turn each name into a (name, 1) tuple, producing a key/value pair RDD.
sc.parallelize(["laozhang", "laoli", "laowang"]).map(lambda x:(x, 1)).collect()

[('laozhang', 1), ('laoli', 1), ('laowang', 1)]

In [38]:
# Sort the (name, 1) pairs by key, collect them in that order and print on the driver.
# foreach(print) would print inside the worker processes: nothing appears in the
# notebook and no cross-partition ordering is visible.
for pair in (
    sc.parallelize(["laozhang", "laoli", "laowang"])
    .map(lambda x: (x, 1))
    .sortByKey()
    .collect()
):
    print(pair)

In [39]:
 # Pair RDD of (key, value) tuples; the keys 'spark' and 'hadoop' each appear twice.
 # NOTE(review): this cell's line carries a stray leading space — consider removing it.
 pairRDD1 = sc.parallelize([('spark',1),('spark',2),('hadoop',3),('hadoop',5)])


In [40]:
# Second pair RDD with a single key, 'spark', used as the other side of the join below.
pairRDD2 = sc.parallelize([('spark','fast')])


In [41]:
# Join the two pair RDDs on their keys; 'hadoop' has no match in pairRDD2, so only
# the 'spark' entries appear, each as (key, (left_value, right_value)).
pairRDD1.join(pairRDD2).collect()

[('spark', (1, 'fast')), ('spark', (2, 'fast'))]

### 广播变量 在每个机器上缓存一个只读的变量 可以非常高效地给每个节点（机器）提供一个大的输入数据集的副本 

In [42]:
# Wrap a local list in a broadcast variable (a read-only value shared with the nodes).
broadcastVar = sc.broadcast(["kelly", "Leno", "Justin"])

In [43]:
# Read back the broadcast payload.
broadcastVar.value

['kelly', 'Leno', 'Justin']

### 累加器 通常可以被用来实现计数器（counter）和求和（sum)

In [44]:
# Create an accumulator starting at 0.
accum = sc.accumulator(0)

In [45]:
# Add every integer from 1 to 100 to the accumulator.
# NOTE(review): re-running this cell without re-creating `accum` adds the values
# again, so the total shown below would grow — confirm if idempotence matters.
sc.parallelize(range(1, 101)).foreach(lambda x : accum.add(x))

In [46]:
# Read the accumulated total (1 + 2 + ... + 100 after a single run of the cell above).
accum.value

5050

### spark读取csv文件 并按照分割符切割columns

In [47]:
# Read a ';'-delimited CSV file, taking the column names from its header row.
# This rebinds spark_df, replacing the pandas-derived DataFrame used earlier.
# NOTE(review): no schema or inferSchema is given, so the column types are whatever
# the CSV reader defaults to — confirm if numeric ages are needed.
spark_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("data/people.csv", sep=";")
)


In [48]:
# Print the rows parsed from the CSV file.
spark_df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+

