# Create a DataFrame from a SparkSession

In [5]:
# Import explicitly so the cell also runs on a fresh kernel that was not
# started through the pyspark shell.
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load the bundled people.json sample into a DataFrame.
df = spark.read.json("file:///home/lygbug666/software/spark/examples/src/main/resources/people.json")

df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# Dataframe operations

In [36]:
# Print each column of df with its type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [7]:
# Project the name column next to age incremented by one (a null age stays null).
df.select(df["name"], df["age"] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [9]:
# Keep only the rows whose age is greater than 20; rows with a null age drop out.
df.filter(df["age"] > 20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
# Count rows per distinct age; null forms its own group.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [15]:
# Count rows per distinct name.
df.groupBy(df["name"]).count().show()

+-------+-----+
|   name|count|
+-------+-----+
|Michael|    1|
|   Andy|    1|
| Justin|    1|
+-------+-----+



In [22]:
# Descending by age: the null age is listed last.
df.sort(df["age"].desc()).show()
# Default (ascending) order by age: the null age is listed first.
df.sort(df["age"]).show()


+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  19| Justin|
|  30|   Andy|
+----+-------+



In [23]:
# Sort by age descending, breaking ties by name ascending.
df.sort(df["age"].desc(), df["name"].asc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [24]:
# Expose the name column under the header "username", alongside age.
df.select(df["name"].alias("username"), df["age"]).show()

+--------+----+
|username| age|
+--------+----+
| Michael|null|
|    Andy|  30|
|  Justin|  19|
+--------+----+



# Running SQL Queries Programmatically

In [28]:
# Register df under the name "people" so SQL text can refer to it;
# the "OrReplace" variant makes this cell safe to run more than once.
df.createOrReplaceTempView("people")

# spark.sql returns the query result as another DataFrame.
sqlDF = spark.sql("SELECT * FROM people")

sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# Global temporary view

In [35]:
# createOrReplaceGlobalTempView keeps this cell re-runnable: the plain
# createGlobalTempView call raises once a "people" global view already exists.
df.createOrReplaceGlobalTempView("people")
# Global temp views are addressed through the global_temp qualifier ...
spark.sql("SELECT * FROM global_temp.people").show()
# ... and can also be queried from a separate session.
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# RDD to DataFrames

## Reflection

In [61]:
# Read people.txt as an RDD of text lines and pull them back as a Python list.
sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt").collect()

['Michael, 29', 'Andy, 30', 'Justin, 19']

In [62]:
from pyspark.sql.types import Row

In [63]:
def f(x):
    """Map one split line of people.txt to keyword arguments for Row.

    Args:
        x: Indexable record whose first element is the name and whose
            second element is the age.

    Returns:
        dict with the keys 'name' and 'age' holding x[0] and x[1] unchanged
        (no stripping or type conversion is applied).
    """
    return {'name': x[0], 'age': x[1]}

In [67]:
# Split every line on commas, wrap the fields in a Row through f, and let
# toDF() derive the DataFrame schema from those Rows.
peopleDF = (
    sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")
    .map(lambda text: text.split(','))
    .map(lambda parts_of_line: Row(**f(parts_of_line)))
    .toDF()
)

In [68]:
# Use the replacing variant so re-running the cell does not fail because
# the "people2" global view already exists.
peopleDF.createOrReplaceGlobalTempView("people2")

In [66]:
# The DataFrame must be registered as a temporary view before the queries below can use it.

In [69]:
# Query the global view registered as "people2", addressed via the global_temp qualifier.
personsDF = spark.sql("select * from global_temp.people2")

In [70]:
# Display the query result; note the column order is (age, name).
personsDF.show()

+---+-------+
|age|   name|
+---+-------+
| 29|Michael|
| 30|   Andy|
| 19| Justin|
+---+-------+



In [71]:
# Read the fields by name rather than by position: the DataFrame's columns
# are ordered (age, name), so t[0]/t[1] attached each label to the wrong value.
# Both fields are strings here, so "+" concatenation is valid.
personsDF.rdd.map(lambda t : "Name:"+t.name+","+"Age:"+t.age).collect()

['Name:Michael,Age: 29', 'Name:Andy,Age: 30', 'Name:Justin,Age: 19']

In [72]:
# Note: "+" concatenation needs str operands; a null (None) field raises "TypeError: must be str, not NoneType".

In [74]:
# Load people.txt as an RDD of text lines.
lines = sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")

In [75]:
# Break each "name, age" line into its comma-separated fields.
parts = lines.map(lambda text: text.split(','))

In [76]:
# Build a named Row per record; int() converts the age text to a number.
peopleRow = parts.map(lambda cols: Row(name=cols[0], age=int(cols[1])))

In [77]:
# Build a DataFrame from the RDD of Rows; no schema argument is passed here.
peopleschema = spark.createDataFrame(peopleRow)

In [79]:
# Register the DataFrame as the temp view "peopletable" for the SQL queries below.
peopleschema.createOrReplaceTempView("peopletable")

In [80]:
# Select every row and column of the peopletable view.
personDF = spark.sql("select * from peopletable")

In [81]:
# Display the rows returned from peopletable.
personDF.show()

+---+-------+
|age|   name|
+---+-------+
| 29|Michael|
| 30|   Andy|
| 19| Justin|
+---+-------+



In [85]:
# Select the names of people aged 13 or older.
# NOTE(review): despite the variable name there is no upper age bound, so
# every row of the sample data matches — confirm whether
# "AND age <= 19" was intended.
teenagers = spark.sql("SELECT name FROM peopletable WHERE age >= 13")

In [86]:
# Turn each result Row into a "Name: <name>" string and collect them as a list.
teenNames = teenagers.rdd.map(lambda row: "Name: " + row.name).collect()

In [87]:
# Print the formatted names one per line.
for name in teenNames:
    print(name)

Name: Michael
Name: Andy
Name: Justin


## Creating the schema programmatically

In [101]:
from pyspark.sql.types import *

In [102]:
# Load people.txt again as an RDD of text lines.
lines = sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")

In [103]:
# Split every line into its comma-separated fields.
parts = lines.map(lambda text: text.split(","))

In [104]:
# Pair the name with the age, stripping surrounding whitespace from the age.
# NOTE(review): `people` is not referenced by any later cell visible in this notebook.
people = parts.map(lambda p: (p[0], p[1].strip()))

In [105]:
# Space-separated column names from which the schema fields are built.
schemaString = "name age"

In [106]:
# One nullable string-typed field per column name in schemaString.
# NOTE(review): `fields` is rebuilt by a later cell before it is used.
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

In [107]:
from pyspark.sql.types import *

In [108]:
# Load people.txt as an RDD of text lines.
peopleRDD = sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")

In [109]:
# Column names, separated by a single space, for the explicit schema.
schemaString = "name age"

In [110]:
# Describe every column named in schemaString as a nullable string field.
fields = [
    StructField(fieldName, StringType(), nullable=True)
    for fieldName in schemaString.split(" ")
]

In [111]:
# Combine the field descriptions into the schema object.
schema = StructType(fields)

In [113]:
# Split each line on commas and wrap the first two fields in a positional Row.
# The values are kept as-is (no whitespace stripping).
rowRDD = (
    peopleRDD
    .map(lambda record: record.split(','))
    .map(lambda cols: Row(cols[0], cols[1]))
)

In [114]:
# Apply the explicit all-string schema to the RDD of rows.
# Note: this rebinds peopleDF, which an earlier cell built via toDF().
peopleDF = spark.createDataFrame(rowRDD, schema)

In [115]:
# Register the DataFrame as the temp view "people3" for SQL access.
peopleDF.createOrReplaceTempView("people3")

In [116]:
# Select every row and column of the people3 view.
results = spark.sql("SELECT * FROM people3")

In [118]:
# Format each row as "name: <name>,age:<age>". Both columns are string-typed
# under the schema, so "+" concatenation works without conversion.
result = results.rdd.map( lambda attributes : "name: " + attributes[0]+","+"age:"+attributes[1]).collect()

In [119]:
# Print the formatted rows one per line.
for x in result:
    print(x)

name: Michael,age: 29
name: Andy,age: 30
name: Justin,age: 19


In [125]:
# Same formatting as the `result` cell above, computed inline and printed directly.
for x in results.rdd.map( lambda attributes : "name: " + attributes[0]+","+"age:"+attributes[1]).collect():
    print(x)

name: Michael,age: 29
name: Andy,age: 30
name: Justin,age: 19


# Data sources

In [127]:
# Load people.txt as an RDD of text lines.
# NOTE(review): peopleRDD is not referenced by any later cell visible in this notebook.
peopleRDD = sc.textFile("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")

In [152]:
# Name the format explicitly: load() without a format falls back to the
# spark.sql.sources.default setting, which is only parquet by default.
# Note: this rebinds df, replacing the people.json DataFrame.
df = spark.read.parquet("file:///home/lygbug666/software/spark/examples/src/main/resources/users.parquet")

In [153]:
# Register the Parquet-backed DataFrame as the temp view "parquetFile".
df.createOrReplaceTempView("parquetFile")

In [154]:
# Select every row and column of the parquetFile view.
namesdf = spark.sql("select * from parquetFile")

In [155]:
# Display the rows read from users.parquet.
namesdf.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [146]:
    # Extract the name field of every row and collect the values as a list.
    # NOTE(review): this cell's leading indentation is not required by any enclosing block.
    namesdf.rdd.map(lambda person : person.name).collect()

['Alyssa', 'Ben']

In [150]:
# Query the Parquet file directly by path using the parquet.`<path>` syntax,
# without registering a view first. This rebinds df.
df = spark.sql("SELECT * FROM parquet.`file:///home/lygbug666/software/spark/examples/src/main/resources/users.parquet`")

In [151]:
# Display the rows returned by the direct file query.
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [149]:
# Read people.txt through the text reader; this rebinds df once more,
# replacing the Parquet-based DataFrame.
df = spark.read.text("file:///home/lygbug666/software/spark/examples/src/main/resources/people.txt")