In [1]:
import pandas as pd

In [2]:
# Small in-memory sample table: one row per person, with columns `name` and `age`.
df = pd.DataFrame(
    {
        "name": ["laozhang", "laoli", "laowang"],
        "age": [60, 57, 66],
    }
)

In [3]:
# Display the pandas DataFrame as the cell's output.
df

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


In [4]:
# Wrap the existing SparkContext `sc` in a SQLContext so DataFrames can be created from it.
# NOTE(review): neither `sc` nor `SQLContext` is defined or imported in the visible cells;
# presumably both are injected by the PySpark kernel start-up. Confirm on a fresh kernel.
sqlcontext = SQLContext(sc)

In [5]:
# Convert the pandas DataFrame `df` into a Spark DataFrame through the SQLContext.
spark_df = sqlcontext.createDataFrame(df)

In [6]:
# Project only the `name` column and print it as a text table.
spark_df.select(spark_df["name"]).show()

+--------+
|    name|
+--------+
|laozhang|
|   laoli|
| laowang|
+--------+



### 以下为类SQL语句的查询

In [7]:
# Register the Spark DataFrame as a temporary view named `people` so it can be queried with SQL.
spark_df.createOrReplaceTempView('people')

In [8]:
# Query every row of the `people` view with SQL and print the result table.
# NOTE(review): `spark` is not created in the visible cells; presumably it is the
# kernel-provided SparkSession. Confirm on a fresh kernel.
spark.sql("select * from people").show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
|   laoli| 57|
| laowang| 66|
+--------+---+



In [23]:
# Same SQL query, restricted to people whose age is at least 60.
spark.sql("select * from people where age >= 60").show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
| laowang| 66|
+--------+---+



### 将 Spark DataFrame 简单地转成 pandas 的 DataFrame，之后就可以进行 pandas 的一些数据处理

In [24]:
# Convert the Spark DataFrame back into a pandas DataFrame; as the last expression it is displayed.
spark_df.toPandas()

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


In [13]:
# Build the Spark DataFrame from the pandas `df` again, this time through `spark`
# instead of the SQLContext used earlier.
spark_df = spark.createDataFrame(df)

In [14]:
# Print just the `name` column of the freshly created Spark DataFrame.
spark_df.select(spark_df["name"]).show()

+--------+
|    name|
+--------+
|laozhang|
|   laoli|
| laowang|
+--------+



In [15]:
# Display `df` (here still the pandas DataFrame) as the cell's output.
df

Unnamed: 0,name,age
0,laozhang,60
1,laoli,57
2,laowang,66


### 读取外部文件：像 JSON 这样的结构化数据很容易处理

In [3]:
# Load the sample people.json file into a Spark DataFrame.
# NOTE(review): this rebinds `df`, which earlier cells use for the pandas DataFrame, and the
# path is an absolute path into a local Spark installation. Consider a distinct name and a
# configurable base directory.
df = spark.read.json("/usr/local/spark/examples/src/main/resources/people.json")

In [4]:
# Print the rows loaded from the JSON file as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [21]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### 可以进行一些类数据库的操作

In [22]:
# Project `name` together with a derived column holding age + 1 (a null age stays null).
df.select(df["name"], df["age"] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [23]:
# Keep only the rows whose age is greater than 20 and print them.
df.where(df["age"] > 20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [24]:
# Count the rows for each distinct age value; the null age forms its own group.
df.groupBy(df["age"]).count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [25]:
# Order the rows by age, largest first, and print them.
df.orderBy(df["age"].desc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [26]:
# Order by age descending, using name ascending to break ties between equal ages.
df.orderBy(df["age"].desc(), df["name"].asc()).show()

+----+-------+
| age|   name|
+----+-------+
|  30|   Andy|
|  19| Justin|
|null|Michael|
+----+-------+



In [27]:
# Show `name` under the header `username`, next to the unchanged `age` column.
df.select(df["name"].alias("username"), df["age"]).show()

+--------+----+
|username| age|
+--------+----+
| Michael|null|
|    Andy|  30|
|  Justin|  19|
+--------+----+



In [28]:
# Print the Spark DataFrame that was created from the pandas sample table.
spark_df.show()

+--------+---+
|    name|age|
+--------+---+
|laozhang| 60|
|   laoli| 57|
| laowang| 66|
+--------+---+



### 还可以使用并行化方法（parallelize）简单地生成 RDD

In [32]:
# Turn a local Python list into an RDD through the SparkContext.
rdd = sc.parallelize([2,3,4])

### flatMap函数与map函数的区别在于flatMap可能会生成元素个数与原数据集不一样的RDD

In [34]:
# Each element `upper` expands to the numbers 1 .. upper-1; the flattened values are
# gathered on the driver and sorted there so the displayed list is ascending.
sorted(rdd.flatMap(lambda upper: range(1, upper)).collect())

[1, 1, 1, 2, 2, 3]

### take函数与takeOrdered函数的区别在于后者会先排序之后再取前面的n个数据

In [35]:
# Return the first two elements of the RDD without sorting it.
rdd.take(2)

[2, 3]

In [36]:
# Return the two smallest elements in ascending order (same as take(2) here, since the data is already sorted).
rdd.takeOrdered(2)

[2, 3]

In [10]:
# On unsorted input, takeOrdered(3) returns the three smallest values in ascending order.
sc.parallelize([9,7,3,2,6,4]).takeOrdered(3)

[2, 3, 4]

In [23]:
# RDD.foreach runs its function on the executors, so print() there writes to the executor's
# stdout and nothing appears under the notebook cell. Square the values in Spark, bring the
# small result back to the driver with collect(), and print it here instead.
for squared in sc.parallelize([1, 2, 3, 4, 5]).map(lambda x: x ** 2).collect():
    print(squared)

In [24]:
# pairRDD: map every name to a (key, value) tuple.
# collect() returns the pairs to the driver so the print() output shows up in the notebook;
# foreach(print) would print inside the executor processes instead.
for pair in sc.parallelize(["laozhang", "laoli", "laowang"]).map(lambda x: (x, 1)).collect():
    print(pair)

In [12]:
# Sort the (name, 1) pairs by key, then collect them to the driver before printing.
# foreach(print) would run on the executors, so its output would not appear in the notebook
# and would not be guaranteed to follow the sorted order; collect() keeps the sortByKey order.
for pair in sc.parallelize(["laozhang", "laoli", "laowang"]).map(lambda x: (x, 1)).sortByKey().collect():
    print(pair)

In [13]:
 # Key/value RDD of (framework name, number) tuples; each key occurs twice.
 # NOTE(review): this cell is indented by one space. IPython accepts that, but it would be an
 # IndentationError in a plain Python script, so consider removing the stray space.
 pairRDD1 = sc.parallelize([('spark',1),('spark',2),('hadoop',3),('hadoop',5)])


In [14]:
# Second key/value RDD holding a single ('spark', 'fast') pair, used as the other side of a join.
pairRDD2 = sc.parallelize([('spark','fast')])


In [15]:
# Join the two pair RDDs on their keys; join() yields (key, (left_value, right_value)) for keys
# present on both sides. Collect the result to the driver before printing, because
# foreach(print) runs on the executors and its output never reaches the notebook.
for joined in pairRDD1.join(pairRDD2).collect():
    print(joined)