# DataFrames

## Acquiring a DataFrame

### Convert Pandas DataFrame to Spark DataFrame

In [1]:
import pandas as pd
from random import randint

# Shape of the toy dataset: n_rows identical rows, each holding 0..n_cols-1.
n_cols = 10
n_rows = 10

pdf = pd.DataFrame(
    [tuple(range(n_cols)) for _ in range(n_rows)],
    columns=[f'x{i}' for i in range(n_cols)],
)
# Hand the pandas frame to Spark (sqlContext is presumably provided by the
# notebook kernel -- it is not created anywhere in the visible cells).
sdf = sqlContext.createDataFrame(pdf)

In [2]:
# Bare last expression: the notebook renders the pandas frame as a table (10 rows, columns x0..x9).
pdf

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,0,1,2,3,4,5,6,7,8,9
1,0,1,2,3,4,5,6,7,8,9
2,0,1,2,3,4,5,6,7,8,9
3,0,1,2,3,4,5,6,7,8,9
4,0,1,2,3,4,5,6,7,8,9
5,0,1,2,3,4,5,6,7,8,9
6,0,1,2,3,4,5,6,7,8,9
7,0,1,2,3,4,5,6,7,8,9
8,0,1,2,3,4,5,6,7,8,9
9,0,1,2,3,4,5,6,7,8,9


In [3]:
# collect() materializes every row as a Python list of Row objects; fine here because the frame has only 10 rows.
sdf.collect()

[Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9)]

In [4]:
# show() prints the frame as a plain-text table instead of returning the rows.
sdf.show()

+---+---+---+---+---+---+---+---+---+---+
| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+---+---+---+---+---+---+---+---+---+---+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
+---+---+---+---+---+---+---+---+---+---+



In [5]:
# The integer columns that came from pandas show up as nullable Spark `long` columns.
sdf.printSchema()

root
 |-- x0: long (nullable = true)
 |-- x1: long (nullable = true)
 |-- x2: long (nullable = true)
 |-- x3: long (nullable = true)
 |-- x4: long (nullable = true)
 |-- x5: long (nullable = true)
 |-- x6: long (nullable = true)
 |-- x7: long (nullable = true)
 |-- x8: long (nullable = true)
 |-- x9: long (nullable = true)



### Convert an RDD to a DataFrame

In [6]:
from random import randint
from pyspark.sql.types import *

# Same toy dimensions as the pandas example.
n_cols = 10
n_rows = 10

# Each RDD element is a plain Python list 0..n_cols-1; there are n_rows of them.
rdd = sc.parallelize([list(range(n_cols)) for _ in range(n_rows)])

# Explicit schema: nullable integer columns named x0..x{n_cols-1}.
schema = StructType([
    StructField(f'x{i}', IntegerType(), True)
    for i in range(n_cols)
])
df = sqlContext.createDataFrame(rdd, schema)

In [7]:
# The RDD elements come back as plain Python lists; they carry no column names.
rdd.collect()

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]

In [8]:
# With the schema applied, the same data comes back as Row objects with named fields x0..x9.
df.collect()

[Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9),
 Row(x0=0, x1=1, x2=2, x3=3, x4=4, x5=5, x6=6, x7=7, x8=8, x9=9)]

In [9]:
# Tabular view of the RDD-based DataFrame; column headers come from the schema's field names.
df.show()

+---+---+---+---+---+---+---+---+---+---+
| x0| x1| x2| x3| x4| x5| x6| x7| x8| x9|
+---+---+---+---+---+---+---+---+---+---+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9|
+---+---+---+---+---+---+---+---+---+---+



In [10]:
# Columns are `integer` here because the schema was declared explicitly with IntegerType.
df.printSchema()

root
 |-- x0: integer (nullable = true)
 |-- x1: integer (nullable = true)
 |-- x2: integer (nullable = true)
 |-- x3: integer (nullable = true)
 |-- x4: integer (nullable = true)
 |-- x5: integer (nullable = true)
 |-- x6: integer (nullable = true)
 |-- x7: integer (nullable = true)
 |-- x8: integer (nullable = true)
 |-- x9: integer (nullable = true)



### Convert JSON data to Spark DataFrame

In [11]:
%%sh
# Upload the local sample file to the HDFS root; -f overwrites /people.json if it already exists.
hdfs dfs -copyFromLocal -f /root/ipynb/people.json /people.json

2019-10-31 02:53:11,545 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false


In [12]:
# Read the JSON file that was uploaded to HDFS; no schema is passed, so Spark infers one from the data.
# NOTE: this rebinds `df`, which previously held the RDD-based DataFrame.
df = sqlContext.read.json('hdfs://localhost/people.json')

In [13]:
# Long cell values are cut off with '...' in the printed table (see the address column).
df.show()

+--------------------+---+----------+------+---+---------+-----+--------------------+------+
|             address|age|first_name|height| id|last_name| male|              sports|weight|
+--------------------+---+----------+------+---+---------+-----+--------------------+------+
|[Washington, DC, ...| 27|      John|   6.5|  1|      Doe| true|    [hockey, tennis]| 155.5|
|[Washington, DC, ...| 22|      Jane|   5.7|  2|    Smith|false|[basketball, tennis]| 135.5|
|[Los Angeles, CA,...| 25|      Jack|   6.6|  3|    Smith| true|  [baseball, soccer]| 175.5|
|[Los Angeles, CA,...| 18|     Janet|   5.5|  4|      Doe|false|    [judo, baseball]| 125.5|
+--------------------+---+----------+------+---+---------+-----+--------------------+------+



In [14]:
# The inferred schema is nested: `address` is a struct and `sports` is an array of strings.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: long (nullable = true)
 |-- age: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- height: double (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- male: boolean (nullable = true)
 |-- sports: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- weight: double (nullable = true)



## DataFrame operations

### Create data

In [15]:
import pandas as pd
from random import randint

# Shape of the random sample frame.
n_cols = 10
n_rows = 10

# Every cell is an independent randint(1, 10) draw (both ends inclusive).
# NOTE(review): no seed is set, so re-running this cell produces values that
# differ from the saved outputs shown below.
pdf = pd.DataFrame(
    [tuple(randint(1, 10) for _ in range(n_cols)) for _ in range(n_rows)],
    columns=[f'x{i}' for i in range(n_cols)],
)
sdf = sqlContext.createDataFrame(pdf)

In [16]:
# Inspect the random values; sdf was built from this same frame.
pdf

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,9,1,4,6,3,4,3,9,4,4
1,5,7,5,7,4,4,10,9,6,1
2,10,10,9,2,6,2,1,8,6,8
3,1,5,7,3,8,10,3,7,9,3
4,9,1,8,1,8,2,9,8,4,7
5,8,2,7,9,3,8,7,1,5,1
6,9,7,10,3,6,2,10,3,5,2
7,7,8,4,9,7,1,7,2,3,10
8,7,3,10,6,6,4,4,2,5,1
9,9,6,1,1,2,8,7,2,1,3


### Select

In [17]:
# select() with a column name yields a DataFrame containing only that column.
sdf.select('x0').show()

+---+
| x0|
+---+
|  9|
|  5|
| 10|
|  1|
|  9|
|  8|
|  9|
|  7|
|  7|
|  9|
+---+



### Select multiple columns

In [18]:
# Columns can also be passed as Column objects obtained by indexing the DataFrame.
sdf.select(sdf['x0'], sdf['x1']).show()

+---+---+
| x0| x1|
+---+---+
|  9|  1|
|  5|  7|
| 10| 10|
|  1|  5|
|  9|  1|
|  8|  2|
|  9|  7|
|  7|  8|
|  7|  3|
|  9|  6|
+---+---+



### Select multiple columns and modify values

In [19]:
# Arithmetic on Column objects builds new columns; without an alias they are named after the expression, e.g. (x0 * 2).
sdf.select(sdf['x0'] * 2, sdf['x1'] * 3).show()

+--------+--------+
|(x0 * 2)|(x1 * 3)|
+--------+--------+
|      18|       3|
|      10|      21|
|      20|      30|
|       2|      15|
|      18|       3|
|      16|       6|
|      18|      21|
|      14|      24|
|      14|       9|
|      18|      18|
+--------+--------+



### Filtering

In [20]:
# NOTE: select() with a comparison yields one boolean per row (see output); it does not drop any rows.
# To keep only the rows where x0 > 5, use sdf.filter(sdf['x0'] > 5) instead.
sdf.select(sdf['x0'] > 5).show()

+--------+
|(x0 > 5)|
+--------+
|    true|
|   false|
|    true|
|   false|
|    true|
|    true|
|    true|
|    true|
|    true|
|    true|
+--------+



### Group by

In [21]:
# Count rows per distinct x0 value; the groups are not returned in sorted order (see output).
sdf.groupBy('x0').count().show()

+---+-----+
| x0|count|
+---+-----+
|  7|    2|
|  9|    4|
|  5|    1|
|  1|    1|
| 10|    1|
|  8|    1|
+---+-----+



### User-defined function (UDF)

In [22]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

def times_two(num):
    """Return ``num`` doubled, or ``None`` when ``num`` is ``None``.

    Spark hands SQL NULL to a Python UDF as ``None``, and the columns this
    function is applied to are nullable, so the guard keeps a null cell from
    raising ``TypeError`` instead of simply producing a null result.

    Args:
        num: The value to double, or ``None``.

    Returns:
        ``num * 2``, or ``None`` if ``num`` is ``None``.
    """
    if num is None:
        return None
    return num * 2

# Wrap the plain Python function as a Spark UDF whose result column is an integer.
times_two_udf = udf(f=times_two, returnType=IntegerType())

# Show x0 next to its doubled value.
(
    sdf
    .select('x0', times_two_udf('x0').alias('times_two'))
    .show()
)

+---+---------+
| x0|times_two|
+---+---------+
|  9|       18|
|  5|       10|
| 10|       20|
|  1|        2|
|  9|       18|
|  8|       16|
|  9|       18|
|  7|       14|
|  7|       14|
|  9|       18|
+---+---------+



### User-defined function (UDF) with decorator

In [23]:
from pyspark.sql.functions import udf

def times_two(num):
    """Return ``num`` doubled, or ``None`` when ``num`` is ``None``.

    Spark hands SQL NULL to a Python UDF as ``None``, and the columns this
    function is applied to are nullable, so the guard keeps a null cell from
    raising ``TypeError`` instead of simply producing a null result.

    Args:
        num: The value to double, or ``None``.

    Returns:
        ``num * 2``, or ``None`` if ``num`` is ``None``.
    """
    if num is None:
        return None
    return num * 2

@udf(returnType='int')
def times_two_udf(num):
    """Spark UDF with an integer result that delegates to ``times_two``."""
    doubled = times_two(num)
    return doubled

# Same query as in the previous section; times_two_udf is now the decorator-built UDF, which rebinds that name.
sdf.select('x0', times_two_udf('x0').alias('times_two')).show()

+---+---------+
| x0|times_two|
+---+---------+
|  9|       18|
|  5|       10|
| 10|       20|
|  1|        2|
|  9|       18|
|  8|       16|
|  9|       18|
|  7|       14|
|  7|       14|
|  9|       18|
+---+---------+

