In [1]:
# Evaluate the SparkContext handle so its repr is displayed.
# NOTE(review): nothing in this notebook creates `sc`; presumably the kernel
# is started through the pyspark shell bootstrap -- confirm.
sc

In [2]:
from pyspark.sql.types import Row
from datetime import datetime

In [3]:
# Distribute a flat list of three scalars (int, str, int) as an RDD; each
# scalar becomes one RDD element.
simple_data = sc.parallelize([1, "Alice", 50])
simple_data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [4]:
# Number of elements in the RDD.
simple_data.count()

3

In [5]:
# First element of the RDD.
simple_data.first()

1

In [6]:
# First two elements, returned as a Python list.
simple_data.take(2)

[1, 'Alice']

In [7]:
# Bring every element of the RDD back as a Python list.
simple_data.collect()

[1, 'Alice', 50]

In [8]:
# An RDD of bare scalars has no row structure, so toDF() cannot infer a
# schema and raises TypeError. That failure is the point of this cell, so
# catch it and print it instead of letting it abort a "Run All".
try:
    df = simple_data.toDF()
except TypeError as err:
    print("TypeError:", err)

TypeError: Can not infer schema for type: <class 'int'>

In [10]:
# Each inner list is one record, so every RDD element is row-shaped rather
# than a bare scalar.
records = sc.parallelize(
    [
        [1, "Alice", 50],
        [2, "Bob", 80],
    ]
)
records

ParallelCollectionRDD[9] at parallelize at PythonRDD.scala:195

In [11]:
# Bring every record back as a list of lists.
records.collect()

[[1, 'Alice', 50], [2, 'Bob', 80]]

In [12]:
# Number of records (inner lists), not number of scalar values.
records.count()

2

In [13]:
# First record of the RDD.
records.first()

[1, 'Alice', 50]

In [14]:
# First two records, returned as a Python list.
records.take(2)

[[1, 'Alice', 50], [2, 'Bob', 80]]

In [15]:
# Convert the RDD of list records to a DataFrame; no column names are
# supplied here.
df = records.toDF()

In [16]:
# Display the DataFrame's repr (column names and types).
df

DataFrame[_1: bigint, _2: string, _3: bigint]

In [17]:
# Print the DataFrame contents as a text table.
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



In [20]:
# One Row built with keyword fields: id, name and score.
data = sc.parallelize(
    [Row(id=1, name="Alice", score=50)]
)
data

ParallelCollectionRDD[28] at parallelize at PythonRDD.scala:195

In [21]:
# Number of Row elements in the RDD.
data.count()

1

In [22]:
# Bring the Row elements back as a Python list.
data.collect()

[Row(id=1, name='Alice', score=50)]

In [23]:
# Convert the RDD of named Rows to a DataFrame and print it.
df = data.toDF()
df.show()

+---+-----+-----+
| id| name|score|
+---+-----+-----+
|  1|Alice|   50|
+---+-----+-----+



In [24]:
# Three Rows sharing the same keyword fields (id, name, score).
data = sc.parallelize(
    [
        Row(id=1, name="Alice", score=50),
        Row(id=2, name="Bob", score=80),
        Row(id=3, name="Charlee", score=75),
    ]
)

In [25]:
# Convert the three-Row RDD to a DataFrame and print it.
df = data.toDF()
df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



In [3]:
# One Row mixing a float, an integer and a string field.
complex_data = sc.parallelize(
    [Row(col_float=1.44, col_integer=10, col_string="John")]
)

In [5]:
# Convert the mixed-type Row RDD to a DataFrame and print it.
complex_data_df = complex_data.toDF()
complex_data_df.show()

+---------+-----------+----------+
|col_float|col_integer|col_string|
+---------+-----------+----------+
|     1.44|         10|      John|
+---------+-----------+----------+



In [6]:
# Same Row as before plus a list-valued field.
complex_data = sc.parallelize(
    [
        Row(
            col_float=1.44,
            col_integer=10,
            col_string="John",
            col_list=[1, 2, 3],
        )
    ]
)

In [7]:
# Convert the Row RDD (now including a list field) to a DataFrame and print it.
complex_data_df = complex_data.toDF()
complex_data_df.show()

+---------+-----------+---------+----------+
|col_float|col_integer| col_list|col_string|
+---------+-----------+---------+----------+
|     1.44|         10|[1, 2, 3]|      John|
+---------+-----------+---------+----------+



In [2]:
# Three Rows whose fields are a list, a dict, a nested Row and a datetime.
complex_data = sc.parallelize(
    [
        Row(
            col_list=[1, 2, 3],
            col_dict={"k1": 0},
            col_row=Row(a=10, b=20, c=30),
            col_time=datetime(2020, 1, 22, 14, 1, 5),
        ),
        Row(
            col_list=[4, 5, 6],
            col_dict={"k1": 0, "k2": 1},
            col_row=Row(a=40, b=50, c=60),
            col_time=datetime(2020, 1, 23, 14, 1, 5),
        ),
        Row(
            col_list=[7, 8, 9],
            col_dict={"k1": 0, "k2": 1, "k3": 2},
            col_row=Row(a=70, b=80, c=90),
            col_time=datetime(2020, 1, 24, 14, 1, 5),
        ),
    ]
)

In [3]:
# Convert the nested-type Row RDD to a DataFrame and print it.
complex_data_df = complex_data.toDF()
complex_data_df.show()

+--------------------+---------+------------+-------------------+
|            col_dict| col_list|     col_row|           col_time|
+--------------------+---------+------------+-------------------+
|           [k1 -> 0]|[1, 2, 3]|[10, 20, 30]|2020-01-22 14:01:05|
|  [k1 -> 0, k2 -> 1]|[4, 5, 6]|[40, 50, 60]|2020-01-23 14:01:05|
|[k3 -> 2, k1 -> 0...|[7, 8, 9]|[70, 80, 90]|2020-01-24 14:01:05|
+--------------------+---------+------------+-------------------+



In [3]:
# SQLContext is not imported anywhere else in this notebook, so import it
# here; without this the cell only works when the kernel's startup script
# happens to inject the name.
from pyspark.sql import SQLContext

# Wrap the existing SparkContext in a SQLContext for DataFrame creation.
sqlcontext = SQLContext(sc)

In [4]:
# Display the SQLContext object's repr.
sqlcontext

<pyspark.sql.context.SQLContext at 0x10cd41fd0>

In [6]:
# Build a DataFrame from range(5) and display its repr (column name and type).
df = sqlcontext.range(5)
df

DataFrame[id: bigint]

In [7]:
# Print the rows produced by range(5).
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [5]:
# Rows built positionally: the values carry no field names.
data = sc.parallelize(
    [
        Row("Alice", 50),
        Row("Bob", 80),
        Row("Charlee", 75),
    ]
)
data

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [21]:
# Create a DataFrame from the RDD without supplying column names, then print it.
sqlcontext.createDataFrame(data).show()

+-------+---+
|     _1| _2|
+-------+---+
|  Alice| 50|
|    Bob| 80|
|Charlee| 75|
+-------+---+



In [7]:
# Same RDD, but with explicit column names passed as the schema.
sqlcontext.createDataFrame(data, schema=["name", "score"]).show()

+-------+-----+
|   name|score|
+-------+-----+
|  Alice|   50|
|    Bob|   80|
|Charlee|   75|
+-------+-----+



In [43]:
# Three Rows covering eight field kinds: string, list, dict, nested Row,
# datetime, integer, float and bool.
complex_data = sc.parallelize(
    [
        Row(
            col_string="Alice",
            col_list=[1, 2, 3],
            col_dict={"k1": 0},
            col_row=Row(a=10, b=20, c=30),
            col_time=datetime(2020, 1, 22, 14, 1, 5),
            col_integer=10,
            col_float=1.44,
            col_bool=True,
        ),
        Row(
            col_string="Bob",
            col_list=[4, 5, 6],
            col_dict={"k1": 0, "k2": 1},
            col_row=Row(a=40, b=50, c=60),
            col_time=datetime(2020, 1, 23, 14, 1, 5),
            col_integer=20,
            col_float=2.44,
            col_bool=True,
        ),
        Row(
            col_string="Charlee",
            col_list=[7, 8, 9],
            col_dict={"k1": 0, "k2": 1, "k3": 2},
            col_row=Row(a=70, b=80, c=90),
            col_time=datetime(2020, 1, 24, 14, 1, 5),
            col_integer=30,
            col_float=3.44,
            col_bool=False,
        ),
    ]
)
complex_data

ParallelCollectionRDD[80] at parallelize at PythonRDD.scala:195

In [11]:
# Create a DataFrame from the complex Row RDD and print it.
sqlcontext.createDataFrame(complex_data).show()

+--------------------+---------+------------+-------------------+
|            col_dict| col_list|     col_row|           col_time|
+--------------------+---------+------------+-------------------+
|           [k1 -> 0]|[1, 2, 3]|[10, 20, 30]|2020-01-22 14:01:05|
|  [k1 -> 0, k2 -> 1]|[4, 5, 6]|[40, 50, 60]|2020-01-23 14:01:05|
|[k3 -> 2, k1 -> 0...|[7, 8, 9]|[70, 80, 90]|2020-01-24 14:01:05|
+--------------------+---------+------------+-------------------+



In [10]:
# Positional Rows of (id, name, score) values; field names are attached in a
# later step.
data = sc.parallelize([
    Row(1, "Alice", 50),
    Row(2, "Bob", 80),
    Row(3, "Charlee", 75)
])

In [14]:
# A Row created from field names only is callable: applying it to values
# yields a Row with those names attached.
column_name = Row('id', 'name', 'score')
students = data.map(lambda record: column_name(*record))

In [15]:
# Display the mapped RDD's repr.
students

PythonRDD[4] at RDD at PythonRDD.scala:53

In [16]:
# Bring the named Rows back as a Python list.
students.collect()

[Row(id=1, name='Alice', score=50),
 Row(id=2, name='Bob', score=80),
 Row(id=3, name='Charlee', score=75)]

In [17]:
# Create a DataFrame from the named Rows and display its repr.
students_df = sqlcontext.createDataFrame(students)
students_df

DataFrame[id: bigint, name: string, score: bigint]

In [18]:
# Print the students DataFrame as a text table.
students_df.show()

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  2|    Bob|   80|
|  3|Charlee|   75|
+---+-------+-----+



In [44]:
# Create the DataFrame used by the following cells from the complex Row RDD
# and display its repr (column names and types).
complex_data_df = sqlcontext.createDataFrame(complex_data)
complex_data_df

DataFrame[col_bool: boolean, col_dict: map<string,bigint>, col_float: double, col_integer: bigint, col_list: array<bigint>, col_row: struct<a:bigint,b:bigint,c:bigint>, col_string: string, col_time: timestamp]

In [36]:
# First row of the DataFrame, returned as a Row.
complex_data_df.first()

Row(col_dict={'k1': 0}, col_list=[1, 2, 3], col_row=Row(a=10, b=20, c=30), col_string='Alice', col_time=datetime.datetime(2020, 1, 22, 14, 1, 5))

In [37]:
# First two rows of the DataFrame, returned as a list of Rows.
complex_data_df.take(2)

[Row(col_dict={'k1': 0}, col_list=[1, 2, 3], col_row=Row(a=10, b=20, c=30), col_string='Alice', col_time=datetime.datetime(2020, 1, 22, 14, 1, 5)),
 Row(col_dict={'k1': 0, 'k2': 1}, col_list=[4, 5, 6], col_row=Row(a=40, b=50, c=60), col_string='Bob', col_time=datetime.datetime(2020, 1, 23, 14, 1, 5))]

In [26]:
# Pull the col_list value out of the first collected row. Look it up by
# column name, not position: in the schema displayed for complex_data_df,
# index 1 is col_dict, so a positional [1] would return the map instead of
# the list and the later append() would fail.
cell_list = complex_data_df.collect()[0]["col_list"]
cell_list

[1, 2, 3]

In [27]:
# Append to the Python list taken from the collected row, then display it.
# NOTE(review): presumably meant to show that this changes only the local
# copy; the DataFrame is printed again in the next cell -- confirm intent.
cell_list.append(100)
cell_list

In [30]:
# Print the DataFrame again after the local list was modified.
complex_data_df.show()

+--------------------+---------+------------+-------------------+
|            col_dict| col_list|     col_row|           col_time|
+--------------------+---------+------------+-------------------+
|           [k1 -> 0]|[1, 2, 3]|[10, 20, 30]|2020-01-22 14:01:05|
|  [k1 -> 0, k2 -> 1]|[4, 5, 6]|[40, 50, 60]|2020-01-23 14:01:05|
|[k3 -> 2, k1 -> 0...|[7, 8, 9]|[70, 80, 90]|2020-01-24 14:01:05|
+--------------------+---------+------------+-------------------+



In [31]:
# Drop to the underlying RDD and keep only (col_time, col_dict) per row.
(
    complex_data_df.rdd
    .map(lambda row: (row.col_time, row.col_dict))
    .collect()
)

[(datetime.datetime(2020, 1, 22, 14, 1, 5), {'k1': 0}),
 (datetime.datetime(2020, 1, 23, 14, 1, 5), {'k1': 0, 'k2': 1}),
 (datetime.datetime(2020, 1, 24, 14, 1, 5), {'k3': 2, 'k1': 0, 'k2': 1})]

In [32]:
# Project two columns with the DataFrame API and print them.
complex_data_df.select('col_row', 'col_list').show()

+------------+---------+
|     col_row| col_list|
+------------+---------+
|[10, 20, 30]|[1, 2, 3]|
|[40, 50, 60]|[4, 5, 6]|
|[70, 80, 90]|[7, 8, 9]|
+------------+---------+



In [38]:
# Build a new string from each row's col_string via the underlying RDD.
(
    complex_data_df.rdd
    .map(lambda row: row.col_string + " Boo")
    .collect()
)

['Alice Boo', 'Bob Boo', 'Charlee Boo']

In [41]:
# Keep the two numeric columns and add a derived column holding their sum.
(
    complex_data_df
    .select('col_integer', 'col_float')
    .withColumn(
        "col_sum",
        complex_data_df.col_integer + complex_data_df.col_float,
    )
    .show()
)

+-----------+---------+-------+
|col_integer|col_float|col_sum|
+-----------+---------+-------+
|         10|     1.44|  11.44|
|         20|     2.44|  22.44|
|         30|     3.44|  33.44|
+-----------+---------+-------+



In [46]:
# Add a column holding the logical negation of col_bool. Use the Column `~`
# operator instead of comparing against the literal False.
(
    complex_data_df
    .select('col_bool')
    .withColumn("col_opposite", ~complex_data_df.col_bool)
    .show()
)

+--------+------------+
|col_bool|col_opposite|
+--------+------------+
|    true|       false|
|    true|       false|
|   false|        true|
+--------+------------+



In [47]:
# Print the DataFrame with col_dict renamed to col_map; the result is not
# assigned, so complex_data_df keeps its original column name.
complex_data_df.withColumnRenamed("col_dict", "col_map").show()

+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+
|col_bool|             col_map|col_float|col_integer| col_list|     col_row|col_string|           col_time|
+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+
|    true|           [k1 -> 0]|     1.44|         10|[1, 2, 3]|[10, 20, 30]|     Alice|2020-01-22 14:01:05|
|    true|  [k1 -> 0, k2 -> 1]|     2.44|         20|[4, 5, 6]|[40, 50, 60]|       Bob|2020-01-23 14:01:05|
|   false|[k3 -> 2, k1 -> 0...|     3.44|         30|[7, 8, 9]|[70, 80, 90]|   Charlee|2020-01-24 14:01:05|
+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+



In [48]:
# Select col_string under the display name "Name" and print it.
complex_data_df.select(complex_data_df["col_string"].alias("Name")).show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlee|
+-------+



In [50]:
import pandas

In [51]:
# Convert the Spark DataFrame to a pandas DataFrame and display it.
df_pandas = complex_data_df.toPandas()
df_pandas

Unnamed: 0,col_bool,col_dict,col_float,col_integer,col_list,col_row,col_string,col_time
0,True,{'k1': 0},1.44,10,"[1, 2, 3]","(10, 20, 30)",Alice,2020-01-22 14:01:05
1,True,"{'k1': 0, 'k2': 1}",2.44,20,"[4, 5, 6]","(40, 50, 60)",Bob,2020-01-23 14:01:05
2,False,"{'k3': 2, 'k1': 0, 'k2': 1}",3.44,30,"[7, 8, 9]","(70, 80, 90)",Charlee,2020-01-24 14:01:05


In [52]:
# Convert the pandas frame back into a Spark DataFrame using the SQLContext
# this notebook created (`sqlcontext`). Keep the DataFrame itself in
# df_spark: show() only prints and returns None, so chaining it onto the
# assignment would leave df_spark holding None.
df_spark = sqlcontext.createDataFrame(df_pandas)
df_spark.show()

+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+
|col_bool|            col_dict|col_float|col_integer| col_list|     col_row|col_string|           col_time|
+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+
|    true|           [k1 -> 0]|     1.44|         10|[1, 2, 3]|[10, 20, 30]|     Alice|2020-01-22 14:01:05|
|    true|  [k1 -> 0, k2 -> 1]|     2.44|         20|[4, 5, 6]|[40, 50, 60]|       Bob|2020-01-23 14:01:05|
|   false|[k3 -> 2, k1 -> 0...|     3.44|         30|[7, 8, 9]|[70, 80, 90]|   Charlee|2020-01-24 14:01:05|
+--------+--------------------+---------+-----------+---------+------------+----------+-------------------+

