In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[1]').appName('pyspark_fresh').getOrCreate()

In [2]:
from pyspark.sql import Row

In [3]:
import datetime
data= [{
    "id":1,
    "firstName":"Terry",
    "lastName":"Medhurst",
    "email":"atuny0@sohu.com",
    "phone":Row(phone_number = "+63 791 675 8914",mobile_number = "+63 791 675 8945"),
    "courses" : [1,2],
    "is_customer" : True,
    "amount_paid" : 10200,
    "customer_from" : datetime.date(2022,3,10),
    "last_update_ts" : datetime.datetime(2023,10,20, 15,0,45)
},
{
    "id":2,
    "firstName":"Sheldon",
    "lastName":"Quigley",
    "email":"hbingley1@plala.or.jp",
    "phone":Row(phone_number="+7 813 117",mobile_number="+7 813 119"),
    "courses" : [3],
    "is_customer" : True,
    "amount_paid" : 11000,
    "customer_from" : datetime.date(2022,5,16),
    "last_update_ts" : datetime.datetime(2023,10,20, 15,0,45)
},
{
    "id":3,
    "firstName":"Terrill",
    "lastName":"Hills",
    "email":"rshawe2@51.la",
    "phone":Row(phone_number="+63 739 292 7942",mobile_number="+63 739 292 7943"),
    "courses" : [2,4],
    "is_customer" : True,
    "amount_paid" : 24000,
    "customer_from" : datetime.date(2022,4,11),
    "last_update_ts" : datetime.datetime(2023,10,20, 15,0,45)
},
{
    "id":4,
    "firstName":"Miles",
    "lastName":"Cummerata",
    "email":"yraigatt3@nature.com",
    "phone":Row(phone_number=None,mobile_number=None),
    "courses" : [],
    "is_customer" : False,
    "amount_paid" : None,
    "customer_from" : None,
    "last_update_ts" : datetime.datetime(2023,10,20, 15,0,45)
},
{
    "id":5,
    "firstName":"Mavis",
    "lastName":"Schultz",
    "email":"kmeus4@upenn.edu",
    "phone":Row(phone_number="+372 285 771 1911",mobile_number=None),
    "courses" : [],
    "is_customer" : False,
    "amount_paid" : None,
    "customer_from" : None,
    "last_update_ts" : datetime.datetime(2023,10,20, 15,0,45)
}]

In [4]:
import pandas as pd

In [5]:
df = spark.createDataFrame(pd.DataFrame(data))

In [6]:
df.show()

+---+---------+---------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|firstName| lastName|               email|               phone|courses|is_customer|amount_paid|customer_from|     last_update_ts|
+---+---------+---------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Terry| Medhurst|     atuny0@sohu.com|{+63 791 675 8914...| [1, 2]|       true|    10200.0|   2022-03-10|2023-10-20 15:00:45|
|  2|  Sheldon|  Quigley|hbingley1@plala.o...|{+7 813 117, +7 8...|    [3]|       true|    11000.0|   2022-05-16|2023-10-20 15:00:45|
|  3|  Terrill|    Hills|       rshawe2@51.la|{+63 739 292 7942...| [2, 4]|       true|    24000.0|   2022-04-11|2023-10-20 15:00:45|
|  4|    Miles|Cummerata|yraigatt3@nature.com|        {null, null}|     []|      false|        NaN|         null|2023-10-20 15:00:45|
|  5|    Mavis|  Schultz|    kmeus4@upenn.edu|{+372 285 771 19

In [7]:
from pyspark.sql.functions import concat,lit

In [8]:
df.select('id','firstName','lastName').show()

+---+---------+---------+
| id|firstName| lastName|
+---+---------+---------+
|  1|    Terry| Medhurst|
|  2|  Sheldon|  Quigley|
|  3|  Terrill|    Hills|
|  4|    Miles|Cummerata|
|  5|    Mavis|  Schultz|
+---+---------+---------+



In [10]:
df.select('id',concat('firstName',lit(', '),'lastName').alias('fullName')).show()

+---+----------------+
| id|        fullName|
+---+----------------+
|  1| Terry, Medhurst|
|  2|Sheldon, Quigley|
|  3|  Terrill, Hills|
|  4|Miles, Cummerata|
|  5|  Mavis, Schultz|
+---+----------------+



In [11]:
df.select('id','firstName','lastName').withColumn('fullName',concat('firstName',lit(', '),'lastName')).show()

+---+---------+---------+----------------+
| id|firstName| lastName|        fullName|
+---+---------+---------+----------------+
|  1|    Terry| Medhurst| Terry, Medhurst|
|  2|  Sheldon|  Quigley|Sheldon, Quigley|
|  3|  Terrill|    Hills|  Terrill, Hills|
|  4|    Miles|Cummerata|Miles, Cummerata|
|  5|    Mavis|  Schultz|  Mavis, Schultz|
+---+---------+---------+----------------+



In [13]:
## when use withColumn ignore the alias.
df.select('id','firstName','lastName').withColumn('fullName',concat('firstName',lit(', '),'lastName').alias('fn')).show()

+---+---------+---------+----------------+
| id|firstName| lastName|        fullName|
+---+---------+---------+----------------+
|  1|    Terry| Medhurst| Terry, Medhurst|
|  2|  Sheldon|  Quigley|Sheldon, Quigley|
|  3|  Terrill|    Hills|  Terrill, Hills|
|  4|    Miles|Cummerata|Miles, Cummerata|
|  5|    Mavis|  Schultz|  Mavis, Schultz|
+---+---------+---------+----------------+



In [14]:
df.select('id','firstName','lastName').withColumn('firstName',concat('firstName',lit(', '),'lastName')).show()

+---+----------------+---------+
| id|       firstName| lastName|
+---+----------------+---------+
|  1| Terry, Medhurst| Medhurst|
|  2|Sheldon, Quigley|  Quigley|
|  3|  Terrill, Hills|    Hills|
|  4|Miles, Cummerata|Cummerata|
|  5|  Mavis, Schultz|  Schultz|
+---+----------------+---------+



In [23]:
df.select('id','firstName','lastName').withColumn('fn','firstName')

PySparkTypeError: [NOT_COLUMN] Argument `col` should be a Column, got str.

In [17]:
from pyspark.sql.functions import col

In [22]:
df.select('id','firstName','lastName').withColumn('fn',col('firstName')).show()

+---+---------+---------+-------+
| id|firstName| lastName|     fn|
+---+---------+---------+-------+
|  1|    Terry| Medhurst|  Terry|
|  2|  Sheldon|  Quigley|Sheldon|
|  3|  Terrill|    Hills|Terrill|
|  4|    Miles|Cummerata|  Miles|
|  5|    Mavis|  Schultz|  Mavis|
+---+---------+---------+-------+



In [21]:
df.select('id','firstName','lastName').withColumn('fn',df['firstName']).show()

+---+---------+---------+-------+
| id|firstName| lastName|     fn|
+---+---------+---------+-------+
|  1|    Terry| Medhurst|  Terry|
|  2|  Sheldon|  Quigley|Sheldon|
|  3|  Terrill|    Hills|Terrill|
|  4|    Miles|Cummerata|  Miles|
|  5|    Mavis|  Schultz|  Mavis|
+---+---------+---------+-------+



## Add another column by name 'course_count' wher it contain number of courses the user of enrolled for.

In [24]:
df.select('id','courses').show()

+---+-------+
| id|courses|
+---+-------+
|  1| [1, 2]|
|  2|    [3]|
|  3| [2, 4]|
|  4|     []|
|  5|     []|
+---+-------+



In [25]:
df.select('id','courses').dtypes

[('id', 'bigint'), ('courses', 'array<bigint>')]

In [27]:
from pyspark.sql.functions import size

In [28]:
df.select('id','courses').withColumn('course_count',size('courses')).show()

+---+-------+------------+
| id|courses|course_count|
+---+-------+------------+
|  1| [1, 2]|           2|
|  2|    [3]|           1|
|  3| [2, 4]|           2|
|  4|     []|           0|
|  5|     []|           0|
+---+-------+------------+

