In [1]:
import findspark

# findspark.init() is called before the pyspark import below so that the
# local Spark installation is discoverable from this kernel.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session for this notebook; getOrCreate() hands back an already
# running session instead of starting a second one when the cell is re-run.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('pyspark_fresh')
    .getOrCreate()
)

In [2]:
from pyspark.sql import Row

In [4]:
import datetime
# Sample customer records used to build the demo DataFrame.
# "phone" is a nested Row (phone_number, mobile_number); records 4 and 5 are
# non-customers and carry None for amount_paid / customer_from.
LAST_UPDATE_TS = datetime.datetime(2023, 10, 20, 15, 0, 45)  # same for every record

data = [
    {
        "id": 1,
        "firstName": "Terry",
        "lastName": "Medhurst",
        "email": "atuny0@sohu.com",
        "phone": Row(
            phone_number="+63 791 675 8914",
            mobile_number="+63 791 675 8945",
        ),
        "is_customer": True,
        "amount_paid": 10200,
        "customer_from": datetime.date(2022, 3, 10),
        "last_update_ts": LAST_UPDATE_TS,
    },
    {
        "id": 2,
        "firstName": "Sheldon",
        "lastName": "Quigley",
        "email": "hbingley1@plala.or.jp",
        "phone": Row(
            phone_number="+7 813 117",
            mobile_number="+7 813 119",
        ),
        "is_customer": True,
        "amount_paid": 11000,
        "customer_from": datetime.date(2022, 5, 16),
        "last_update_ts": LAST_UPDATE_TS,
    },
    {
        "id": 3,
        "firstName": "Terrill",
        "lastName": "Hills",
        "email": "rshawe2@51.la",
        "phone": Row(
            phone_number="+63 739 292 7942",
            mobile_number="+63 739 292 7943",
        ),
        "is_customer": True,
        "amount_paid": 24000,
        "customer_from": datetime.date(2022, 4, 11),
        "last_update_ts": LAST_UPDATE_TS,
    },
    {
        "id": 4,
        "firstName": "Miles",
        "lastName": "Cummerata",
        "email": "yraigatt3@nature.com",
        # Both struct fields are null for this record.
        "phone": Row(phone_number=None, mobile_number=None),
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_update_ts": LAST_UPDATE_TS,
    },
    {
        "id": 5,
        "firstName": "Mavis",
        "lastName": "Schultz",
        "email": "kmeus4@upenn.edu",
        # Only the mobile number is missing here.
        "phone": Row(phone_number="+372 285 771 1911", mobile_number=None),
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_update_ts": LAST_UPDATE_TS,
    },
]

In [5]:
# Turn every record dict into a Row first; no schema is passed, so Spark
# derives the column names and types from the rows themselves.
user_rows = [Row(**record) for record in data]
df = spark.createDataFrame(user_rows)

In [6]:
# Print the DataFrame as a text table; with the default settings long values
# (the email and phone columns below) are cut off and end in "...".
df.show()

+---+---------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
| id|firstName| lastName|               email|               phone|is_customer|amount_paid|customer_from|     last_update_ts|
+---+---------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Terry| Medhurst|     atuny0@sohu.com|{+63 791 675 8914...|       true|      10200|   2022-03-10|2023-10-20 15:00:45|
|  2|  Sheldon|  Quigley|hbingley1@plala.o...|{+7 813 117, +7 8...|       true|      11000|   2022-05-16|2023-10-20 15:00:45|
|  3|  Terrill|    Hills|       rshawe2@51.la|{+63 739 292 7942...|       true|      24000|   2022-04-11|2023-10-20 15:00:45|
|  4|    Miles|Cummerata|yraigatt3@nature.com|        {null, null}|      false|       null|         null|2023-10-20 15:00:45|
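|  5|    Mavis|  Schultz|    kmeus4@upenn.edu|{+372 285 771 191...|      false|       null|         null|2023-10-20 15:00:45|
+---+---------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+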

In [7]:
# Inspect the inferred schema as (column, type) pairs; the nested Row shows up
# as a struct with two string fields and the Python ints as bigint.
df.dtypes

[('id', 'bigint'),
 ('firstName', 'string'),
 ('lastName', 'string'),
 ('email', 'string'),
 ('phone', 'struct<phone_number:string,mobile_number:string>'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'bigint'),
 ('customer_from', 'date'),
 ('last_update_ts', 'timestamp')]

In [8]:
# Show the whole struct column; truncate=False prints the full values instead
# of the shortened "..." form seen in the plain df.show() output.
df.select('id','phone').show(truncate=False)

+---+------------------------------------+
|id |phone                               |
+---+------------------------------------+
|1  |{+63 791 675 8914, +63 791 675 8945}|
|2  |{+7 813 117, +7 813 119}            |
|3  |{+63 739 292 7942, +63 739 292 7943}|
|4  |{null, null}                        |
|5  |{+372 285 771 1911, null}           |
+---+------------------------------------+



In [9]:
# Pull individual struct fields out with dotted string paths; the resulting
# columns are named after the leaf field only (phone_number, mobile_number).
df.select('id','phone.phone_number','phone.mobile_number').show()

+---+-----------------+----------------+
| id|     phone_number|   mobile_number|
+---+-----------------+----------------+
|  1| +63 791 675 8914|+63 791 675 8945|
|  2|       +7 813 117|      +7 813 119|
|  3| +63 739 292 7942|+63 739 292 7943|
|  4|             null|            null|
|  5|+372 285 771 1911|            null|
+---+-----------------+----------------+



In [11]:
from pyspark.sql.functions import col

In [12]:
# Same field access via Column indexing; note that the output columns are
# labelled with the full path (phone.phone_number, phone.mobile_number) here.
df.select('id',col('phone')['phone_number'],col('phone')['mobile_number']).show()

+---+------------------+-------------------+
| id|phone.phone_number|phone.mobile_number|
+---+------------------+-------------------+
|  1|  +63 791 675 8914|   +63 791 675 8945|
|  2|        +7 813 117|         +7 813 119|
|  3|  +63 739 292 7942|   +63 739 292 7943|
|  4|              null|               null|
|  5| +372 285 771 1911|               null|
+---+------------------+-------------------+



In [14]:
# 'phone.*' expands every field of the struct into its own top-level column.
df.select('id','phone.*').show()

+---+-----------------+----------------+
| id|     phone_number|   mobile_number|
+---+-----------------+----------------+
|  1| +63 791 675 8914|+63 791 675 8945|
|  2|       +7 813 117|      +7 813 119|
|  3| +63 739 292 7942|+63 739 292 7943|
|  4|             null|            null|
|  5|+372 285 771 1911|            null|
+---+-----------------+----------------+

