In [1]:
# findspark.init() is called before pyspark is imported below.
# NOTE(review): presumably this is what makes pyspark importable in this environment — confirm.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Session shared by every later cell: master 'local[1]', application name 'pyspark_fresh'.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('pyspark_fresh')
    .getOrCreate()
)

In [2]:
from pyspark.sql import Row

In [3]:
import datetime
# Sample user records, one dict per user. `phone` is a nested Row, `courses` is a list,
# and the records with id 4 and 5 carry None for amount_paid and customer_from
# (and for some or all of the phone fields).
data = [
    {
        "id": 1,
        "firstName": "Terry",
        "lastName": "Medhurst",
        "email": "atuny0@sohu.com",
        "phone": Row(phone_number="+63 791 675 8914", mobile_number="+63 791 675 8945"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 10200,
        "customer_from": datetime.date(2022, 3, 10),
        "last_update_ts": datetime.datetime(2023, 10, 20, 15, 0, 45),
    },
    {
        "id": 2,
        "firstName": "Sheldon",
        "lastName": "Quigley",
        "email": "hbingley1@plala.or.jp",
        "phone": Row(phone_number="+7 813 117", mobile_number="+7 813 119"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 11000,
        "customer_from": datetime.date(2022, 5, 16),
        "last_update_ts": datetime.datetime(2023, 10, 20, 15, 0, 45),
    },
    {
        "id": 3,
        "firstName": "Terrill",
        "lastName": "Hills",
        "email": "rshawe2@51.la",
        "phone": Row(phone_number="+63 739 292 7942", mobile_number="+63 739 292 7943"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 24000,
        "customer_from": datetime.date(2022, 4, 11),
        "last_update_ts": datetime.datetime(2023, 10, 20, 15, 0, 45),
    },
    {
        "id": 4,
        "firstName": "Miles",
        "lastName": "Cummerata",
        "email": "yraigatt3@nature.com",
        "phone": Row(phone_number=None, mobile_number=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_update_ts": datetime.datetime(2023, 10, 20, 15, 0, 45),
    },
    {
        "id": 5,
        "firstName": "Mavis",
        "lastName": "Schultz",
        "email": "kmeus4@upenn.edu",
        "phone": Row(phone_number="+372 285 771 1911", mobile_number=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_update_ts": datetime.datetime(2023, 10, 20, 15, 0, 45),
    },
]

In [4]:
import pandas as pd

In [5]:
# Build the Spark DataFrame from the records via an intermediate pandas DataFrame.
# NOTE(review): the df.show() output lists amount_paid as 10200.0 / NaN rather than
# 10200 / null, so the pandas round trip appears to turn the None amounts into NaN
# and the integer amounts into floats — confirm whether that is intended.
df = spark.createDataFrame(pd.DataFrame(data))

In [6]:
# Print the DataFrame as a text table; long values are cut off with "..." in the output.
df.show()

+---+---------+---------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|firstName| lastName|               email|               phone|courses|is_customer|amount_paid|customer_from|     last_update_ts|
+---+---------+---------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Terry| Medhurst|     atuny0@sohu.com|{+63 791 675 8914...| [1, 2]|       true|    10200.0|   2022-03-10|2023-10-20 15:00:45|
|  2|  Sheldon|  Quigley|hbingley1@plala.o...|{+7 813 117, +7 8...|    [3]|       true|    11000.0|   2022-05-16|2023-10-20 15:00:45|
|  3|  Terrill|    Hills|       rshawe2@51.la|{+63 739 292 7942...| [2, 4]|       true|    24000.0|   2022-04-11|2023-10-20 15:00:45|
|  4|    Miles|Cummerata|yraigatt3@nature.com|        {null, null}|     []|      false|        NaN|         null|2023-10-20 15:00:45|
|  5|    Mavis|  Schultz|    kmeus4@upenn.edu|{+372 285 771 19

- Rename id to user_id
- Rename firstName to user_first_name
- Rename lastName to user_last_name
- Also add a new column named user_full_name, derived by concatenating firstName and lastName with ", " (a comma followed by a space) in between

In [8]:
from pyspark.sql.functions import col,lit,concat

In [9]:
# Column expression for the 'id' column, kept in a variable so its methods can be inspected.
user_id = col('id')

In [10]:
# Display the documentation of Column.alias, the method used for renaming columns.
help(user_id.alias)

Help on method alias in module pyspark.sql.column:

alias(*alias: str, **kwargs: Any) -> 'Column' method of pyspark.sql.column.Column instance
    Returns this column aliased with a new name or names (in the case of expressions that
    return more than one column, such as explode).
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    alias : str
        desired column names (collects all positional arguments passed)
    
    Other Parameters
    ----------------
    metadata: dict
        a dict of information to be stored in ``metadata`` attribute of the
        corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword
        only argument)
    
        .. versionchanged:: 2.2.0
           Added optional ``metadata`` argument.
    
    Returns
    -------
    :class:`Column`
        Column representing whether each element of Column is aliased with new name or names.
   

In [11]:
# using select
# One projection does everything: alias() renames the three columns and
# concat() builds user_full_name from the original column names.
(
    df
    .select(
        col('id').alias('user_id'),
        col('firstName').alias('user_first_name'),
        col('lastName').alias('user_last_name'),
        concat(col('firstName'), lit(', '), col('lastName')).alias('user_full_name'),
    )
    .show()
)

+-------+---------------+--------------+----------------+
|user_id|user_first_name|user_last_name|  user_full_name|
+-------+---------------+--------------+----------------+
|      1|          Terry|      Medhurst| Terry, Medhurst|
|      2|        Sheldon|       Quigley|Sheldon, Quigley|
|      3|        Terrill|         Hills|  Terrill, Hills|
|      4|          Miles|     Cummerata|Miles, Cummerata|
|      5|          Mavis|       Schultz|  Mavis, Schultz|
+-------+---------------+--------------+----------------+



In [12]:
# Same projection as with col(), but the columns are looked up on the DataFrame itself (df['...']).
(
    df
    .select(
        df['id'].alias('user_id'),
        df['firstName'].alias('user_first_name'),
        df['lastName'].alias('user_last_name'),
        concat(df['firstName'], lit(', '), df['lastName']).alias('user_full_name'),
    )
    .show()
)

+-------+---------------+--------------+----------------+
|user_id|user_first_name|user_last_name|  user_full_name|
+-------+---------------+--------------+----------------+
|      1|          Terry|      Medhurst| Terry, Medhurst|
|      2|        Sheldon|       Quigley|Sheldon, Quigley|
|      3|        Terrill|         Hills|  Terrill, Hills|
|      4|          Miles|     Cummerata|Miles, Cummerata|
|      5|          Mavis|       Schultz|  Mavis, Schultz|
+-------+---------------+--------------+----------------+



In [13]:
# using withColumn and alias
# This cell demonstrates a failure on purpose: select() has already renamed the
# columns, so the chained withColumn() can no longer resolve 'firstName' / 'lastName'.
# The AnalysisException is caught and printed so the error stays visible without
# stopping a "Restart & Run All" of the notebook.
from pyspark.sql.utils import AnalysisException

try:
    (
        df
        .select(
            col('id').alias('user_id'),
            col('firstName').alias('user_first_name'),
            col('lastName').alias('user_last_name'),
        )
        # Refers to the pre-rename names, which no longer exist after the select above.
        .withColumn('user_full_name', concat(col('firstName'), lit(', '), col('lastName')))
        .show()
    )
except AnalysisException as exc:
    # Print in the same "AnalysisException: <message>" form a traceback would end with.
    print(f"{type(exc).__name__}: {exc}")

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `firstName` cannot be resolved. Did you mean one of the following? [`user_first_name`, `user_id`, `user_last_name`].;
'Project [user_id#111L, user_first_name#112, user_last_name#113, concat('firstName, , , 'lastName) AS user_full_name#117]
+- Project [id#0L AS user_id#111L, firstName#1 AS user_first_name#112, lastName#2 AS user_last_name#113]
   +- LogicalRDD [id#0L, firstName#1, lastName#2, email#3, phone#4, courses#5, is_customer#6, amount_paid#7, customer_from#8, last_update_ts#9], false


In [14]:
# Rename first, then derive user_full_name from the *renamed* columns,
# since those are the only names available after the select.
(
    df
    .select(
        col('id').alias('user_id'),
        col('firstName').alias('user_first_name'),
        col('lastName').alias('user_last_name'),
    )
    .withColumn(
        'user_full_name',
        concat(col('user_first_name'), lit(', '), col('user_last_name')),
    )
    .show()
)

+-------+---------------+--------------+----------------+
|user_id|user_first_name|user_last_name|  user_full_name|
+-------+---------------+--------------+----------------+
|      1|          Terry|      Medhurst| Terry, Medhurst|
|      2|        Sheldon|       Quigley|Sheldon, Quigley|
|      3|        Terrill|         Hills|  Terrill, Hills|
|      4|          Miles|     Cummerata|Miles, Cummerata|
|      5|          Mavis|       Schultz|  Mavis, Schultz|
+-------+---------------+--------------+----------------+



In [16]:
# Add user_full_name while the original column names still exist,
# then rename the other columns and carry user_full_name through the select.
(
    df
    .withColumn(
        'user_full_name',
        concat(col('firstName'), lit(', '), col('lastName')),
    )
    .select(
        col('id').alias('user_id'),
        col('firstName').alias('user_first_name'),
        col('lastName').alias('user_last_name'),
        'user_full_name',
    )
    .show()
)

+-------+---------------+--------------+----------------+
|user_id|user_first_name|user_last_name|  user_full_name|
+-------+---------------+--------------+----------------+
|      1|          Terry|      Medhurst| Terry, Medhurst|
|      2|        Sheldon|       Quigley|Sheldon, Quigley|
|      3|        Terrill|         Hills|  Terrill, Hills|
|      4|          Miles|     Cummerata|Miles, Cummerata|
|      5|          Mavis|       Schultz|  Mavis, Schultz|
+-------+---------------+--------------+----------------+



In [17]:
# Same as the withColumn-then-select approach, but the concat inputs are
# looked up on the DataFrame (df['...']) instead of through col().
(
    df
    .withColumn(
        'user_full_name',
        concat(df['firstName'], lit(', '), df['lastName']),
    )
    .select(
        col('id').alias('user_id'),
        col('firstName').alias('user_first_name'),
        col('lastName').alias('user_last_name'),
        'user_full_name',
    )
    .show()
)

+-------+---------------+--------------+----------------+
|user_id|user_first_name|user_last_name|  user_full_name|
+-------+---------------+--------------+----------------+
|      1|          Terry|      Medhurst| Terry, Medhurst|
|      2|        Sheldon|       Quigley|Sheldon, Quigley|
|      3|        Terrill|         Hills|  Terrill, Hills|
|      4|          Miles|     Cummerata|Miles, Cummerata|
|      5|          Mavis|       Schultz|  Mavis, Schultz|
+-------+---------------+--------------+----------------+

