In [0]:
# 02 Creating Spark Data Frame to Select and Rename Columns
import datetime
from pyspark.sql import Row
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]

import pandas as pd

In [0]:
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)

#
users_df = spark.createDataFrame(pd.DataFrame(users))
users_df.show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

## Overview of Narrow and Wide Transformations

Let us get an overview of Narrow and Wide Transformations.
* Here are the functions related to narrow transformations. Narrow transformations doesn't result in shuffling. These are also known as row level transformations.
  * `df.select`
  * `df.filter`
  * `df.withColumn`
  * `df.withColumnRenamed`
  * `df.drop`
* Here are the functions related to wide transformations.
  * `df.distinct`
  * `df.union` or any set operation
  * `df.join` or any join operation
  * `df.groupBy`
  * `df.sort` or `df.orderBy`
* Any function that result in shuffling is wide transformation. For all the wide transformations, we have to deal with group of records based on a key.

ie narrow transformation: select , filter,withcoulmn , withcoulmnRenamed, drop 
wide transformation : distinct , union ,join,groupBy, sort ,orderBy

In [0]:
## select function 
## select will always return dataFrame , its projection function 
users_df.select('*').show()
print(users_df.select('id', 'first_name', 'last_name').show())
print(users_df.select(['id', 'first_name', 'last_name']).show())
# Defining alias to the dataframe
print(users_df.alias('u').select('u.*').show())
## here we have given userdf alias as u and then used U.* it comes handy in joins 
## alias for dataFrame 
print(users_df.alias('u').select('u.id', 'u.first_name', 'u.last_name').show())

from pyspark.sql.functions import col
from pyspark.sql.functions import col, concat, lit
## col is convert string to column 
## col is majourly used to apply function on columns 




+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [0]:
users_df.select(
    col('id'), 
    'first_name', 
    'last_name',
    concat(col('first_name'), lit(', '), col('last_name')).alias('full_name')
).show()

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
## select expresion 
## note:  selectExpr === sql type syntax
## note : selectExpr(*args)
print(users_df.selectExpr('*').show())
# Defining alias to the dataframe
print(users_df.alias('u').selectExpr('u.*').show())

print(users_df.selectExpr(['id', 'first_name', 'last_name']).show())



+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [0]:
from pyspark.sql.functions import concat, lit, col
users_df.select('id', 'first_name', 'last_name',concat(col('first_name'), lit(', '), col('last_name')).alias('full_name')).show()

## selct expresstion

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
# Using selectExpr to use Spark SQL Functions
users_df.selectExpr('id', 'first_name', 'last_name', "concat(first_name, ', ', last_name) AS full_name").show()

## note how concat is using sql style syntax 

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
users_df.createOrReplaceTempView('users')
spark.sql("""
    SELECT id, first_name, last_name,
        concat(first_name, ', ', last_name) AS full_name
    FROM users
"""). \
    show()

## creating temperary view and quering it 

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
## refreiang column s
from pyspark.sql.functions import col
a=col("id")
a
type(a)

Out[10]: pyspark.sql.column.Column

In [0]:
print(users_df.select('id', col('first_name'), 'last_name').show())
print(users_df.selectExpr('id', 'first_name', 'last_name').show())
print(users_df.selectExpr(col('id'), 'first_name', 'last_name').show())
# This does not work as selectExpr can only take column names or SQL style expressions on column names
## note in selectExpr we dont use col 

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+

None
+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+

None


[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-1965803855347220>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0mprint[0m[0;34m([0m[0musers_df[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m'id'[0m[0;34m,[0m [0mcol[0m[0;34m([0m[0;34m'first_name'[0m[0;34m)[0m[0;34m,[0m [0;34m'last_name'[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0mprint[0m[0;34m([0m[0musers_df[0m[0;34m.[0m[0mselectExpr[0m[0;34m([0m[0;34m'id'[0m[0;34m,[0m [0;34m'first_name'[0m[0;34m,[0m [0;34m'last_name'[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 3[0;31m [0mprint[0m[0;34m([0m[0musers_df[0m[0;34m.[0m[0mselectExpr[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m'id'[0m[0;34m)[

In [0]:
## note imp 
users_df.alias('u').select(u['id'], col('first_name'), 'last_name').show()
# This does not work as there is no object by name u in this session. ie pandas [] style uses object 
# This shall wor
users_df.alias('u').select('u.id', col('first_name'), 'last_name').show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-1965803855347221>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m## note imp[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 2[0;31m [0musers_df[0m[0;34m.[0m[0malias[0m[0;34m([0m[0;34m'u'[0m[0;34m)[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0mu[0m[0;34m[[0m[0;34m'id'[0m[0;34m][0m[0;34m,[0m [0mcol[0m[0;34m([0m[0;34m'first_name'[0m[0;34m)[0m[0;34m,[0m [0;34m'last_name'[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      3[0m [0;31m# This does not work as there is no object by name u in this session. ie pandas [] style uses object[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      4[0m [0;31m# This shall wor[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0musers_df[0m[0;34m.

In [0]:
from pyspark.sql.functions import concat, lit, col
users_df. \
    select(
        'id', 'first_name', 'last_name', 
        concat(users_df['first_name'], lit(', '), col('last_name')).alias('full_name')
    ). \
    show()


+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
users_df.alias('u').selectExpr('id', 'first_name', 'last_name', "concat(u.first_name, ', ', u.last_name) AS full_name").show()

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [0]:
## understanding col function 
users_df['id']
## creating col type object via data frame refrense 

Out[15]: Column<'id'>

In [0]:
from pyspark.sql.functions import col
col('id')


Out[16]: Column<'id'>

In [0]:
users_df.select('id', 'first_name', 'last_name').show()

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



In [0]:
takeColumn= ['id', 'first_name', 'last_name']
users_df.select(*takeColumn).show()
## here it has  we seect columns 

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



There are quite a few functions available on top of column type
* `cast` (can be used on all important data frame functions such as `select`, `filter`, `groupBy`, `orderBy`, etc)
* `asc`, `desc` (typically used as part of `sort` or `orderBy`)
* `contains` (typically used as part of `filter` or `where`)

In [0]:
users_df.select('id', 'customer_from').dtypes
## get data types of id, customer_from
from pyspark.sql.functions import date_format
users_df.select(
    col('id'), 
    date_format('customer_from', 'yyyyMMdd')
).show()

+---+------------------------------------+
| id|date_format(customer_from, yyyyMMdd)|
+---+------------------------------------+
|  1|                            20210115|
|  2|                            20210214|
|  3|                            20210121|
|  4|                                null|
|  5|                                null|
+---+------------------------------------+



In [0]:
cols = [col('id'), date_format('customer_from', 'yyyyMMdd').cast('int').alias('customer_from')]
users_df.select(*cols).show()




In [0]:
from pyspark.sql.functions import col, lit, concat
full_name_col = concat(col('first_name'), lit(', '), col('last_name'))
print(type(full_name_col))
## full_name_col is column of type object 
full_name_alias = full_name_col.alias('full_name')
print(type(full_name_alias))
### ie column column 




In [0]:
# * Convert data type of customer_from date to numeric type
print(users_df.columns)
users_df.customer_from.cast("int")
from pyspark.sql.functions import date_format
date_format('customer_from', 'yyyyMMdd')
customer_from_alias = date_format('customer_from', 'yyyyMMdd').cast('int').alias('customer_from')
users_df.select('id', customer_from_alias).show()

## note yyyy === year, MM === Month ,
users_df.select('id', customer_from_alias).dtypes
## here we have pass python variable




In [0]:
## understanding lit function 
users_df. \
    selectExpr('id', '(amount_paid + 25) AS amount_paid'). \
    show()
## via selecExpr 
# Returns null
# amount_paid is converted to string by implicitly using lit.
# Spark returns null when we perform arithmetic operations on noncompatible types
## 
print(users_df.select('id', 'amount_paid' + lit(25.0), lit(50) + lit(25)).show())



## same thing with select using lit
users_df.select('id',(col('amount_paid') + 25).alias('amount_paid')).show()

print(lit(25.0))



There are multiple ways to rename Spark Data Frame Columns or Expressions.
* We can rename column or expression using `alias` as part of `select`
* We can add or rename column or expression using `withColumn` on top of Data Frame.
* We can rename one column at a time using `withColumnRenamed` on top of Data Frame.
* We typically use `withColumn` to perform row level transformations and then to provide a name to the result. If we provide the same name as existing column, then the column will be replaced with new one.
* If we want to just rename the column then it is better to use `withColumnRenamed`.
* If we want to apply any transformation, we need to either use `select` or `withColumn`
* We can rename bunch of columns using `toDF`.

In [0]:
## withColumn will add or rename column or expression 
## withcoulnRenamed === rename one column at time
## withColumn === perform row level transformation then provide a name to the result , if provide same name extension then column will
## replace with new on e
## if want to do transformation we use select or withColumn
## rename with multiple column using toDF 



In [0]:
from pyspark.sql.functions import concat, lit



In [0]:
# Equivalent logic using select
users_df. \
    select(
        'id', 'first_name', 'last_name',
        concat('first_name', lit(', '), 'last_name').alias('full_name')
    ). \
    show()



In [0]:
users_df. \
    select('id', 'first_name', 'last_name'). \
    withColumn('full_name', concat('first_name', lit(', '), 'last_name')). \
    show()



In [0]:
from pyspark.sql.functions import size

users_df.select('id', 'courses'). \
    withColumn('course_count', size('courses')). \
    show()

## note : size(col) return the length of array or map columns

+---+-------+------------+
| id|courses|course_count|
+---+-------+------------+
|  1| [1, 2]|           2|
|  2|    [3]|           1|
|  3| [2, 4]|           2|
|  4|     []|           0|
|  5|     []|           0|
+---+-------+------------+



In [0]:

#     Rename id to user_id
#     Rename first_name to user_first_name
#     Rename last_name to user_last_name

users_df.select('*').withColumnRenamed('id','user_id').withColumnRenamed('first_name','user_first_name').withColumnRenamed('last_name','user_last_name').show()
  

+-------+---------------+--------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|user_id|user_first_name|user_last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+-------+---------------+--------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|      1|         Corrie|  Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|      2|       Nikolaus|       Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|      3|         Orelie|        Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|      4|          Ashby|      Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|    

In [0]:

# * Rename `id` to `user_id`
# * Rename `first_name` to `user_first_name`
# * Rename `last_name` to `user_last_name`
# * Also add new column by name `user_full_name` which is derived by concatenating `first_name` and `last_name` with `, ` in between.

# Using select
users_df. \
    select(
        col('id').alias('user_id'),
        col('first_name').alias('user_first_name'),
        col('last_name').alias('user_last_name'),
        concat(col('first_name'), lit(', '), col('last_name')).alias('user_full_name')
    ). \
    show()



+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|      user_full_name|
+-------+---------------+--------------+--------------------+
|      1|         Corrie|  Van den Oord|Corrie, Van den Oord|
|      2|       Nikolaus|       Brewitt|   Nikolaus, Brewitt|
|      3|         Orelie|        Penney|      Orelie, Penney|
|      4|          Ashby|      Maddocks|     Ashby, Maddocks|
|      5|           Kurt|          Rome|          Kurt, Rome|
+-------+---------------+--------------+--------------------+



In [0]:
users_df. \
    select(
        users_df['id'].alias('user_id'),
        users_df['first_name'].alias('user_first_name'),
        users_df['last_name'].alias('user_last_name'),
        concat(users_df['first_name'], lit(', '), users_df['last_name']).alias('user_full_name')
    ). \
    show()

In [0]:
# Using withColumn and alias (first withColumn and then select)
users_df. \
    withColumn('user_full_name', concat(col('first_name'), lit(', '), col('last_name'))). \
    select(
        users_df['id'].alias('user_id'),
        users_df['first_name'].alias('user_first_name'),
        users_df['last_name'].alias('user_last_name'),
        'user_full_name'
    ). \
    show()

In [0]:
users_df. \
    withColumn('user_full_name', concat(users_df['first_name'], lit(', '), users_df['last_name'])). \
    select(
        users_df['id'].alias('user_id'),
        users_df['first_name'].alias('user_first_name'),
        users_df['last_name'].alias('user_last_name'),
        'user_full_name'
    ). \
    show()

In [0]:
# required columns from original list
required_columns = ['id', 'first_name', 'last_name', 'email', 'phone_numbers', 'courses']

# new column name list
target_column_names = ['user_id', 'user_first_name', 'user_last_name', 'user_email', 'user_phone_numbers', 'enrolled_courses']



In [0]:


# * Get the data from required columns and rename the columns to new names as per target column names.
# * We should be able to use `select` to get the data from required columns.
# * We should be able to rename the columns using `toDF`
# * `select` and `toDF` takes variable number of arguments. We can use `*required_columns` while invoking `select` to get the data from required columns. It is applicable for `toDF` as well.


#  note: toDF(*cols) method of pyspark.sql.dataframe.DataFrame instance

In [0]:
# required columns from original list
required_columns = ['id', 'first_name', 'last_name', 'email', 'phone_numbers', 'courses']

# new column name list
target_column_names = ['user_id', 'user_first_name', 'user_last_name', 'user_email', 'user_phone_numbers', 'enrolled_courses']


print(users_df.select(required_columns).show())

# Note: python variable is passed without '' in select 
print(users_df.select(required_columns).toDF(*target_column_names).show())

# 
# Note: renaming column with help of toDF

+---+----------+------------+--------------------+--------------------+-------+
| id|first_name|   last_name|               email|       phone_numbers|courses|
+---+----------+------------+--------------------+--------------------+-------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|
|  5|      Kurt|        Rome|krome4@shutterfly...|{+1 817 934 7142,...|     []|
+---+----------+------------+--------------------+--------------------+-------+

None
+-------+---------------+--------------+--------------------+--------------------+----------------+
|user_id|user_first_name|user_last_name|          user_email|  user_phone_numbers|enrolled_courses|
+-------+---------------+--------------+--------------------+-------------

In [0]:
def myDF(*cols):
    print(type(cols))
    print(cols)
    
myDF(*['f1', 'f2'])

<class 'tuple'>
('f1', 'f2')
