In [4]:
from pyspark.sql import SparkSession
import os
# Obtain a SparkSession (getOrCreate reuses an existing one if present),
# configured with the "local[*]" master.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Read the parquet data stored under the workspace directory and preview
# the first five rows.
df = spark.read.parquet(os.path.join("/opt/workspace", "parquet"))
df.show(5)

+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|              _links|           externals|              genres| id|               image|language|              name|             network|        officialSite| premiered|rating|runtime|           schedule| status|             summary|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+---+--------------------+--------+------------------+--------------------+--------------------+----------+------+-------+-------------------+-------+--------------------+----------+--------------------+----------+------+
|{NULL, {http://ap...|{tt2699110, 27043...|    [Drama, Romance]|127|{http://static.tv...| English|        The Affair|{{US, United Stat...|http://

# Selecting columns using string names

In [20]:
# Project three columns by passing their names as plain strings.
df.select("id", "name", "runtime").show(5)

+---+------------------+-------+
| id|              name|runtime|
+---+------------------+-------+
|127|        The Affair|     60|
|128|   Jane the Virgin|     60|
|129|          Marry Me|     30|
|130|Two and a Half Men|     30|
|131|       About a Boy|     30|
+---+------------------+-------+


# Selecting columns using column objects

In [21]:
import pyspark.sql.functions as F
# Same projection as with string names, but every entry is an explicit
# Column object built with F.col.
df.select([F.col(name) for name in ("id", "name", "runtime")]).show(5)

+---+------------------+-------+
| id|              name|runtime|
+---+------------------+-------+
|127|        The Affair|     60|
|128|   Jane the Virgin|     60|
|129|          Marry Me|     30|
|130|Two and a Half Men|     30|
|131|       About a Boy|     30|
+---+------------------+-------+


# Use `alias` to rename a column

In [22]:
import pyspark.sql.functions as F
# Column.alias renames a column in the result: "name" is shown as
# "name_of_show"; the other two columns keep their names.
df.select(
    F.col("id"), F.col("name").alias("name_of_show"), F.col("runtime")
).show(1)

+---+------------+-------+
| id|name_of_show|runtime|
+---+------------+-------+
|127|  The Affair|     60|
+---+------------+-------+


# Use `withColumn` to add a new column

In [23]:
# Two steps: first project the three columns, then append a derived column
# with withColumn (runtime multiplied by 0.8).
df.select(
    F.col("id"), F.col("name"), F.col("runtime")
).withColumn(
    "runtime_without_ads", F.col("runtime") * 0.8
).show(5)

+---+------------------+-------+-------------------+
| id|              name|runtime|runtime_without_ads|
+---+------------------+-------+-------------------+
|127|        The Affair|     60|               48.0|
|128|   Jane the Virgin|     60|               48.0|
|129|          Marry Me|     30|               24.0|
|130|Two and a Half Men|     30|               24.0|
|131|       About a Boy|     30|               24.0|
+---+------------------+-------+-------------------+


# Use `select` and `alias` to add a new column

In [24]:
# A single select both keeps the three existing columns and adds the derived
# one: the runtime * 0.8 expression is named via alias.
df.select(
    F.col("id"),
    F.col("name"),
    F.col("runtime"),
    (F.col("runtime") * 0.8).alias("runtime_without_ads"),
).show(5)

+---+------------------+-------+-------------------+
| id|              name|runtime|runtime_without_ads|
+---+------------------+-------+-------------------+
|127|        The Affair|     60|               48.0|
|128|   Jane the Virgin|     60|               48.0|
|129|          Marry Me|     30|               24.0|
|130|Two and a Half Men|     30|               24.0|
|131|       About a Boy|     30|               24.0|
+---+------------------+-------+-------------------+
