In [1]:
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, ArrayType, Row

In [2]:
# Obtain the notebook's SparkSession, registered under the app name below.
spark = (
    SparkSession.builder
    .appName("sparksql-tutorial")
    .getOrCreate()
)

23/02/12 07:06:32 WARN Utils: Your hostname, Pavans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.143 instead (on interface en0)
23/02/12 07:06:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/12 07:06:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [16]:
# Explicit schema for the employee DataFrame. Each field is given as
# (name, data type, nullable); every column is declared non-nullable and
# Technologies is an array of strings.
schema = StructType([
    StructField("FirstName", StringType(), False),
    StructField("LastName", StringType(), False),
    StructField("Age", IntegerType(), False),
    StructField("Place", StringType(), False),
    StructField("Salary", LongType(), False),
    StructField("Department", StringType(), False),
    StructField("Technologies", ArrayType(StringType()), False),
])

In [17]:
# Sample employee records. Each tuple lists its values in the schema's column
# order: FirstName, LastName, Age, Place, Salary, Department, Technologies.
# Every tuple is unpacked into a positional Row.
rows = [
    Row(*employee)
    for employee in (
        ("Pavan", "Mantha", 36, "Hyderabad", 273567, "SPS",
         ["java", "spring boot", "data science", "react", "node", "Terraform"]),
        ("Arun", "Boppudi", 36, "Guntur", 303567, "Aero",
         ["java", "spring boot", "cloud", "react", "node", "druid", "kafka"]),
        ("Ravi", "Vadlamani", 26, "Visakapatnam", 213567, "Aero",
         ["express", "data structures", "react"]),
        ("Mahender", "M", 21, "Hyderabad", 153567, "Aero",
         ["java", "spring boot", "express", "react", "node"]),
        ("Manoj", "Manoj", 21, "Guntur", 183567, "Aero",
         ["express", "react"]),
        ("Manoj", "Velecheti", 21, "Visakapatnam", 223567, "Aero",
         ["java", "spring boot", "express", "react"]),
    )
]

In [18]:
# Hand the local list of Rows to the SparkContext to obtain an RDD.
parallel_rows = (
    spark.sparkContext
    .parallelize(rows)
)

In [19]:
# Build the DataFrame from the RDD using the explicit schema; verifySchema=True
# asks Spark to check each row's values against the declared column types.
df = spark.createDataFrame(
    parallel_rows,
    schema=schema,
    verifySchema=True,
)

In [20]:
# Print the rows as a table; with show()'s defaults, long cell values
# (here the Technologies arrays) are cut off with "...".
df.show()
# Print each column's name, type and nullability as a tree.
df.printSchema()

+---------+---------+---+------------+------+----------+--------------------+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|
+---------+---------+---+------------+------+----------+--------------------+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|
+---------+---------+---+------------+------+----------+--------------------+

root
 |-- FirstName: string (nullable = false)
 |-- LastName: string (nullable = false)
 |-- Age: integer (nullable = false)
 |-- Place: string (nullable = false)
 |-- Salary: long (nullable = false)
 |-- Department: st

In [0]:
# Interactive lookup of the withColumn docstring; exploratory only — it
# assigns nothing, so no later cell depends on it.
help(df.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to add multiple columns can generate big
    plans which can cause performance issues and even `StackOv

In [12]:
from pyspark.sql import functions as F

In [21]:
# Replace Salary with a double-typed version of itself (the schema declared it
# as long). Because the name already exists, withColumn overwrites the column.
df = df.withColumn(
    "Salary",
    F.col("Salary").cast("Double"),
)

In [22]:
# Inspect the schema after the cast: Salary should now be reported as double.
df.printSchema()

root
 |-- FirstName: string (nullable = false)
 |-- LastName: string (nullable = false)
 |-- Age: integer (nullable = false)
 |-- Place: string (nullable = false)
 |-- Salary: double (nullable = false)
 |-- Department: string (nullable = false)
 |-- Technologies: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [23]:
# Display the rows to see the Salary values rendered as doubles.
df.show()

+---------+---------+---+------------+--------+----------+--------------------+
|FirstName| LastName|Age|       Place|  Salary|Department|        Technologies|
+---------+---------+---+------------+--------+----------+--------------------+
|    Pavan|   Mantha| 36|   Hyderabad|273567.0|       SPS|[java, spring boo...|
|     Arun|  Boppudi| 36|      Guntur|303567.0|      Aero|[java, spring boo...|
|     Ravi|Vadlamani| 26|Visakapatnam|213567.0|      Aero|[express, data st...|
| Mahender|        M| 21|   Hyderabad|153567.0|      Aero|[java, spring boo...|
|    Manoj|    Manoj| 21|      Guntur|183567.0|      Aero|    [express, react]|
|    Manoj|Velecheti| 21|Visakapatnam|223567.0|      Aero|[java, spring boo...|
+---------+---------+---+------------+--------+----------+--------------------+



In [24]:
# Double the salary of every employee.
# NOTE: df is rebuilt from itself, so re-running this cell doubles Salary again.
df = df.withColumn(
    "Salary",
    F.col("Salary") * 2,
)

In [25]:
# Show the updated salaries; truncate=False prints every cell in full,
# including the complete Technologies arrays.
df.show(truncate=False)

+---------+---------+---+------------+--------+----------+---------------------------------------------------------+
|FirstName|LastName |Age|Place       |Salary  |Department|Technologies                                             |
+---------+---------+---+------------+--------+----------+---------------------------------------------------------+
|Pavan    |Mantha   |36 |Hyderabad   |547134.0|SPS       |[java, spring boot, data science, react, node, Terraform]|
|Arun     |Boppudi  |36 |Guntur      |607134.0|Aero      |[java, spring boot, cloud, react, node, druid, kafka]    |
|Ravi     |Vadlamani|26 |Visakapatnam|427134.0|Aero      |[express, data structures, react]                        |
|Mahender |M        |21 |Hyderabad   |307134.0|Aero      |[java, spring boot, express, react, node]                |
|Manoj    |Manoj    |21 |Guntur      |367134.0|Aero      |[express, react]                                         |
|Manoj    |Velecheti|21 |Visakapatnam|447134.0|Aero      |[java,

In [26]:
# Add an Experience column holding the literal value 10 for every row.
df = df.withColumn(colName="Experience", col=F.lit(10))

In [27]:
# Display the rows to confirm the new Experience column is present.
df.show()

+---------+---------+---+------------+--------+----------+--------------------+----------+
|FirstName| LastName|Age|       Place|  Salary|Department|        Technologies|Experience|
+---------+---------+---+------------+--------+----------+--------------------+----------+
|    Pavan|   Mantha| 36|   Hyderabad|547134.0|       SPS|[java, spring boo...|        10|
|     Arun|  Boppudi| 36|      Guntur|607134.0|      Aero|[java, spring boo...|        10|
|     Ravi|Vadlamani| 26|Visakapatnam|427134.0|      Aero|[express, data st...|        10|
| Mahender|        M| 21|   Hyderabad|307134.0|      Aero|[java, spring boo...|        10|
|    Manoj|    Manoj| 21|      Guntur|367134.0|      Aero|    [express, react]|        10|
|    Manoj|Velecheti| 21|Visakapatnam|447134.0|      Aero|[java, spring boo...|        10|
+---------+---------+---+------------+--------+----------+--------------------+----------+



In [28]:
# Halve Experience for employees younger than 26 and keep it as-is for
# everyone else. The `/ 2` branch produces a double, and Spark promotes the
# fallback branch to the same type, so the column becomes double without any
# arithmetic on the fallback value.
# NOTE: df is rebuilt from itself, so re-running this cell halves the
# under-26 values again.
df = df.withColumn(
    "Experience",
    F.when(F.col("Age") < 26, F.col("Experience") / 2)
     .otherwise(F.col("Experience")),
)

In [29]:
# Display the result: rows with Age below 26 carry half the Experience value,
# the remaining rows keep the full value.
df.show()

+---------+---------+---+------------+--------+----------+--------------------+----------+
|FirstName| LastName|Age|       Place|  Salary|Department|        Technologies|Experience|
+---------+---------+---+------------+--------+----------+--------------------+----------+
|    Pavan|   Mantha| 36|   Hyderabad|547134.0|       SPS|[java, spring boo...|      10.0|
|     Arun|  Boppudi| 36|      Guntur|607134.0|      Aero|[java, spring boo...|      10.0|
|     Ravi|Vadlamani| 26|Visakapatnam|427134.0|      Aero|[express, data st...|      10.0|
| Mahender|        M| 21|   Hyderabad|307134.0|      Aero|[java, spring boo...|       5.0|
|    Manoj|    Manoj| 21|      Guntur|367134.0|      Aero|    [express, react]|       5.0|
|    Manoj|Velecheti| 21|Visakapatnam|447134.0|      Aero|[java, spring boo...|       5.0|
+---------+---------+---+------------+--------+----------+--------------------+----------+

