In [7]:
from pyspark.sql import SparkSession


# Create the SparkSession used by every cell below (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("PySpark Data Handling")
    .getOrCreate()
)

25/02/04 10:18:28 INFO SparkEnv: Registering MapOutputTracker
25/02/04 10:18:28 INFO SparkEnv: Registering BlockManagerMaster
25/02/04 10:18:29 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/04 10:18:29 INFO SparkEnv: Registering OutputCommitCoordinator


In [8]:
# Column names, listed in the same order as the fields of each record
columns = ["id", "name", "city", "date", "amount", "is_active"]

# Sample records: every field except id is stored as text, and several values
# are malformed (see the per-row notes)
data = [
    (1, "John Doe", "Bangalore", "2023-01-15", "152.75", "True"),
    (2, "Jane Smith", "Delhi", "2023-05-20", "89.50", "False"),
    (3, "Robert Brown", "Mumbai", "InvalidDate", "200.00", "True"),  # date is not a date
    (4, "Linda White", "Kolkata", "2023-02-29", None, "yes"),  # 2023 has no Feb 29; amount is missing
    (5, "Mike Green", "Chennai", "2023-08-10", "NaN", "1"),  # amount is the literal text "NaN"
    (6, "Sarah Blue", "Hyderabad", "InvalidDate", "300.40", "No"),  # date is not a date
]

# Build the DataFrame from the sample rows. Only column names are supplied,
# so Spark infers each column's type from the Python values.
df = spark.createDataFrame(data, columns)

# Render the rows as a text table
df.show()

                                                                                

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+------------+---------+-----------+------+---------+



In [9]:
# Print the schema Spark inferred for df: column names, types and nullability

df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



# Handle Integer Column

In [10]:
# Attribute-style access: evaluates to a Column expression, not the column's data
df.id

Column<'id'>

In [11]:
# Bracket-style access: yields the same kind of Column expression as df.id
df['id']

Column<'id'>

In [12]:
# Show only the rows whose id is greater than 3 (where is an alias of filter)
df.where(df["id"] > 3).show()

+---+-----------+---------+-----------+------+---------+
| id|       name|     city|       date|amount|is_active|
+---+-----------+---------+-----------+------+---------+
|  4|Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5| Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6| Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+-----------+---------+-----------+------+---------+



In [13]:
# Preview a derived column holding twice the id; df itself is not reassigned here
doubled_id = df["id"] * 2
df.withColumn("id_double", doubled_id).show()

+---+------------+---------+-----------+------+---------+---------+
| id|        name|     city|       date|amount|is_active|id_double|
+---+------------+---------+-----------+------+---------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|        2|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|        4|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|        6|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|        8|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|       10|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|       12|
+---+------------+---------+-----------+------+---------+---------+



In [15]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from pyspark.sql.functions import col

# Replace the id column (inferred as long) with the same values cast to IntegerType
id_as_int = col("id").cast(IntegerType())
df = df.withColumn("id", id_as_int)

In [16]:
# Display the rows after the id cast; the printed values look the same as before
df.show()

+---+------------+---------+-----------+------+---------+
| id|        name|     city|       date|amount|is_active|
+---+------------+---------+-----------+------+---------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|
+---+------------+---------+-----------+------+---------+



In [18]:
# Confirm the cast: id should now be reported as integer instead of long
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)



# String Columns

In [21]:
from pyspark.sql.functions import *

# Add an upper-cased copy of name as a new column, then display the result
name_in_caps = upper(df["name"])
df = df.withColumn("name_upper", name_in_caps)
df.show()

+---+------------+---------+-----------+------+---------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|
+---+------------+---------+-----------+------+---------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|  JANE SMITH|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|  SARAH BLUE|
+---+------------+---------+-----------+------+---------+------------+



In [25]:
# Show only the rows whose city begins with "B"
city_starts_with_b = df["city"].startswith("B")
df.filter(city_starts_with_b).show()

+---+--------+---------+----------+------+---------+----------+
| id|    name|     city|      date|amount|is_active|name_upper|
+---+--------+---------+----------+------+---------+----------+
|  1|John Doe|Bangalore|2023-01-15|152.75|     True|  JOHN DOE|
+---+--------+---------+----------+------+---------+----------+



In [26]:

# Add a lower-cased copy of name as a new column, then display the result
name_in_lowercase = lower(df["name"])
df = df.withColumn("name_lower", name_in_lowercase)
df.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



# Handle Float Column (amount)

In [28]:
# Review the current schema and rows before converting amount (still a string column here)
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- is_active: string (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- name_lower: string (nullable = true)

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20| 89.50|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate|200.00|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
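|  6|  Sarah Blue|Hyderabad|InvalidDate|300.40|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+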

In [30]:
# Convert amount from text to a floating-point column, replacing the original.
# In the output below the text "NaN" becomes a numeric NaN and the missing value stays null.
# NOTE(review): 'float' is Spark's single-precision type; consider 'double' if
# more precision is needed for these amounts.
amount_as_float = col("amount").cast("float")
df = df.withColumn("amount", amount_as_float)

# Verify the new type and inspect the converted values
df.printSchema()
df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- is_active: string (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- name_lower: string (nullable = true)

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
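|  6|  Sarah Blue|Hyderabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+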

In [32]:
# Replace missing amounts with 0 in a separate DataFrame; df is left unchanged.
# In the output below both the null and the NaN amount come back as 0.0.
amount_default = {"amount": 0}
df_filled = df.na.fill(amount_default)
df_filled.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|   0.0|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   0.0|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+



# Handle Date Column

In [33]:
# Re-display df: amount still contains null and NaN here because the filled
# result was assigned to df_filled, not back to df
df.show()

+---+------------+---------+-----------+------+---------+------------+------------+
| id|        name|     city|       date|amount|is_active|  name_upper|  name_lower|
+---+------------+---------+-----------+------+---------+------------+------------+
|  1|    John Doe|Bangalore| 2023-01-15|152.75|     True|    JOHN DOE|    john doe|
|  2|  Jane Smith|    Delhi| 2023-05-20|  89.5|    False|  JANE SMITH|  jane smith|
|  3|Robert Brown|   Mumbai|InvalidDate| 200.0|     True|ROBERT BROWN|robert brown|
|  4| Linda White|  Kolkata| 2023-02-29|  null|      yes| LINDA WHITE| linda white|
|  5|  Mike Green|  Chennai| 2023-08-10|   NaN|        1|  MIKE GREEN|  mike green|
|  6|  Sarah Blue|Hyderabad|InvalidDate| 300.4|       No|  SARAH BLUE|  sarah blue|
+---+------------+---------+-----------+------+---------+------------+------------+

