In [0]:
# Convert a String Column to Date Format
# Problem: Convert the "DOB" (Date of Birth) column from string format to yyyy-MM-dd.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date

# Sample rows: (name, date of birth written day-first as dd-MM-yyyy).
data = [
    ('Anjali', '12-02-1995'),
    ('Ramesh', '15-07-1998'),
    ('Priya', '09-11-2001'),
]
schema = StructType([
    StructField('Name', StringType()),
    StructField('DOB', StringType()),
])

# Parse the day-first strings with an explicit pattern so "DOB" is replaced
# by the value to_date() produces instead of staying a free-form string.
dob_parsed = to_date(col("DOB"), "dd-MM-yyyy")
df = spark.createDataFrame(data, schema).withColumn("DOB", dob_parsed)

display(df)

In [0]:
# Calculate Age from Date of Birth
# Problem: Create a new column "Age" by calculating the age from the "DOB" column.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date, trim, datediff, lit,round

# Extra functions needed only by this cell's age computation.
from pyspark.sql.functions import current_date, floor, months_between

# Sample rows: (name, date of birth as a yyyy-MM-dd string).
data = [
    ('Anjali', '1993-06-25'),
    ('Ramesh', '2000-03-10'),
    ('Priya', '1996-12-15'),
]
schema = StructType([
    StructField('Name', StringType()),
    StructField('DOB', StringType()),
])

# Age is the number of *completed* years between DOB and today.
# months_between / 12 follows the calendar (no fixed 365-day year), and
# floor() keeps someone who is 28.5 years old at 28 instead of rounding to 29.
age_in_years = floor(months_between(current_date(), col("DOB")) / 12).cast("int")

df = (
    spark.createDataFrame(data, schema)
    .withColumn("DOB", to_date(col("DOB"), "yyyy-MM-dd"))
    .withColumn("Age", age_in_years)
)

display(df)


In [0]:
# Filter Records for People Born After 2000
# Problem: Keep only the people born after January 1, 2000.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col

# Sample rows: (name, date of birth as a yyyy-MM-dd string).
data = [
    ('Suresh', '2002-04-10'),
    ('Sunita', '1998-08-30'),
    ('Priya', '2003-01-25'),
]
schema = StructType([
    StructField('Name', StringType()),
    StructField('DOB', StringType()),
])

df = spark.createDataFrame(data, schema)

# "DOB" is still a string column here, so this is a string comparison; it
# matches chronological order because every value is zero-padded yyyy-MM-dd.
born_after_cutoff = col("DOB") > "2000-01-01"
f = df.filter(born_after_cutoff)
display(f)

In [0]:
# Find the Month and Year of a Date
# Problem: Extract the month and year from the "Joining Date" column.

from pyspark.sql.types import StructType, StructField, StringType ,DateType
from pyspark.sql.functions import col, month, year, to_date, date_format

# Sample rows: (name, joining date as a yyyy-MM-dd string).
data = [('Mahesh', '2020-05-12'), ('Neha', '2019-08-09'), ('Ajay', '2021-03-21')]
schema = StructType([
    StructField('Name', StringType()),
    StructField('Joining Date', StringType()),
])

df = (
    spark.createDataFrame(data, schema)
    .withColumn('Joining Date', to_date(col('Joining Date'), 'yyyy-MM-dd'))
)

# Derive the new columns directly on df. Building them on a separate
# projection and joining back on "Joining Date" would duplicate rows whenever
# two people share a joining date, and adds a shuffle for no benefit.
df_1 = (
    df
    .withColumn('Month', date_format(col('Joining Date'), "MMMM"))  # full month name
    .withColumn('Year', year(col('Joining Date')))
)

display(df_1)

In [0]:
# Find the Difference Between Two Dates
# Problem: Calculate the number of days between "Start Date" and "End Date".

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date, date_format,datediff

# Sample rows: (name, start date, end date), both dates as yyyy-MM-dd strings.
data = [
    ('Mahesh', '2023-01-01', '2023-02-01'),
    ('Neha', '2022-07-10', '2022-08-01'),
    ('Ajay', '2023-03-15', '2023-03-20'),
]
schema = StructType([
    StructField('Name', StringType()),
    StructField('Start Date', StringType()),
    StructField('End Date', StringType()),
])

# Parse both string columns in place, then append the day count.
# datediff(end, start) is called with the end date first.
df = (
    spark.createDataFrame(data, schema)
    .withColumn("End Date", to_date(col("End Date"), "yyyy-MM-dd"))
    .withColumn("Start Date", to_date(col("Start Date"), "yyyy-MM-dd"))
    .withColumn("date_diff_days", datediff(col("End Date"), col("Start Date")))
)

display(df)



In [0]:
# Add Days to a Date
# Problem: Add 30 days to the "Booking Date" column.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date,date_add

# Sample rows: (name, booking date as a yyyy-MM-dd string).
data = [('Mahesh', '2023-06-10'), ('Neha', '2023-07-01'), ('Ajay', '2023-05-25')]
schema = StructType([
    StructField('Name', StringType()),
    StructField('Booking Date', StringType()),
])

df = (
    spark.createDataFrame(data, schema)
    .withColumn("Booking Date", to_date(col("Booking Date"), "yyyy-MM-dd"))
)

# The result gets its own name: binding it to `date_add` would replace the
# imported date_add() function with a DataFrame for the rest of the session.
df_plus_30 = df.withColumn("After adding date", date_add(col("Booking Date"), 30))

display(df_plus_30)

In [0]:
# Find the Day of the Week for a Date
# Problem: Find the day of the week for the "DOB" column.

# Import everything this cell uses so it runs on a fresh kernel; in
# particular `dayofweek` is not imported by any earlier cell.
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date, dayofweek, date_format

# Sample rows: (name, date of birth as a yyyy-MM-dd string).
data = [('Mahesh', '2023-06-10'), ('Neha', '2023-07-01'), ('Ajay', '2023-05-25')]
schema = StructType([
    StructField('Name', StringType()),
    StructField('DOB', StringType()),
])

df = (
    spark.createDataFrame(data, schema)
    .withColumn("DOB", to_date(col("DOB"), "yyyy-MM-dd"))
    # Numeric weekday; per the PySpark docs 1 = Sunday ... 7 = Saturday.
    .withColumn("Day of the week", dayofweek(col("DOB")))
    # Full weekday name, e.g. "Saturday".
    .withColumn("Day", date_format(col("DOB"), "EEEE"))
)

display(df)


In [0]:
# Group Records by Month
# Problem: Group records by the month of the "Order Date" and count the number of orders.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date, date_format

# Sample rows: (customer name, order date as a yyyy-MM-dd string).
data = [('Mahesh', '2023-06-10'), ('Neha', '2023-07-01'), ('Ajay', '2023-07-25')]

schema = StructType([
    StructField('Name', StringType()),
    StructField('Order Date', StringType()),
])

df = (
    spark.createDataFrame(data, schema)
    .withColumn("Order Date", to_date(col("Order Date"), "yyyy-MM-dd"))
)

# Group on year-month (not the month alone) so that orders from the same
# calendar month in different years are never merged into one bucket.
orders_per_month = (
    df
    .groupBy(date_format(col("Order Date"), "yyyy-MM").alias("Order Month"))
    .count()
    .withColumnRenamed("count", "Order Count")
    .orderBy("Order Month")
)

display(orders_per_month)


In [0]:
# Find Records for the Current Year
# Problem: Keep only the records whose "Purchase Date" falls in the current year (2025).

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date

# Extra function needed only by this cell's year filter.
from pyspark.sql.functions import year

# Year treated as "current" for this exercise.
TARGET_YEAR = 2025

# Sample rows: (name, purchase date as a yyyy-MM-dd string).
data = [('Mahesh', '2023-06-10'), ('Neha', '2025-01-01'), ('Ajay', '2025-02-02')]
schema = StructType([
    StructField('Name', StringType()),
    StructField('Purchase Date', StringType()),
])

df = (
    spark.createDataFrame(data, schema)
    .withColumn("Purchase Date", to_date(col("Purchase Date"), "yyyy-MM-dd"))
)

# Match on the year itself: a "> 2025-01-01" comparison would drop purchases
# made on January 1 and would also let later years through.
f = df.filter(year(col("Purchase Date")) == TARGET_YEAR)

# Display separately so `f` stays bound to the filtered DataFrame.
display(f)


In [0]:
# Identify Leap Year Dates
# Problem: Identify if the "Event Date" is in a leap year.

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, to_date,year

# Sample rows: a single "Event Date" column of yyyy-MM-dd strings.
data = [("2020-02-29",), ("2021-06-15",), ("2024-03-10",), ("2023-07-20",)]
schema = StructType([StructField('Event Date', StringType())])

df = (
    spark.createDataFrame(data, schema)
    .withColumn("Event Date", to_date(col("Event Date"), "yyyy-MM-dd"))
)

# Gregorian rule: divisible by 4, except century years, which must also be
# divisible by 400 (2000 is a leap year; 1900 and 2100 are not).
event_year = year(col("Event Date"))
is_leap_year = ((event_year % 4 == 0) & (event_year % 100 != 0)) | (event_year % 400 == 0)

leap = df.filter(is_leap_year)

# Display separately so `leap` stays bound to the filtered DataFrame.
display(leap)