# Dates and Timestamps

You will often find yourself working with time and date information. Let's walk through some ways you can deal with it!

In [None]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
# Starting Spark may take a little while on a local computer.
spark = (
    SparkSession.builder
    .appName("dates")
    .getOrCreate()
)

In [None]:
# Local alternative, if appl_stock.csv sits next to the notebook:
#   df = spark.read.csv("appl_stock.csv", header=True, inferSchema=True)
import pandas as pd

# pandas does not parse dates on its own: without parse_dates the Date column
# would reach Spark as a plain string. Parsing it here gives Spark a real
# timestamp column for the date functions used below.
stock_pdf = pd.read_csv(
    "https://storage.googleapis.com/neurals/data/appl_stock.csv",
    parse_dates=["Date"],
)
df = spark.createDataFrame(stock_pdf)

# Confirm the column types; the rows themselves are displayed in the next cell.
df.printSchema()

In [None]:
# Preview the data (20 rows is show()'s default).
df.show(n=20)

Let's walk through how to grab parts of the timestamp data

In [None]:
from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format

In [None]:
# Day-of-month component of each row's Date value.
df.select(dayofmonth("Date")).show()

In [None]:
# Hour component of each row's Date value.
# NOTE(review): if the file only stores calendar dates, this is presumably 0 for every row; confirm against the data.
df.select(hour("Date")).show()

In [None]:
# Day-of-year component of each row's Date value.
df.select(dayofyear("Date")).show()

In [None]:
# Month component of each row's Date value.
df.select(month("Date")).show()

So, for example, let's say we wanted to know the average closing price per year. Easy! We can combine a groupBy with the year() function. First, let's extract the year:

In [None]:
# Year component of each row's Date value.
df.select(year("Date")).show()

In [None]:
# Display the data with an extra "Year" column derived from Date.
# df itself is not modified: withColumn returns a new DataFrame.
df.withColumn("Year", year("Date")).show()

In [None]:
# Keep a DataFrame that carries the derived Year column for the aggregations.
newdf = df.withColumn("Year", year("Date"))

# Average per year, then keep only the averaged Year and Close columns.
# The column names are the raw "avg(...)" labels produced by the aggregation.
newdf.groupBy("Year").mean().select("avg(Year)", "avg(Close)").show()

Still not quite presentable! Let's use the .alias method as well as format_number() to clean this up!

In [None]:
# Average closing price per year, formatted to two decimals.
# Averaging only "Close" keeps the grouping key "Year" as-is, so there is no
# need to average the year and rename "avg(Year)" back afterwards.
yearly_close = newdf.groupBy("Year").mean("Close")

# Build the presentable frame first and display it separately: show() returns
# None, so chaining it onto the assignment would leave `result` empty.
result = (
    yearly_close
    .select("Year", format_number("avg(Close)", 2).alias("Mean Close"))
    .orderBy("Year")  # grouped output has no guaranteed order
)
result.show()

Perfect! Now you know how to work with Date and Timestamp information!