In [0]:
# Introduction
# Author: Martand Singh
# Gmail: martandsays@gmail.com
# Facebook: https://www.facebook.com/codemakerz
# Dataset: https://data.world/carlvlewis/u-s-weather-outliers-1964
# In this notebook we will set up our environment and learn:
# 1. How to read csv file into dataframe
# 2. How to explore dataframe
# 3. How to print schema
# 4. How to type cast
# 5. How to perform basic operations on dataset like select, filter, renaming column, cast column to other datatype

In [0]:
# Read the weather-anomalies CSV into a DataFrame through spark.read:
#   format() - the input file format (csv here; json, parquet, ... are also possible)
#   option() - extra reader settings; "header" = "true" uses the first row as the header
#   load()   - the path of the input file
# No schema is supplied and inferSchema is not enabled, so every column is read as a string.

df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load('/FileStore/tables/weather_anomalies_1964_2013.csv')
)

In [0]:
# Display the top records of the DataFrame; 20 is show()'s default row count, spelled out here.
df.show(n=20)

In [0]:
# Pass a row count to limit the output: now only 5 records are shown.
df.show(n=5)

In [0]:
df.printSchema() # Prints the schema of the DataFrame, i.e. each column together with its data type.

In [0]:
# Projection: select() lets you pick all columns or only a few of them.
# "*" selects every column. select() returns a new DataFrame instead of printing anything,
# so show() is chained on to display the first 5 records.
df.select("*").show(n=5)

In [0]:
# Project only three of the columns and display the first 5 records.
(
    df
    .select("date_str", "degrees_from_mean", "station_name")
    .show(n=5)
)

In [0]:
# Get the column names of the DataFrame.
df.columns

In [0]:
# Rename columns.
# Renaming is not an in-place operation: withColumnRenamed() returns a new DataFrame,
# so the result has to be stored in a variable. Here df_new is the new DataFrame
# with both columns renamed.
df_new = (
    df
    .withColumnRenamed("degrees_from_mean", "mean_degrees")
    .withColumnRenamed("min_temp", "minTemp")
)
df_new.show(n=5)

In [0]:
# Delete columns. drop() creates a new DataFrame, which is stored in df_new again.
# Note: the drop is applied to the original `df`, so df_new is rebound to this result
# and replaces whatever df_new referred to before.
df_new = df.drop("date_str", "degrees_from_mean")
df_new.show(n=5)

In [0]:
# Fetch the top 5 rows (records, not columns). Unlike show(), head() returns them
# as a list; you can check this using type().
df.head(n=5)

In [0]:
# Confirm the return type of head(5): it is a list.
type(df.head(n=5))

In [0]:
# Filter: as in SQL, the filter() method applies a criterion to the rows.
# Here only the records whose station_name equals "GROUSE" are kept and displayed.
df_new.filter(
    df_new["station_name"] == "GROUSE"
).show()

In [0]:
# Multiple criteria: combine the conditions with `&` (logical AND).
# Each comparison must be wrapped in parentheses because `&` binds more tightly
# than `==` in Python.
df_new.filter(
    (df_new["station_name"] == "GROUSE")
    & (df_new["type"] == "Weak Hot")
).show()

In [0]:
# Get the total number of records in df_new.
df_new.count()

In [0]:
# Add new columns with withColumn(). Two columns are derived from date_str:
#   month - the month value of date_str
#   year  - the year value of date_str
# date_str is first cast to the date type, and month() / year() are then applied to it.
# The result is built from the original `df` and stored in df_new.
from pyspark.sql.functions import month, year

df_new = (
    df
    .withColumn("month", month(df["date_str"].cast("date")))
    .withColumn("year", year(df["date_str"].cast("date")))
)
df_new.show()