In [5]:
import pyspark
import os

# Start a SparkSession on the local master, or reuse one that already exists
sparkql = (
    pyspark.sql.SparkSession
    .builder
    .master('local')
    .getOrCreate()
)

# Directory and file name of the coffee CSV (relative to the working directory)
data_dir = './data'
coffee_data = 'coffee.csv'

# Read the CSV into a PySpark DataFrame; header=True takes column names from
# the first row. No schema is supplied, so the columns come back as strings
# (see the printed schema below).
coffee_df = sparkql.read.csv(
    path=os.path.join(data_dir, coffee_data),
    header=True,
)

# Inspect what was loaded: column names, schema, and the first four rows
print(coffee_df.columns)
print(coffee_df.schema)
coffee_df.show(4)

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Currency']
StructType([StructField('Date', StringType(), True), StructField('Open', StringType(), True), StructField('High', StringType(), True), StructField('Low', StringType(), True), StructField('Close', StringType(), True), StructField('Volume', StringType(), True), StructField('Currency', StringType(), True)])
+----------+------+-----+------+------+------+--------+
|      Date|  Open| High|   Low| Close|Volume|Currency|
+----------+------+-----+------+------+------+--------+
|2000-01-03|122.25|124.0| 116.1| 116.5|  6640|     USD|
|2000-01-04|116.25|120.5|115.75|116.25|  5492|     USD|
|2000-01-05| 115.0|121.0| 115.0| 118.6|  6165|     USD|
|2000-01-06| 119.0|121.4| 116.5|116.85|  5094|     USD|
+----------+------+-----+------+------+------+--------+
only showing top 4 rows



In [12]:
# Change the data types of columns
from pyspark.sql.functions import to_date
from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

# Parse the string Date column into a DateType column (pattern yyyy-MM-dd)
coffee_df = coffee_df.withColumn(
    'Date',
    to_date(coffee_df['Date'], format='yyyy-MM-dd'),
)
# Preview the parsed dates to confirm the format was understood
coffee_df.select('Date').show(4)

# The four price columns all become FloatType, in the same order as before
for price_col in ('Open', 'High', 'Low', 'Close'):
    coffee_df = coffee_df.withColumn(price_col, coffee_df[price_col].cast(FloatType()))

# Volume is cast to IntegerType
coffee_df = coffee_df.withColumn('Volume', coffee_df['Volume'].cast(IntegerType()))

# Print the schema to verify the new column types
print(coffee_df.schema)

+----------+
|      Date|
+----------+
|2000-01-03|
|2000-01-04|
|2000-01-05|
|2000-01-06|
+----------+
only showing top 4 rows

StructType([StructField('Date', DateType(), True), StructField('Open', FloatType(), True), StructField('High', FloatType(), True), StructField('Low', FloatType(), True), StructField('Close', FloatType(), True), StructField('Volume', IntegerType(), True), StructField('Currency', StringType(), True)])


In [16]:
from pyspark.sql.functions import round

# Add Open-Close and High-Low difference columns, each rounded to 2 decimals.
# NOTE: `round` here is pyspark.sql.functions.round (imported in this cell),
# which shadows Python's builtin round from this point on.
coffee_df = (
    coffee_df
    .withColumn('Open_Close_Diff', round(coffee_df['Open'] - coffee_df['Close'], 2))
    .withColumn('High_Low_Diff', round(coffee_df['High'] - coffee_df['Low'], 2))
)
coffee_df.show(4)

+----------+------+-----+------+------+------+--------+---------------+-------------+
|      Date|  Open| High|   Low| Close|Volume|Currency|Open_Close_Diff|High_Low_Diff|
+----------+------+-----+------+------+------+--------+---------------+-------------+
|2000-01-03|122.25|124.0| 116.1| 116.5|  6640|     USD|           5.75|          7.9|
|2000-01-04|116.25|120.5|115.75|116.25|  5492|     USD|            0.0|         4.75|
|2000-01-05| 115.0|121.0| 115.0| 118.6|  6165|     USD|           -3.6|          6.0|
|2000-01-06| 119.0|121.4| 116.5|116.85|  5094|     USD|           2.15|          4.9|
+----------+------+-----+------+------+------+--------+---------------+-------------+
only showing top 4 rows



In [41]:
from pyspark.sql.functions import lit, when

# Flag rows whose Volume is at least 100. The when/otherwise form (rather than
# the bare comparison) yields False, not null, when the comparison is null,
# so the resulting column is always a definite true/false.
coffee_df = coffee_df.withColumn(
    'volume_filter_100',
    when(coffee_df['Volume'] >= 100, lit(True)).otherwise(lit(False)),
)

# Sanity check: preview rows on each side of the flag
coffee_df.filter(coffee_df['volume_filter_100']).show(5)
coffee_df.filter(~coffee_df['volume_filter_100']).show(10)

+----------+------+------+------+------+------+--------+---------------+-------------+-----------------+
|      Date|  Open|  High|   Low| Close|Volume|Currency|Open_Close_Diff|High_Low_Diff|volume_filter_100|
+----------+------+------+------+------+------+--------+---------------+-------------+-----------------+
|2000-01-03|122.25| 124.0| 116.1| 116.5|  6640|     USD|           5.75|          7.9|             true|
|2000-01-04|116.25| 120.5|115.75|116.25|  5492|     USD|            0.0|         4.75|             true|
|2000-01-05| 115.0| 121.0| 115.0| 118.6|  6165|     USD|           -3.6|          6.0|             true|
|2000-01-06| 119.0| 121.4| 116.5|116.85|  5094|     USD|           2.15|          4.9|             true|
|2000-01-07|117.25|117.75| 113.8|114.15|  6855|     USD|            3.1|         3.95|             true|
+----------+------+------+------+------+------+--------+---------------+-------------+-----------------+
only showing top 5 rows

+----------+------+------+----