In [None]:
# This notebook provides basic examples of interacting with Spark through the pyspark API
# If it is run with the corresponding Docker compose file then the Web UI at http://localhost:4040
# shows details of the jobs executed on the cluster/local machine

In [None]:
from os.path import abspath
import requests

# Fetch sample data: download the CSV from SAMPLE_DATA_URL and save it to SAMPLE_DATA_CSV
SAMPLE_DATA_URL = 'https://github.com/owid/owid-datasets/raw/master/datasets/UK%20Nominal%20wage%20data%2C%20price%20data%2C%20and%20real%20wage%20%E2%80%93%20Bank%20of%20England%20(!Three%20centuries%20of%20macroeconomic/UK%20Nominal%20wage%20data%2C%20price%20data%2C%20and%20real%20wage%20%E2%80%93%20Bank%20of%20England%20(!Three%20centuries%20of%20macroeconomic.csv'
SAMPLE_DATA_CSV = abspath('uk-macroeconomic-data.csv')

# A timeout stops this cell from hanging indefinitely if the host does not respond
resp = requests.get(SAMPLE_DATA_URL, timeout=30)
# Fail here on an HTTP error instead of silently saving an error page as the "CSV"
resp.raise_for_status()
with open(SAMPLE_DATA_CSV, 'wb') as file_handle:
    file_handle.write(resp.content)

In [None]:
from pyspark.sql import SparkSession

# Spark session: configure the builder with a "local" master, then get or create the session
session_builder = SparkSession.builder.master("local")
spark = session_builder.getOrCreate()

In [None]:
# Read the sample CSV into a Spark DataFrame without supplying a schema.
# With no schema given every column defaults to string; column names come from the header row.
df_str = spark.read.csv(
    SAMPLE_DATA_CSV,
    sep=",",
    header=True,
)
df_str.printSchema()


In [None]:
# Read the sample CSV again, this time asking Spark to infer the column types.
# Inference needs an additional pass over the data, so it is not suited to large datasets.
df_inferred = spark.read.csv(
    SAMPLE_DATA_CSV,
    sep=",",
    header=True,
    inferSchema=True,
)
df_inferred.printSchema()

In [None]:
# Load CSV data into Spark DataFrame and provide the schema.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Define the schema explicitly: one StructField per CSV column (name, type, nullable).
# The name must match the `schema=` argument passed to the reader below; previously the
# definition was called `schema`, so the read raised NameError on a fresh kernel.
macroeconomic_schema = StructType([
    StructField("Entity", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Nominal Average Weekly Wages (2017)", DoubleType(), True),
    StructField("Spliced CPI (2015=100)", DoubleType(), True),
    StructField("Real Average Weekly Wages (2017)", DoubleType(), True),
])

# Read the CSV using the schema defined above
df_manual_schema = spark.read.csv(SAMPLE_DATA_CSV, header=True, schema=macroeconomic_schema, sep=",")
df_manual_schema.printSchema()

In [None]:
# The same filter is applied twice: once through the DataFrame API and once as a SQL query.
# SQL queries need the DataFrame registered as a temporary view first.
# The view's lifetime is tied to the session.

# DataFrame API
real_wages = df_manual_schema["Real Average Weekly Wages (2017)"]
filtered_df = df_manual_schema.filter(real_wages > 200.)
print(f'No. of Years where Real Average Weekly Wages > £200 (DataFrame API): {filtered_df.count()}')

# SQL query against the temporary view
df_manual_schema.createOrReplaceTempView('economy')
wage_query = "select * from economy where `Real Average Weekly Wages (2017)` > 200"
sql_df = spark.sql(wage_query)
print(f'No. of Years where Real Average Weekly Wages > £200 (SQL API): {sql_df.count()}')
