In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date
# Obtain the Spark session used by every read below (getOrCreate returns the
# already-active session when there is one, otherwise it starts a new one).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Explicit schemas: each CSV gets its column names and types declared up front
# instead of relying on inference. Every schema is built from (name, type) pairs.
pets_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("PetID", StringType()),
        ("Name", StringType()),
        ("Kind", StringType()),
        ("Gender", StringType()),
        ("Age", IntegerType()),
        ("OwnerID", StringType()),
    )
])

owners_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("OwnerID", StringType()),
        ("Name", StringType()),
        ("Surname", StringType()),
        ("StreetAddress", StringType()),
        ("City", StringType()),
        ("State", StringType()),
        ("StateFull", StringType()),
        ("ZipCode", StringType()),
    )
])

proceduresdetails_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("ProcedureType", StringType()),
        ("ProcedureSubCode", StringType()),
        ("Description", StringType()),
        ("Price", DoubleType()),
    )
])

# ProcedureDate is declared as a string here; it is converted to a date after loading.
procedureshistory_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("PetID", StringType()),
        ("ProcedureDate", StringType()),
        ("ProcedureType", StringType()),
        ("ProcedureSubCode", StringType()),
    )
])

In [3]:
# Load each CSV into a Spark DataFrame, applying its explicit schema and
# treating the first line of every file as a header row.
pets = spark.read.schema(pets_schema).csv('data/Pets.csv', header=True)
owners = spark.read.schema(owners_schema).csv('data/Owners.csv', header=True)
proceduresdetails = (
    spark.read
    .schema(proceduresdetails_schema)
    .csv('data/ProceduresDetails.csv', header=True)
)

# ProcedureDate is read as a string and parsed into a date right after loading,
# using the 'yyyy/MM/dd' pattern.
# NOTE(review): a DateType field combined with the reader's dateFormat option
# may allow this to happen at read time -- confirm against the actual data.
procedureshistory = (
    spark.read
    .schema(procedureshistory_schema)
    .csv('data/ProceduresHistory.csv', header=True)
    .withColumn('ProcedureDate', to_date(col('ProcedureDate'), 'yyyy/MM/dd'))
)

In [17]:
# Change the column names to lowercase
def col_to_lowercase(df):
    """Return a DataFrame with every column name of ``df`` lowercased.

    All columns are renamed in one ``toDF`` call instead of rebuilding the
    DataFrame once per column with ``withColumnRenamed``. The iteration
    variable is deliberately not called ``col``, because this notebook also
    imports ``col`` from ``pyspark.sql.functions``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        DataFrame whose columns should be renamed. It is not modified.

    Returns
    -------
    pyspark.sql.DataFrame
        A new DataFrame with the same columns, in the same order, whose
        names are the lowercase form of the originals.
    """
    lowercase_names = [name.lower() for name in df.columns]
    # toDF takes the complete list of new names, positionally matched to the
    # existing columns.
    return df.toDF(*lowercase_names)
# Lowercase the column names of all four DataFrames, rebinding each name to
# its renamed counterpart.
pets, owners, proceduresdetails, procedureshistory = [
    col_to_lowercase(frame)
    for frame in (pets, owners, proceduresdetails, procedureshistory)
]