In [0]:
from pyspark.sql.functions import lit, col,isnull, count
from pyspark.sql.types import StructType, StructField, IntegerType, TimestampType, StringType, DateType

## Reading Source Data - Single Table Queries

In [0]:
staff_Schema = StructType([StructField('Email', StringType(), True), StructField('Hire_Date', DateType(), True), StructField('IsActive', StringType(), False)])

animals_Schema = StructType([StructField('Name', StringType(), True), StructField('Species', StringType(), True), StructField('Primary_Color', StringType(), True), StructField('Implant_Chip_ID', StringType(), True), StructField('Breed', StringType(), True), StructField('Gender', StringType(), True), StructField('Birth_Date', DateType(), True), StructField('Pattern', StringType(), True), StructField('Admission_Date', DateType(), True)])

adoptions_Schema = StructType([StructField('Name', StringType(), True), StructField('Species', StringType(), True), StructField('Adopter_Email', StringType(), True), StructField('Adoption_Date', DateType(), True), StructField('Adoption_Fee', IntegerType(), True)])

persons_Schema = StructType([StructField('Email', StringType(), True), StructField('First_Name', StringType(), True), StructField('Last_Name', StringType(), True), StructField('Birth_Date', StringType(), True), StructField('Address', StringType(), True), StructField('State', StringType(), True), StructField('City', StringType(), True), StructField('Zip_Code', IntegerType(), True)])

vaccinations_Schema = StructType([StructField('Name', StringType(), True), StructField('Species', StringType(), True), StructField('Vaccination_Time', TimestampType(), True), StructField('Vaccine', StringType(), True), StructField('Batch', StringType(), True), StructField('Comments', StringType(), True), StructField('Email', StringType(), True)])

In [0]:
staff = spark.read.format('csv').schema(staff_Schema).load('/FileStore/tables/Animal_Shelter_Staff.csv')
animals = spark.read.format('csv').schema(animals_Schema).load('/FileStore/tables/Animal_Shelter_Animals.csv')
adoptions = spark.read.format('csv').schema(adoptions_Schema).load('/FileStore/tables/Animal_Shelter_Adoptions.csv')
persons = spark.read.format('csv').schema(persons_Schema).load('/FileStore/tables/Animal_Shelter_Persons.csv')
vaccinations = spark.read.format('csv').schema(vaccinations_Schema).load('/FileStore/tables/Animal_Shelter_Vaccinations.csv')

In [0]:
display(vaccinations.schema)

StructType([StructField('Name', StringType(), True), StructField('Species', StringType(), True), StructField('Vaccination_Time', TimestampType(), True), StructField('Vaccine', StringType(), True), StructField('Batch', StringType(), True), StructField('Comments', StringType(), True), StructField('Email', StringType(), True)])

In [0]:
display(staff)

Email,Hire_Date,IsActive
ashley.flores@animalshelter.com,2016-01-01,
dennis.hill@animalshelter.com,2018-10-07,
frances.hill@animalshelter.com,2016-01-01,
gerald.reyes@animalshelter.com,2018-03-20,
patrick.hughes@animalshelter.com,2018-12-15,
robin.murphy@animalshelter.com,2018-08-15,
sharon.davis@animalshelter.com,2016-01-01,
wanda.myers@animalshelter.com,2016-01-01,
wayne.carter@animalshelter.com,2018-04-02,


## Adding a literal

In [0]:
staff = staff.withColumn('IsActive', lit('Y'))

## Single Table : Display Staffs

In [0]:
display(staff)

Email,Hire_Date,IsActive
ashley.flores@animalshelter.com,2016-01-01,Y
dennis.hill@animalshelter.com,2018-10-07,Y
frances.hill@animalshelter.com,2016-01-01,Y
gerald.reyes@animalshelter.com,2018-03-20,Y
patrick.hughes@animalshelter.com,2018-12-15,Y
robin.murphy@animalshelter.com,2018-08-15,Y
sharon.davis@animalshelter.com,2016-01-01,Y
wanda.myers@animalshelter.com,2016-01-01,Y
wayne.carter@animalshelter.com,2018-04-02,Y


## Joins : Animals with Breeds Information

In [0]:
animals_With_Tags_Breeds = animals.join(adoptions, (animals.Name == adoptions.Name) & (animals.Species == adoptions.Species ), how = "inner").select(adoptions.Name, adoptions.Species,adoptions.Adopter_Email, adoptions.Adoption_Date, adoptions.Adoption_Fee, animals.Breed, animals.Implant_Chip_ID )

In [0]:
display(animals_With_Tags_Breeds)

Name,Species,Adopter_Email,Adoption_Date,Adoption_Fee,Breed,Implant_Chip_ID
Abby,Dog,patrick.hughes@animalshelter.com,2018-08-30,58,,FDFDB6FE-3347-4E80-8C8A-2E3235C6D1DE
Ace,Dog,justin.ruiz@hotmail.com,2019-10-26,68,,33D50C6B-9D2E-4EB1-8171-0466DEE4F349
Archie,Cat,patrick.hughes@animalshelter.com,2018-08-30,82,Persian,970D7094-AB66-4DCA-A0D1-0C16264989AF
Bailey,Dog,wayne.turner@icloud.com,2019-07-26,50,,36438BC9-E225-4038-97B2-1E28FD287957
Baloo,Rabbit,jesse.cox@yahoo.com,2017-12-16,58,English Lop,F5CE3A02-1EC7-431D-8A76-09369E8D798B
Beau,Dog,shirley.williams@outlook.com,2018-04-15,90,,4B94A68C-0C97-4F70-9275-35B3A9EEE8D9
Benji,Dog,sharon.davis@animalshelter.com,2018-11-18,97,English setter,646F0A76-14E4-42E7-9554-3AF1EA6CC78F
Brody,Dog,george.scott@hotmail.com,2019-02-21,83,Schnauzer,EB517826-E48A-41AE-A5FB-1BBECA23C05D
Brutus,Dog,virginia.baker@gmail.com,2019-01-28,66,Weimaraner,B7FAD096-7CD1-42A7-85D6-0C3E6599DBEB
Buddy,Cat,karen.smith@icloud.com,2019-09-27,73,,6D49B3F6-E075-4F33-97A3-1D4878EE1345


In [0]:
all_Animals_Adopted_Or_Not = animals.join(adoptions , (animals.Name == adoptions.Name) & (animals.Species== adoptions.Species), "left" ).select(adoptions.Name, adoptions.Species,adoptions.Adopter_Email, adoptions.Adoption_Date, adoptions.Adoption_Fee, animals.Breed, animals.Implant_Chip_ID )

In [0]:
display(all_Animals_Adopted_Or_Not)

Name,Species,Adopter_Email,Adoption_Date,Adoption_Fee,Breed,Implant_Chip_ID
Abby,Dog,patrick.hughes@animalshelter.com,2018-08-30,58.0,,FDFDB6FE-3347-4E80-8C8A-2E3235C6D1DE
Ace,Dog,justin.ruiz@hotmail.com,2019-10-26,68.0,,33D50C6B-9D2E-4EB1-8171-0466DEE4F349
,,,,,,F0769A5E-1A11-49F1-AC80-3F40A32EA158
,,,,,,CCFEF7E8-6FAD-4BA0-81EA-0611DD229E42
Archie,Cat,patrick.hughes@animalshelter.com,2018-08-30,82.0,Persian,970D7094-AB66-4DCA-A0D1-0C16264989AF
,,,,,,CD1528AD-C91D-47EA-9B70-3CACD5BDBE71
,,,,,,51D4CFD1-CD25-4C5A-AA52-0BFD771F8886
Bailey,Dog,wayne.turner@icloud.com,2019-07-26,50.0,,36438BC9-E225-4038-97B2-1E28FD287957
Baloo,Rabbit,jesse.cox@yahoo.com,2017-12-16,58.0,English Lop,F5CE3A02-1EC7-431D-8A76-09369E8D798B
Beau,Dog,shirley.williams@outlook.com,2018-04-15,90.0,,4B94A68C-0C97-4F70-9275-35B3A9EEE8D9


## Multi table Joins : Animals with Adopters

In [0]:
a = animals
ad = adoptions
p = persons
v = vaccinations
s = staff

In [0]:
a = animals
ad = adoptions
p = persons
animals_with_Adoption_Names = animals.join(adoptions, (a.Name == ad.Name) & (a.Species == ad.Species), "inner" ).join(persons, p.Email == ad.Adopter_Email).select(a.Name, a.Species, a.Implant_Chip_ID, a.Breed, ad.Adoption_Date, ad.Adoption_Fee, p.First_Name, p.Last_Name)

In [0]:
display(animals_with_Adoption_Names)

Name,Species,Implant_Chip_ID,Breed,Adoption_Date,Adoption_Fee,First_Name,Last_Name
Abby,Dog,FDFDB6FE-3347-4E80-8C8A-2E3235C6D1DE,,2018-08-30,58,Patrick,Hughes
Ace,Dog,33D50C6B-9D2E-4EB1-8171-0466DEE4F349,,2019-10-26,68,Justin,Ruiz
Archie,Cat,970D7094-AB66-4DCA-A0D1-0C16264989AF,Persian,2018-08-30,82,Patrick,Hughes
Bailey,Dog,36438BC9-E225-4038-97B2-1E28FD287957,,2019-07-26,50,Wayne,Turner
Baloo,Rabbit,F5CE3A02-1EC7-431D-8A76-09369E8D798B,English Lop,2017-12-16,58,Jesse,Cox
Beau,Dog,4B94A68C-0C97-4F70-9275-35B3A9EEE8D9,,2018-04-15,90,Shirley,Williams
Benji,Dog,646F0A76-14E4-42E7-9554-3AF1EA6CC78F,English setter,2018-11-18,97,Sharon,Davis
Brody,Dog,EB517826-E48A-41AE-A5FB-1BBECA23C05D,Schnauzer,2019-02-21,83,George,Scott
Brutus,Dog,B7FAD096-7CD1-42A7-85D6-0C3E6599DBEB,Weimaraner,2019-01-28,66,Virginia,Baker
Buddy,Cat,6D49B3F6-E075-4F33-97A3-1D4878EE1345,,2019-09-27,73,Karen,Smith


In [0]:

all_Animals_Ad_Or_NAd_with_Ad = adoptions.join(
                                                persons, p.Email == ad.Adopter_Email, "inner")\
                                    .join(animals,  
                                            (a.Name == ad.Name) & (a.Species == ad.Species), "right" )\
                                .select(a.Name, a.Species, a.Implant_Chip_ID, a.Breed, ad.Adoption_Date, \
                                    ad.Adoption_Fee, p.First_Name, p.Last_Name)

In [0]:
display(all_Animals_Ad_Or_NAd_with_Ad)

Name,Species,Implant_Chip_ID,Breed,Adoption_Date,Adoption_Fee,First_Name,Last_Name
Abby,Dog,FDFDB6FE-3347-4E80-8C8A-2E3235C6D1DE,,2018-08-30,58.0,Patrick,Hughes
Ace,Dog,33D50C6B-9D2E-4EB1-8171-0466DEE4F349,,2019-10-26,68.0,Justin,Ruiz
Angel,Dog,F0769A5E-1A11-49F1-AC80-3F40A32EA158,,,,,
April,Rabbit,CCFEF7E8-6FAD-4BA0-81EA-0611DD229E42,,,,,
Archie,Cat,970D7094-AB66-4DCA-A0D1-0C16264989AF,Persian,2018-08-30,82.0,Patrick,Hughes
Arya,Dog,CD1528AD-C91D-47EA-9B70-3CACD5BDBE71,,,,,
Aspen,Dog,51D4CFD1-CD25-4C5A-AA52-0BFD771F8886,,,,,
Bailey,Dog,36438BC9-E225-4038-97B2-1E28FD287957,,2019-07-26,50.0,Wayne,Turner
Baloo,Rabbit,F5CE3A02-1EC7-431D-8A76-09369E8D798B,English Lop,2017-12-16,58.0,Jesse,Cox
Beau,Dog,4B94A68C-0C97-4F70-9275-35B3A9EEE8D9,,2018-04-15,90.0,Shirley,Williams


## Multi Table Exercise : Animals and Vaccinations

In [0]:
a.display()

Name,Species,Primary_Color,Implant_Chip_ID,Breed,Gender,Birth_Date,Pattern,Admission_Date
Abby,Dog,Black,FDFDB6FE-3347-4E80-8C8A-2E3235C6D1DE,,F,1999-02-19,Tricolor,2016-07-19
Ace,Dog,Ginger,33D50C6B-9D2E-4EB1-8171-0466DEE4F349,,M,2005-12-19,Bicolor,2019-06-25
Angel,Dog,Brown,F0769A5E-1A11-49F1-AC80-3F40A32EA158,,F,2001-09-19,Tuxedo,2017-02-04
April,Rabbit,Gray,CCFEF7E8-6FAD-4BA0-81EA-0611DD229E42,,F,2005-01-27,Broken,2019-04-24
Archie,Cat,Ginger,970D7094-AB66-4DCA-A0D1-0C16264989AF,Persian,M,2009-08-26,Tricolor,2016-07-10
Arya,Dog,Gray,CD1528AD-C91D-47EA-9B70-3CACD5BDBE71,,F,2014-04-14,Bicolor,2018-06-10
Aspen,Dog,Brown,51D4CFD1-CD25-4C5A-AA52-0BFD771F8886,,F,2010-04-17,Tuxedo,2016-02-09
Bailey,Dog,Ginger,36438BC9-E225-4038-97B2-1E28FD287957,,F,2014-09-28,Bicolor,2018-10-01
Baloo,Rabbit,White,F5CE3A02-1EC7-431D-8A76-09369E8D798B,English Lop,M,2015-04-27,Broken,2016-08-21
Beau,Dog,Cream,4B94A68C-0C97-4F70-9275-35B3A9EEE8D9,,M,2016-02-09,Solid,2017-05-24


In [0]:
all_animals_vaccinated = animals.join(vaccinations, (a.Name == v.Name) & (a.Species == v.Species) , "left").join(staff, v.Email == s.Email, "left")\
    .join(persons,s.Email == p.Email, "left").select(a.Name, a.Species, v.Vaccination_Time, v.Vaccine, p.First_Name, p.Last_Name)

In [0]:
display(all_animals_vaccinated)

Name,Species,Vaccination_Time,Vaccine,First_Name,Last_Name
Abby,Dog,2018-04-19T10:44:00.000+0000,Distemper Virus,Wanda,Myers
Abby,Dog,2017-04-19T09:01:00.000+0000,Distemper Virus,Ashley,Flores
Ace,Dog,,,,
Angel,Dog,2018-05-04T11:18:00.000+0000,Rabies,Ashley,Flores
Angel,Dog,2017-05-04T12:49:00.000+0000,Rabies,Wanda,Myers
Angel,Dog,2018-05-04T09:47:00.000+0000,Distemper Virus,Wanda,Myers
Angel,Dog,2017-05-04T10:38:00.000+0000,Distemper Virus,Wanda,Myers
April,Rabbit,,,,
Archie,Cat,2017-11-20T13:25:00.000+0000,Panleukopenia Virus,Ashley,Flores
Archie,Cat,2017-11-20T09:35:00.000+0000,Calicivirus,Ashley,Flores


In [0]:
animals_vaccinated = all_animals_vaccinated.where( col('Vaccine').isNotNull()  )

In [0]:
display(animals_vaccinated)

Name,Species,Vaccination_Time,Vaccine,First_Name,Last_Name
Abby,Dog,2017-04-19T09:01:00.000+0000,Distemper Virus,Ashley,Flores
Abby,Dog,2018-04-19T10:44:00.000+0000,Distemper Virus,Wanda,Myers
Angel,Dog,2017-05-04T10:38:00.000+0000,Distemper Virus,Wanda,Myers
Angel,Dog,2018-05-04T09:47:00.000+0000,Distemper Virus,Wanda,Myers
Angel,Dog,2017-05-04T12:49:00.000+0000,Rabies,Wanda,Myers
Angel,Dog,2018-05-04T11:18:00.000+0000,Rabies,Ashley,Flores
Archie,Cat,2017-11-20T09:35:00.000+0000,Calicivirus,Ashley,Flores
Archie,Cat,2017-11-20T13:25:00.000+0000,Panleukopenia Virus,Ashley,Flores
Aspen,Dog,2016-09-28T07:36:00.000+0000,Adenovirus,Wanda,Myers
Aspen,Dog,2017-09-29T12:35:00.000+0000,Adenovirus,Wanda,Myers


In [0]:
vaccinations_agg = vaccinations.groupBy('Species', 'Name').agg(count(col('*')).alias('CountOfSpecies')).orderBy('Species', 'Name')
vaccinations_agg.display()

Species,Name,CountOfSpecies
Cat,Archie,2
Cat,Fiona,2
Cat,Hobbes,1
Cat,Misty,2
Cat,Nova,2
Cat,Oscar,3
Cat,Patches,1
Cat,Penelope,2
Cat,Pumpkin,2
Cat,Sadie,1
