In [None]:
# SET UP OF DATA

# Set up PySpark

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date
spark = SparkSession.builder.getOrCreate()

In [None]:
# Explicit schemas so the column types are fixed up front rather than inferred.
# Identifiers, codes and zip codes are strings; only Age and Price are numeric.
pets_schema = (
    StructType()
    .add("PetID", StringType())
    .add("Name", StringType())
    .add("Kind", StringType())
    .add("Gender", StringType())
    .add("Age", IntegerType())
    .add("OwnerID", StringType())
)

owners_schema = (
    StructType()
    .add("OwnerID", StringType())
    .add("Name", StringType())
    .add("Surname", StringType())
    .add("StreetAddress", StringType())
    .add("City", StringType())
    .add("State", StringType())
    .add("StateFull", StringType())
    .add("ZipCode", StringType())
)

proceduresdetails_schema = (
    StructType()
    .add("ProcedureType", StringType())
    .add("ProcedureSubCode", StringType())
    .add("Description", StringType())
    .add("Price", DoubleType())
)

# ProcedureDate is declared as a string here and converted to a date after loading.
procedureshistory_schema = (
    StructType()
    .add("PetID", StringType())
    .add("ProcedureDate", StringType())
    .add("ProcedureType", StringType())
    .add("ProcedureSubCode", StringType())
)

In [34]:
# Load each CSV into a Spark DataFrame, skipping the header row and applying
# the explicit schema defined for that file.
ps_pets = spark.read.schema(pets_schema).option('header', True).csv('data/Pets.csv')
ps_owners = spark.read.schema(owners_schema).option('header', True).csv('data/Owners.csv')
ps_proceduresdetails = (
    spark.read.schema(proceduresdetails_schema)
    .option('header', True)
    .csv('data/ProceduresDetails.csv')
)
ps_procedureshistory = (
    spark.read.schema(procedureshistory_schema)
    .option('header', True)
    .csv('data/ProceduresHistory.csv')
)

# ProcedureDate was read as a string (see its schema); turn it into a date column.
# NOTE(review): confirm the file's dates actually use the 'yyyy/MM/dd' pattern.
procedure_date = to_date(col('ProcedureDate'), 'yyyy/MM/dd')
ps_procedureshistory = ps_procedureshistory.withColumn('ProcedureDate', procedure_date)

In [35]:
# Change the column names to lowercase
def col_to_lowercase(df):
    """Return ``df`` with every column renamed to its lowercase form.

    The column list is taken from the DataFrame passed in, and one
    ``withColumnRenamed`` call is applied per column.
    """
    renamed = df
    for original_name in df.columns:
        renamed = renamed.withColumnRenamed(original_name, original_name.lower())
    return renamed
# Lowercase the column names of all four Spark DataFrames in one pass
(ps_pets,
 ps_owners,
 ps_proceduresdetails,
 ps_procedureshistory) = map(
    col_to_lowercase,
    (ps_pets, ps_owners, ps_proceduresdetails, ps_procedureshistory),
)

# Set up PostgreSQL

In [None]:
import psycopg2
import pandas as pd
from db_creds import creds

In [None]:
class DataBase:
    """Small convenience wrapper around psycopg2 for one-shot PostgreSQL statements.

    Each ``fetch``/``execute`` call opens a fresh connection, runs a single
    statement, commits it (or rolls it back if the statement failed) and
    closes the connection again.
    """

    def __init__(self, host, dbname, username, password, port):
        """Store the connection parameters; no connection is opened here."""
        self.host = host
        self.port = port
        self.dbname = dbname
        self.username = username
        self.password = password

    def __connect__(self):
        """Open a new connection (``self.con``) and a cursor on it (``self.cur``)."""
        self.con = self.conn()
        self.cur = self.con.cursor()

    def __disconnect__(self, commit=True):
        """Finish the current transaction and close the connection.

        Args:
            commit: when True (the default) the transaction is committed;
                when False it is rolled back instead.
        """
        try:
            if commit:
                self.con.commit()
            else:
                self.con.rollback()
        finally:
            # Release the connection even if commit/rollback itself raised.
            self.con.close()

    def conn(self):
        """Open a new connection, remember it as ``self.con`` and return it.

        The connection is not closed by this method; the caller is
        responsible for closing it.
        """
        self.con = psycopg2.connect(host=self.host, port=self.port, user=self.username,
                                    password=self.password, database=self.dbname)
        return self.con

    def fetch(self, sql, variables=None):
        """Run ``sql`` and return all result rows.

        Args:
            sql: statement to execute.
            variables: optional parameters passed to ``cursor.execute``.

        Returns:
            The list returned by ``cursor.fetchall()``, or ``None`` if the
            statement raised (the error is printed, not re-raised).
        """
        self.__connect__()
        succeeded = False
        try:
            self.cur.execute(sql, variables)
            rows = self.cur.fetchall()
            succeeded = True
            return rows
        except Exception as e:
            print(e)
            return None
        finally:
            # Commit only what actually succeeded; roll back a failed statement.
            self.__disconnect__(commit=succeeded)

    def execute(self, sql, variables=None):
        """Run ``sql`` for its side effects.

        Args:
            sql: statement to execute.
            variables: optional parameters passed to ``cursor.execute``.

        Errors raised by the statement are printed, not re-raised, and the
        transaction is rolled back; otherwise it is committed.
        """
        self.__connect__()
        succeeded = False
        try:
            self.cur.execute(sql, variables)
            succeeded = True
        except Exception as e:
            print(e)
        finally:
            self.__disconnect__(commit=succeeded)

    def get_cols(self, table, details='no'):
        """Describe the columns of ``table`` in the ``public`` schema.

        Args:
            table: table name to look up.
            details: ``'yes'`` prints the name, data type and character
                maximum length of every column and returns ``None``; any
                other value returns the column names only.

        Returns:
            A ``', '``-joined string of column names in table order (empty
            when the table has no columns or the lookup failed), or
            ``None`` when ``details == 'yes'``.
        """
        # Filter and order in SQL, and pass the table name as a bound parameter.
        rows = self.fetch("""
            SELECT column_name, data_type, character_maximum_length
            FROM information_schema.columns
            WHERE table_schema = 'public'
              AND table_name = %s
            ORDER BY ordinal_position;
            """, (table,))
        if rows is None:
            # fetch() already printed the error; treat it as "no columns".
            rows = []
        if details == 'yes':
            for column_name, data_type, max_length in rows:
                print(column_name, data_type, max_length)
            return None
        return ', '.join(column_name for column_name, _, _ in rows)

    def get_tables(self):
        """Return the names of all tables in the ``public`` schema, sorted by name.

        The result has the same shape as ``fetch``: a list of 1-tuples, or
        ``None`` if the query failed.
        """
        return self.fetch('''
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'public'
            ORDER BY table_name;
            ''')

In [None]:
# Pull the connection settings out of the credentials mapping, then build the
# DataBase helper from them. The individual names are kept for later cells.
username, password, host, dbname, port = (
    creds[key] for key in ('username', 'password', 'host', 'dbname', 'port')
)
db = DataBase(host=host, dbname=dbname, username=username, password=password, port=port)

In [None]:
# Create the pets table (IF NOT EXISTS makes this a no-op when it is already there).
# Column names are the lowercase form of the fields in pets_schema.
db.execute('''
    CREATE TABLE IF NOT EXISTS pets (
    petid varchar,
    name varchar,
    kind varchar,
    gender varchar,
    age int,
    ownerid varchar
)
''')

In [None]:
# Create the owners table (IF NOT EXISTS makes this a no-op when it is already there).
# state is limited to 2 characters; every other text column is unbounded varchar.
db.execute('''
CREATE TABLE IF NOT EXISTS owners (
    ownerid varchar,
    name varchar,
    surname varchar,
    streetaddress varchar,
    city varchar,
    state varchar(2),
    statefull varchar,
    zipcode varchar
)
''')

In [None]:
# Create the proceduresdetails table (IF NOT EXISTS makes this a no-op when it is already there).
db.execute('''
CREATE TABLE IF NOT EXISTS proceduresdetails (
    proceduretype varchar,
    proceduresubcode varchar,
    description varchar,
    price float
)
''')

In [None]:
# Create the procedureshistory table (IF NOT EXISTS makes this a no-op when it is already there).
# proceduredate is a real date column, unlike the other (varchar) columns.
db.execute('''
CREATE TABLE IF NOT EXISTS procedureshistory (
    petid varchar,
    proceduredate date,
    proceduretype varchar,
    proceduresubcode varchar
)
''')

In [None]:
# Check tables have been created
db.get_tables()

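## With COPY statement if you have superuser rights on Postgres

Note: `COPY ... FROM 'file'` is executed by the database server, so the file name is resolved from the server's point of view. A relative path such as `data/Pets.csv` is looked up relative to the server process's working directory, not the notebook's; use a path the server can read.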

In [None]:
# Copy pets data: bulk-load the CSV into the pets table.
# CSV HEADER tells COPY that the first line of the file is a header row, not data.
db.execute('''
    COPY pets FROM 'data/Pets.csv' DELIMITER ',' CSV HEADER
''')

In [None]:
# Copy owners data: bulk-load the CSV into the owners table.
# CSV HEADER tells COPY that the first line of the file is a header row, not data.
db.execute('''
    COPY owners FROM 'data/Owners.csv' DELIMITER ',' CSV HEADER
''')

In [None]:
# Copy proceduresdetails data: bulk-load the CSV into the proceduresdetails table.
# CSV HEADER tells COPY that the first line of the file is a header row, not data.
db.execute('''
    COPY proceduresdetails FROM 'data/ProceduresDetails.csv' DELIMITER ',' CSV HEADER
''')

In [None]:
# Copy procedureshistory data: bulk-load the CSV into the procedureshistory table.
# CSV HEADER tells COPY that the first line of the file is a header row, not data.
db.execute('''
    COPY procedureshistory FROM 'data/ProceduresHistory.csv' DELIMITER ',' CSV HEADER
''')

## With pandas.DataFrame.to_sql()

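An alternative if you do not have superuser access to your Postgres database. Use either this section or the COPY section above, not both: `to_sql(..., if_exists='append')` adds rows to whatever the tables already contain, so loading the data twice duplicates every record.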

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':', '/' or spaces in a username or password would otherwise be
# read as URL delimiters and corrupt the connection string.
_url_user = quote(str(username), safe='')
_url_password = quote(str(password), safe='')
engine = create_engine(
    f'postgresql+psycopg2://{_url_user}:{_url_password}@{host}:{port}/{dbname}'
)

In [None]:
# Create pandas DataFrames for loading into the database.
# Identifier, code and zip-code columns are read as text: the matching table
# columns are varchar, and letting pandas infer integers would drop any
# leading zeros before the values are inserted.
pets = pd.read_csv('data/Pets.csv', dtype={'OwnerID': 'object'})
owners = pd.read_csv('data/Owners.csv', dtype={'OwnerID': 'object', 'ZipCode': 'object'})
proceduresdetails = pd.read_csv('data/ProceduresDetails.csv', dtype={'ProcedureSubCode': 'object'})
procedureshistory = pd.read_csv('data/ProceduresHistory.csv', dtype={'ProcedureSubCode': 'object'})

In [None]:
# Lowercase every column label so the frames line up with the database column names
pets = pets.rename(columns=str.lower)
owners = owners.rename(columns=str.lower)
proceduresdetails = proceduresdetails.rename(columns=str.lower)
procedureshistory = procedureshistory.rename(columns=str.lower)

In [None]:
# Copy data to database.
# if_exists='append' inserts into the tables created earlier instead of replacing
# them, so re-running this cell inserts the same rows again.
# index=False keeps the DataFrame index from being written as an extra column.
pets.to_sql('pets', engine, if_exists='append', index=False)
owners.to_sql('owners', engine, if_exists='append', index=False)
proceduresdetails.to_sql('proceduresdetails', engine, if_exists='append', index=False)
procedureshistory.to_sql('procedureshistory', engine, if_exists='append', index=False)

In [None]:
# Verify # of records in each table: the row count in PostgreSQL must equal the
# number of rows in the pandas DataFrame that was loaded into it.
tables = [('pets', pets), ('owners', owners), ('proceduresdetails', proceduresdetails), ('procedureshistory', procedureshistory)]
for table_name, frame in tables:
    # table_name comes from the fixed list above, so formatting it into the SQL is safe.
    result = db.fetch(f'''
        SELECT COUNT(*) FROM {table_name}
    ''')
    if not result:
        # db.fetch prints the database error and returns None when the query fails;
        # report that here instead of crashing on result[0][0].
        print (f'{table_name} Error: could not read the record count from the database.')
        continue
    db_count = result[0][0]
    if db_count == len(frame):
        print (f'{table_name} table verified.')
    else:
        print (f'{table_name} Error: {db_count} records in SQL table & {len(frame)} records in Pandas dataframe.')

# Pandas Setup

In [None]:
import pandas as pd

In [None]:
# Create pandas DataFrames for the comparison queries.
# dtype='object' keeps identifier, zip-code and sub-code columns as text instead of
# letting pandas infer a numeric type; the matching database columns are varchar.
# parse_dates turns ProcedureDate into a datetime column.
pd_pets = pd.read_csv('data/Pets.csv', dtype={'OwnerID':'object'})
pd_owners = pd.read_csv('data/Owners.csv', dtype={'OwnerID':'object', 'ZipCode':'object'})
pd_proceduresdetails = pd.read_csv('data/ProceduresDetails.csv', dtype={'ProcedureSubCode':'object'})
pd_procedureshistory = pd.read_csv('data/ProceduresHistory.csv', parse_dates=['ProcedureDate'], dtype={'ProcedureSubCode':'object'})

In [None]:
# Lowercase every column label so the pandas frames use the same names as the database
pd_pets, pd_owners, pd_proceduresdetails, pd_procedureshistory = (
    frame.rename(columns=str.lower)
    for frame in (pd_pets, pd_owners, pd_proceduresdetails, pd_procedureshistory)
)

In [None]:
# ensure all data types are correct for the frames built in this section
# (the pd_* frames, not the frames that were loaded into the database)
print (pd_pets.dtypes)
print ('---------------')
print (pd_owners.dtypes)
print ('---------------')
print (pd_proceduresdetails.dtypes)
print ('---------------')
print (pd_procedureshistory.dtypes)

# START OF COMPARISON

# Queries

## SELECT & LIMIT Statements

In [38]:
# SQL: first three rows of the owners table
sql = '''
    SELECT * FROM owners
    LIMIT 3
'''
db.fetch(sql)

[('6049',
  'Debbie',
  'Metivier',
  '315 Goff Avenue',
  'Grand Rapids',
  'MI',
  'Michigan',
  '49503'),
 ('2863',
  'John',
  'Sebastian',
  '3221 Perry Street',
  'Davison',
  'MI',
  'Michigan',
  '48423'),
 ('3518',
  'Connie',
  'Pauley',
  '1539 Cunningham Court',
  'Bloomfield Township',
  'MI',
  'Michigan',
  '48302')]

In [40]:
# Pandas
pd_owners.head(3)

Unnamed: 0,ownerid,name,surname,streetaddress,city,state,statefull,zipcode
0,6049,Debbie,Metivier,315 Goff Avenue,Grand Rapids,MI,Michigan,49503
1,2863,John,Sebastian,3221 Perry Street,Davison,MI,Michigan,48423
2,3518,Connie,Pauley,1539 Cunningham Court,Bloomfield Township,MI,Michigan,48302


In [167]:
# PySpark
ps_owners.limit(3).show(truncate=False)

+-------+------+---------+---------------------+-------------------+-----+---------+-------+
|ownerid|name  |surname  |streetaddress        |city               |state|statefull|zipcode|
+-------+------+---------+---------------------+-------------------+-----+---------+-------+
|6049   |Debbie|Metivier |315 Goff Avenue      |Grand Rapids       |MI   |Michigan |49503  |
|2863   |John  |Sebastian|3221 Perry Street    |Davison            |MI   |Michigan |48423  |
|3518   |Connie|Pauley   |1539 Cunningham Court|Bloomfield Township|MI   |Michigan |48302  |
+-------+------+---------+---------------------+-------------------+-----+---------+-------+



## SELECT multiple columns

In [61]:
# SQL
sql = '''
    SELECT name, surname
    FROM owners
    LIMIT 3
'''
db.fetch(sql)

[('Debbie', 'Metivier'), ('John', 'Sebastian'), ('Connie', 'Pauley')]

In [63]:
# Pandas
pd_owners[['name','surname']].head(3)

Unnamed: 0,name,surname
0,Debbie,Metivier
1,John,Sebastian
2,Connie,Pauley


In [168]:
# PySpark
ps_owners.select('name','surname').limit(3).show()

+------+---------+
|  name|  surname|
+------+---------+
|Debbie| Metivier|
|  John|Sebastian|
|Connie|   Pauley|
+------+---------+



## DISTINCT Clause

In [66]:
# SQL
sql = '''
    SELECT DISTINCT statefull
    FROM owners
'''
db.fetch(sql)

[('Michigan',)]

In [67]:
# Pandas
pd_owners['statefull'].unique()

array(['Michigan'], dtype=object)

In [68]:
# PySpark
ps_owners.select('statefull').distinct().show()

+---------+
|statefull|
+---------+
| Michigan|
+---------+



## WHERE Clause

In [71]:
# SQL
sql = '''
    SELECT name, surname
    FROM owners
    WHERE city = 'Southfield'
'''
db.fetch(sql)

[('Jessica', 'Velazquez'),
 ('Marie', 'Floyd'),
 ('Rosa', 'Quarles'),
 ('Elizabeth', 'Griffin'),
 ('Debra', 'Robins'),
 ('Stacey', 'Randolph'),
 ('Elvia', 'Warren'),
 ('Paul', 'Haring'),
 ('Jason', 'Cantwell'),
 ('Travis', 'Bowman'),
 ('Lawrence', 'Roder'),
 ('Wm', 'Poulson'),
 ('Bruce', 'Hart'),
 ('Marion', 'Glover'),
 ('Joseph', 'Blow')]

In [72]:
# Pandas
pd_owners[pd_owners['city'] == 'Southfield'][['name','surname']]

Unnamed: 0,name,surname
4,Jessica,Velazquez
12,Marie,Floyd
16,Rosa,Quarles
17,Elizabeth,Griffin
25,Debra,Robins
32,Stacey,Randolph
34,Elvia,Warren
37,Paul,Haring
43,Jason,Cantwell
45,Travis,Bowman


In [83]:
# PySpark
ps_owners.select('name','surname').where(ps_owners.city == 'Southfield').show()

+---------+---------+
|     name|  surname|
+---------+---------+
|  Jessica|Velazquez|
|    Marie|    Floyd|
|     Rosa|  Quarles|
|Elizabeth|  Griffin|
|    Debra|   Robins|
|   Stacey| Randolph|
|    Elvia|   Warren|
|     Paul|   Haring|
|    Jason| Cantwell|
|   Travis|   Bowman|
| Lawrence|    Roder|
|       Wm|  Poulson|
|    Bruce|     Hart|
|   Marion|   Glover|
|   Joseph|     Blow|
+---------+---------+



## LIKE Operator (beginning of text)

In [77]:
# SQL
sql = '''
    SELECT name 
    FROM owners
    WHERE name LIKE 'An%'
'''
db.fetch(sql)

[('Anne',), ('Andrew',)]

In [80]:
# Pandas
pd_owners[pd_owners['name'].str.startswith('An')]['name']

11      Anne
66    Andrew
Name: name, dtype: object

In [87]:
# PySpark
ps_owners.select('name').where(ps_owners.name.like('An%')).show()

+------+
|  name|
+------+
|  Anne|
|Andrew|
+------+



## LIKE Operator (end of text)

In [89]:
# SQL
sql = '''
    SELECT name 
    FROM owners
    WHERE name LIKE '%na'
'''
db.fetch(sql)

[('Lena',), ('Edna',), ('Sabrina',)]

In [90]:
# Pandas
pd_owners[pd_owners['name'].str.endswith('na')]['name']

3        Lena
23       Edna
31    Sabrina
Name: name, dtype: object

In [91]:
# PySpark
ps_owners.select('name').where(ps_owners.name.like('%na')).show()

+-------+
|   name|
+-------+
|   Lena|
|   Edna|
|Sabrina|
+-------+



## LIKE Operator (middle of text)

In [93]:
# SQL
sql = '''
    SELECT name 
    FROM owners
    WHERE name LIKE '%ar%'
'''
db.fetch(sql)

[('Karen',),
 ('Mary',),
 ('Carmen',),
 ('Marie',),
 ('Sarah',),
 ('Carolyn',),
 ('Larry',),
 ('Charles',),
 ('Charles',),
 ('Richard',),
 ('Mario',),
 ('Ricardo',),
 ('Marion',),
 ('Gary',)]

In [94]:
# Pandas
pd_owners[pd_owners['name'].str.contains('ar')]['name']

7       Karen
8        Mary
9      Carmen
12      Marie
15      Sarah
24    Carolyn
40      Larry
53    Charles
56    Charles
72    Richard
74      Mario
78    Ricardo
81     Marion
87       Gary
Name: name, dtype: object

In [95]:
# PySpark
ps_owners.select('name').where(ps_owners.name.like('%ar%')).show()

+-------+
|   name|
+-------+
|  Karen|
|   Mary|
| Carmen|
|  Marie|
|  Sarah|
|Carolyn|
|  Larry|
|Charles|
|Charles|
|Richard|
|  Mario|
|Ricardo|
| Marion|
|   Gary|
+-------+



## NULL Values (is not null)

In [104]:
# SQL
sql = '''
    SELECT streetaddress
    FROM owners
    WHERE streetaddress IS NOT NULL
    LIMIT 3
'''
db.fetch(sql)

[('315 Goff Avenue',), ('3221 Perry Street',), ('1539 Cunningham Court',)]

In [106]:
# Pandas
pd_owners[pd_owners['streetaddress'].notnull()]['streetaddress'].head(3)

0          315 Goff Avenue
1        3221 Perry Street
2    1539 Cunningham Court
Name: streetaddress, dtype: object

In [109]:
# PySpark
ps_owners.select('streetaddress').where(ps_owners.streetaddress.isNotNull()).show(3, truncate=False)

+---------------------+
|streetaddress        |
+---------------------+
|315 Goff Avenue      |
|3221 Perry Street    |
|1539 Cunningham Court|
+---------------------+
only showing top 3 rows



## NULL Values (is null)

In [110]:
# SQL
sql = '''
    SELECT streetaddress
    FROM owners
    WHERE streetaddress IS NULL
'''
db.fetch(sql)

[]

In [112]:
# Pandas
pd_owners[pd_owners['streetaddress'].isnull()]['streetaddress']

Series([], Name: streetaddress, dtype: object)

In [113]:
# PySpark
ps_owners.select('streetaddress').where(ps_owners.streetaddress.isNull()).show()

+-------------+
|streetaddress|
+-------------+
+-------------+



## BETWEEN Operator

In [125]:
# SQL
sql = '''
    SELECT * 
    FROM pets
    WHERE age BETWEEN 1 AND 3
'''
db.fetch(sql)

[('M0-2904', 'Simba', 'Cat', 'male', 1, '3086'),
 ('R3-7551', 'Keller', 'Parrot', 'female', 2, '7908'),
 ('J2-3320', 'Heisenberg', 'Dog', 'male', 3, '1319'),
 ('U4-9376', 'Scout', 'Dog', 'female', 2, '7846'),
 ('H8-1429', 'Lily', 'Dog', 'female', 3, '7846'),
 ('J1-6366', 'Bruce', 'Dog', 'male', 3, '8316'),
 ('O3-1895', 'Candy', 'Dog', 'female', 3, '8133'),
 ('S4-4013', 'Pip', 'Dog', 'male', 3, '7484'),
 ('U4-6674', 'Biscuit', 'Dog', 'female', 1, '3663'),
 ('L4-4030', 'Bruce', 'Dog', 'male', 1, '5502'),
 ('O6-3123', 'Biscuit', 'Dog', 'female', 2, '6194'),
 ('W9-8307', 'Lexie', 'Dog', 'female', 1, '5207'),
 ('S1-2243', 'Cuddles', 'Dog', 'male', 2, '5447'),
 ('Q0-8904', 'Bright', 'Dog', 'male', 1, '4110'),
 ('G6-6501', 'Jake', 'Cat', 'male', 2, '3089'),
 ('S4-2254', 'Draper', 'Cat', 'male', 3, '8619'),
 ('X6-4876', 'Brandy', 'Cat', 'female', 3, '6406'),
 ('F1-1855', 'Bandit', 'Parrot', 'male', 2, '9604'),
 ('Z8-4419', 'Scooter', 'Dog', 'male', 3, '4464'),
 ('U8-6473', 'Biscuit', 'Dog', 'f

In [126]:
# Pandas
pd_pets[pd_pets['age'].between(1, 3)]

Unnamed: 0,petid,name,kind,gender,age,ownerid
2,M0-2904,Simba,Cat,male,1,3086
3,R3-7551,Keller,Parrot,female,2,7908
9,J2-3320,Heisenberg,Dog,male,3,1319
11,U4-9376,Scout,Dog,female,2,7846
12,H8-1429,Lily,Dog,female,3,7846
34,J1-6366,Bruce,Dog,male,3,8316
41,O3-1895,Candy,Dog,female,3,8133
43,S4-4013,Pip,Dog,male,3,7484
54,U4-6674,Biscuit,Dog,female,1,3663
56,L4-4030,Bruce,Dog,male,1,5502


In [127]:
# PySpark
ps_pets.where(ps_pets.age.between(1,3)).show()

+-------+----------+------+------+---+-------+
|  petid|      name|  kind|gender|age|ownerid|
+-------+----------+------+------+---+-------+
|M0-2904|     Simba|   Cat|  male|  1|   3086|
|R3-7551|    Keller|Parrot|female|  2|   7908|
|J2-3320|Heisenberg|   Dog|  male|  3|   1319|
|U4-9376|     Scout|   Dog|female|  2|   7846|
|H8-1429|      Lily|   Dog|female|  3|   7846|
|J1-6366|     Bruce|   Dog|  male|  3|   8316|
|O3-1895|     Candy|   Dog|female|  3|   8133|
|S4-4013|       Pip|   Dog|  male|  3|   7484|
|U4-6674|   Biscuit|   Dog|female|  1|   3663|
|L4-4030|     Bruce|   Dog|  male|  1|   5502|
|O6-3123|   Biscuit|   Dog|female|  2|   6194|
|W9-8307|     Lexie|   Dog|female|  1|   5207|
|S1-2243|   Cuddles|   Dog|  male|  2|   5447|
|Q0-8904|    Bright|   Dog|  male|  1|   4110|
|G6-6501|      Jake|   Cat|  male|  2|   3089|
|S4-2254|    Draper|   Cat|  male|  3|   8619|
|X6-4876|    Brandy|   Cat|female|  3|   6406|
|F1-1855|    Bandit|Parrot|  male|  2|   9604|
|Z8-4419|   S

## AND Operator

In [128]:
# SQL
sql = '''
    SELECT name, gender, age
    FROM pets
    WHERE gender = 'male'
        AND age < 2
'''
db.fetch(sql)

[('Simba', 'male', 1),
 ('Simba', 'male', 0),
 ('Ebenezer', 'male', 0),
 ('Bruce', 'male', 1),
 ('Bright', 'male', 1),
 ('Bruce', 'male', 0)]

In [130]:
# Pandas
pd_pets[(pd_pets['gender'] == 'male') & (pd_pets['age'] < 2)][['name','gender','age']]

Unnamed: 0,name,gender,age
2,Simba,male,1
7,Simba,male,0
51,Ebenezer,male,0
56,Bruce,male,1
71,Bright,male,1
88,Bruce,male,0


In [135]:
# PySpark
ps_pets.select('name','gender','age').where((ps_pets.gender == 'male') & (ps_pets.age < 2)).show()

+--------+------+---+
|    name|gender|age|
+--------+------+---+
|   Simba|  male|  1|
|   Simba|  male|  0|
|Ebenezer|  male|  0|
|   Bruce|  male|  1|
|  Bright|  male|  1|
|   Bruce|  male|  0|
+--------+------+---+



## OR Operator

In [137]:
# SQL
sql = '''
    SELECT name, gender, age
    FROM pets
    WHERE gender = 'male'
        OR age < 2
    LIMIT 3
'''
db.fetch(sql)

[('Blackie', 'male', 11), ('Roomba', 'male', 9), ('Simba', 'male', 1)]

In [138]:
# Pandas: same filter as the SQL above (gender = 'male' OR age < 2).
# The mask is built from pd_pets itself so it always aligns with the frame it filters.
pd_pets[(pd_pets['gender'] == 'male') | (pd_pets['age'] < 2)][['name','gender','age']].head(3)

Unnamed: 0,name,gender,age
0,Blackie,male,11
1,Roomba,male,9
2,Simba,male,1


In [169]:
# PySpark
ps_pets.select('name','gender','age').where((ps_pets.gender == 'male') | (ps_pets.age < 2)).limit(3).show()

+-------+------+---+
|   name|gender|age|
+-------+------+---+
|Blackie|  male| 11|
| Roomba|  male|  9|
|  Simba|  male|  1|
+-------+------+---+



## ORDER BY Clause

In [161]:
# SQL (Ascending Order)
sql = '''
    SELECT name, ownerid
    FROM pets
    ORDER BY ownerid
    LIMIT 3
'''
db.fetch(sql)

[('Biscuit', '1070'), ('Stowe', '1132'), ('Enyo', '1202')]

In [156]:
# SQL (Descending Order)
sql = '''
    SELECT name, ownerid
    FROM pets
    ORDER BY ownerid DESC
    LIMIT 3
'''
db.fetch(sql)

[('Dior', '9900'), ('Scooter', '9850'), ('Daisy', '9850')]

In [159]:
# Pandas (Ascending Order)
pd_pets[['name','ownerid']].sort_values('ownerid').head(3)

Unnamed: 0,name,ownerid
95,Biscuit,1070
10,Stowe,1132
33,Enyo,1202


In [160]:
# Pandas (Descending Order)
pd_pets[['name','ownerid']].sort_values('ownerid', ascending=False).head(3)

Unnamed: 0,name,ownerid
66,Dior,9900
53,Daisy,9850
15,Scooter,9850


In [170]:
# PySpark (Ascending Order)
ps_pets.select('name','ownerid').sort('ownerid').limit(3).show()

+-------+-------+
|   name|ownerid|
+-------+-------+
|Biscuit|   1070|
|  Stowe|   1132|
|   Enyo|   1202|
+-------+-------+



In [171]:
# PySpark (Descending Order)
ps_pets.select('name','ownerid').sort('ownerid', ascending=False).limit(3).show()

+-------+-------+
|   name|ownerid|
+-------+-------+
|   Dior|   9900|
|  Daisy|   9850|
|Scooter|   9850|
+-------+-------+



# Aggregation

## COUNT aggregate function

In [172]:
# SQL
sql = '''
    SELECT COUNT(*)
    FROM procedureshistory
'''
db.fetch(sql)

[(2284,)]

In [176]:
# Pandas (option 1) - gives count for each column
pd_procedureshistory.count()

petid               2284
proceduredate       2284
proceduretype       2284
proceduresubcode    2284
dtype: int64

In [177]:
# Pandas (option 2) - gives count for total rows
len(pd_procedureshistory)

2284

In [179]:
# PySpark
ps_procedureshistory.count()

2284

## SUM aggregate function

In [186]:
pd_proceduresdetails.dtypes

proceduretype       object
proceduresubcode    object
description         object
price                int64
dtype: object

In [191]:
# SQL
sql = '''
    SELECT SUM(price)
    FROM proceduresdetails
'''
db.fetch(sql)

[(8011.0,)]

In [192]:
# Pandas
pd_proceduresdetails['price'].sum()

8011

In [213]:
# PySpark
from pyspark.sql import functions as F
ps_proceduresdetails.select(F.sum('price')).show()

+----------+
|sum(price)|
+----------+
|    8011.0|
+----------+



## AVG aggregate function

In [216]:
# SQL
sql = '''
    SELECT AVG(age)
    FROM pets
'''
db.fetch(sql)

[(Decimal('6.9300000000000000'),)]

In [221]:
# Pandas
pd_pets['age'].mean()

6.93

In [222]:
# PySpark
ps_pets.select(F.avg('age')).show()

+--------+
|avg(age)|
+--------+
|    6.93|
+--------+

