In [0]:
## Filtering operations

from pyspark.sql import Row
import pandas as pd
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)

import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "gender": "male",
        "current_city": "Dallas",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "gender": "male",
        "current_city": "Houston",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "gender": "female",
        "current_city": "",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "gender": "male",
        "current_city": "San Fransisco",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "gender": "female",
        "current_city": None,
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]


users_df = spark.createDataFrame(pd.DataFrame(users))

In [0]:
users_df.show()

+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender| current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+-------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|       Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|  male|      Houston|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|female|             |{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home

In [0]:

# * `where` and `filter` are synonyms
# * We can pass conditions either by using SQL Style or Non SQL Style.
# * For Non SQL Style we can pass columns using `col` function on column name as string or using the notation of`df['column_name']`

In [0]:
from pyspark.sql.functions import col

users_df.filter(col('id') == 1).show()
users_df.where(col('id') == 1).show()
users_df.filter(users_df['id'] == 1).show()

# 'id == 1' also works
users_df.filter('id = 1').show()
## here filter and where conditions are doing the same
users_df.createOrReplaceTempView('users')
spark.sql("""
    SELECT *
    FROM users
    WHERE id = 1
"""). \
    show()

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|g

In [0]:

#     Equal -> = or ==
#     Not Equal -> !=
#     Greater Than -> >
#     Less Than -> <
#     Greater Than or Equal To -> >=
#     Less Than or Equal To -> <=
#     IN Operator -> isin function or IN or contains function
#     Between Operator -> between function or BETWEEN with AND



In [0]:
from pyspark.sql.functions import col

users_df.filter(col('current_city') == 'Dallas').show()
users_df.filter('amount_paid == "900.0"').show()


from pyspark.sql.functions import isnan
## note isnan == is not a nan   returns true or False values 
users_df.select('amount_paid', isnan('amount_paid')).show()

###
users_df.filter('isnan(amount_paid) = True').show()

+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|gender|current_city|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|  male|      Dallas|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
+---+----------+------------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+

+---+----------+---------+--------------------+------+------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|               email|gender|

In [0]:
## filter with != , filter with OR
users_df.select('id', 'current_city').filter(col('current_city') != 'Dallas').show()

## filter with and ie |
users_df.select('id', 'current_city').filter((col('current_city') != 'Dallas') | (col('current_city').isNull())).show()

## filter with OR 
users_df.select('id', 'current_city').filter("current_city != 'Dallas' OR current_city IS NULL").show()

## 
users_df.select('id', 'current_city').filter(col('current_city') != '').show()

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
+---+-------------+

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+

+---+-------------+
| id| current_city|
+---+-------------+
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
|  5|         null|
+---+-------------+

+---+-------------+
| id| current_city|
+---+-------------+
|  1|       Dallas|
|  2|      Houston|
|  4|San Fransisco|
+---+-------------+



In [0]:
### between operator 
### between(lowerBound, upperBound)

users_df.select('id', 'email', 'last_updated_ts').filter(col('last_updated_ts').between('2021-02-15 00:00:00', '2021-03-15 23:59:59')).show()

#########
users_df.select('id', 'amount_paid').filter('amount_paid BETWEEN "850" AND "900"').show()

+---+--------------------+-------------------+
| id|               email|    last_updated_ts|
+---+--------------------+-------------------+
|  2|nbrewitt1@dailyma...|2021-02-18 03:33:00|
|  3|openney2@vistapri...|2021-03-15 15:16:55|
+---+--------------------+-------------------+

+---+-----------+
| id|amount_paid|
+---+-----------+
|  2|      900.0|
|  3|     850.55|
+---+-----------+



In [0]:
## is null operations
users_df.select('id', 'current_city').filter(col('current_city').isNull()).show()
## same with string 
users_df.select('id', 'customer_from').filter('customer_from IS NULL').show()
## is not null function
users_df.select('id', 'current_city').filter(col('current_city').isNotNull()).show()

+---+------------+
| id|current_city|
+---+------------+
|  5|        null|
+---+------------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  4|         null|
|  5|         null|
+---+-------------+

+---+-------------+
| id| current_city|
+---+-------------+
|  1|       Dallas|
|  2|      Houston|
|  3|             |
|  4|San Fransisco|
+---+-------------+



In [0]:
# Boolean Operations

#     Boolean OR
#     Boolean AND
#     Negation
print(True or True)
print(True or False)
print(True and True)
print(False or False)
print(not True)
print(not False)


True
True
True
False
False
True


In [0]:
#10 Boolean OR on same column of Spark Data Frame and IN Operator

users_df.select('id', 'current_city').filter((col('current_city') == '') | (col('current_city').isNull())).show()
## note: ie showing null values and ''

+---+------------+
| id|current_city|
+---+------------+
|  3|            |
|  5|        null|
+---+------------+



In [0]:

# Not recommended, use in instead
users_df.select('id', 'current_city').filter((col('current_city') == 'Houston') | (col('current_city') == 'Dallas')).show()
## note: above method is not recomadded as filtering is going to happen in single column 
# Not recommended, use in instead
users_df.select('id', 'current_city').filter("current_city = 'Houston' OR current_city = 'Dallas'").show()
#### note: above method is not recomadded as filtering is going to happen in single column 
users_df.select('id','current_city').filter(col("current_city").isin('Houston','Dallas')).show()

## isin () is used within filter 
users_df.select('id', 'current_city').filter("current_city IN ('Houston', 'Dallas', '','NULL')").show()
# IMp Note :Passing null will not be effective
users_df.select('id', 'current_city').filter("current_city IN ('Houston', 'Dallas', '') OR current_city IS NULL").show()


# Boolean OR including null check
users_df.select('id', 'current_city').filter((col('current_city').isin('Houston', 'Dallas', '')) | (col('current_city').isNull())).show()
## use "is NULL " in string expresion filter or use .isNull() in filter conditions 

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
+---+------------+

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
+---+------------+

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
|  5|        null|
+---+------------+

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|     Houston|
|  3|            |
|  5|        null|
+---+------------+



In [0]:

#     >
#     <
#     >= (equivalent to boolean with col1 > val1 or cal1 = val1)
#     <= (equivalent to boolean with col1 < val1 or cal1 = val1)
## filtering with greter than , equal to , <= ,>=
users_df.select('id', 'customer_from').filter('customer_from > "2021-01-21"').show()

users_df.select('id', 'customer_from').filter(col('customer_from') > '2021-01-21').show()

## question : find id and amount_paid which are less than 900 and not null 
users_df.filter('amount_paid <= 900 AND isnan(amount_paid) = false').select('id', 'amount_paid').show()

## find id of cutomer who are customer joined after 2021-02-14
users_df.select('id', 'customer_from').filter('customer_from > "2021-01-21"').show()

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+

+---+-----------+
| id|amount_paid|
+---+-----------+
|  2|      900.0|
|  3|     850.55|
+---+-----------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+



In [0]:
## boolean and and conditioning 
users_df.filter((col('gender') == 'male') & (col('is_customer') == True)).select('id', 'gender', 'is_customer').show()
users_df.filter("customer_from >= '2021-01-20' AND customer_from  <= '2021-02-15'").select('id', 'customer_from').show()
users_df.filter((col('customer_from') >= '2021-01-20') & (col('customer_from') <= '2021-02-15')).select('id', 'customer_from').show()

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  1|  male|       true|
|  2|  male|       true|
+---+------+-----------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
|  3|   2021-01-21|
+---+-------------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
|  3|   2021-01-21|
+---+-------------+



# dropping the columns

In [0]:
from pyspark.sql import Row
import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]
import pandas as pd
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)
# note:  using PyArrow if you are using pandas in your spark application, it will speed the data conversion between spark and pandas ie if true 
users_df = spark.createDataFrame(pd.DataFrame(users))

In [0]:
users_df.show(3,False)
users_df.printSchema()

+---+----------+------------+-------------------------+----------------------------------+-------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |phone_numbers                     |courses|is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+----------------------------------+-------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy.com    |{+1 234 567 8901, +1 234 567 8911}|[1, 2] |true       |1000.55    |2021-01-15   |2021-02-10 01:15:00|
|2  |Nikolaus  |Brewitt     |nbrewitt1@dailymail.co.uk|{+1 234 567 8923, 1 234 567 8934} |[3]    |true       |900.0      |2021-02-14   |2021-02-18 03:33:00|
|3  |Orelie    |Penney      |openney2@vistaprint.com  |{+1 714 512 9752, +1 714 512 6601}|[2, 4] |true       |850.55     |2021-01-21   |2021-03-15 15:16:55|
+---+----------+------------+-------------------------+---

In [0]:
## dropping single column 
users_df.drop('last_updated_ts').printSchema()
users_df.drop(users_df['last_updated_ts']).printSchema()
users_df.drop(col('last_updated_ts')).printSchema()
## above all three do the same things 
# If we have column name which does not exist, the column will be ignored
users_df.drop(col('user_id')).printSchema()

##note: if we try to drop the column which is not present in dataframe it will not throw error just do nothing 


root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)

In [0]:
## droping multiple columns 

users_df.drop(col('first_name'), col('id')).printSchema()
##Imp Note: above will fail when we are dropping multiple columns  we need to pass them as strings  
## ie TypeError: each col in the param list should be a string error 
## # This will fail as we are passing multiple column objects
# When we want to pass more than one column, we have to pass all column names as strings

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-402239863734907>[0m in [0;36m<module>[0;34m[0m
[1;32m      1[0m [0;31m## droping multiple columns[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m [0musers_df[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mcol[0m[0;34m([0m[0;34m'first_name'[0m[0;34m)[0m[0;34m,[0m [0mcol[0m[0;34m([0m[0;34m'id'[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mprintSchema[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      4[0m [0;31m##Imp Note: above will fail when we are dropping multiple columns  we need to pass them as strings[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m [0;31m## # This will fail as we are passing multiple column objects[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/

In [0]:
users_df.drop('user_id', 'first_name', 'last_name').printSchema()
users_df.drop('user_id', 'first_name', 'last_name').show()

root
 |-- id: long (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)

+---+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900

In [0]:
## droping list of columns in pyspark 
users_df.show(3)
pii_columns = ['first_name', 'last_name', 'email', 'phone_numbers']

# This will fail
# We need to convert list to varrying arguments to get it working
users_df_nopii = users_df.drop(pii_columns)

## note : passing list to drop column will fail the thing we need to pass varing argument to that .
## *args 

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
only showing top 3 rows



[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-402239863734909>[0m in [0;36m<module>[0;34m[0m
[1;32m      5[0m [0;31m# This will fail[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      6[0m [0;31m# We need to convert list to varrying arguments to get it working[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 7[0;31m [0musers_df_nopii[0m [0;34m=[0m [0musers_df[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mpii_columns[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/sql/dataframe.py[0m in [0;36mdrop[0;34m(self, *cols)[0m
[1;32m   2734[0m                 [0mjdf[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_jdf[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0mcol[0m[0;34m.[0m[0m_jc[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m   2735[0m             [0;32melse[0m[0;34m:[0m[0;34

In [0]:
users_df_nopii = users_df.drop(*pii_columns).show()

+---+-------+-----------+-----------+-------------+-------------------+
| id|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+-------+-----------+-----------+-------------+-------------------+
|  1| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|     []|      false|        NaN|         null|2021-04-02 00:55:18|
+---+-------+-----------+-----------+-------------+-------------------+



In [0]:
##Here are the features that are available with respect to dropping duplicate records.

#     Drop duplicates based on all columns. It is also known as distinct.
#     Drop duplicates based on certain columns.
#     We can use distinct, drop_duplicates or dropDuplicates for these scenarios.

## drop_duplicates == based on all columns 
## drop duplicates based on certain columns 



In [0]:
import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "is_customer": True,
        "amount_paid": 1050.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 25, 3, 33, 0)
    }
]


import pandas as pd
users_df = spark.createDataFrame(pd.DataFrame(users))

In [0]:
users_df.show(4,False)

+---+----------+------------+-------------------------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy.com    |true       |1000.55    |2021-01-15   |2021-02-10 01:15:00|
|2  |Nikolaus  |Brewitt     |nbrewitt1@dailymail.co.uk|true       |900.0      |2021-02-14   |2021-02-18 03:33:00|
|3  |Orelie    |Penney      |openney2@vistaprint.com  |true       |850.55     |2021-01-21   |2021-03-15 15:16:55|
|3  |Orelie    |Penney      |openney2@vistaprint.com  |true       |850.55     |2021-01-21   |2021-03-15 15:16:55|
+---+----------+------------+-------------------------+-----------+-----------+-------------+-------------------+
only showing top 4 rows



In [0]:
print(users_df.count())
## count will give the no of rows present 
print(users_df.distinct().count())

## note: distinmct will give 
users_df.dropDuplicates().count()
## dropDuplicates will drop duplicate rows ie taking account all columns combinations check results eg id,amount_column 
users_df.dropDuplicates().show()

8
6
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|      false|        NaN|         null|2021-04-02 00:55:18|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|     1050.0|   2021-02-14|2021-02-25 03:33:00|
+---+----------

In [0]:
users_df.dropDuplicates(['id', 'amount_paid']).show()

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|     1050.0|   2021-02-14|2021-02-25 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome4@shutterfly...|      false|        NaN|         null|2021-04-02 00:55:18|
+---+----------+---

In [0]:
# Here are the features that are available with respect to dropping records based on null values.
# * Drop records when all column values are nulls.
# * Drop records any of the column value is null.
# * Drop records that have less than `thresh` non-null values.
# * Drop records when any of the column value or all column values are nulls for provided subset of columns.
# * We can use `df.na.drop` or `df.dropna` to take care of dealing with records having columns with null values.

In [0]:
import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": None,
        "first_name": None,
        "last_name": None,
        "email": None,
        "is_customer": None,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": None
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    },
    {
        "id": None,
        "first_name": None,
        "last_name": None,
        "email": None,
        "is_customer": None,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": None
    },
    {
        "id": 5,
        "first_name": None,
        "last_name": None,
        "email": None,
        "is_customer": None,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": None
    },
    {
        "id": None,
        "first_name": None,
        "last_name": None,
        "email": "nbrewitt1@dailymail.co.uk",
        "is_customer": None,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": None
    },
    {
        "id": None,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": None,
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "is_customer": True,
        "amount_paid": 1050.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 25, 3, 33, 0)
    }
]


import pandas as pd
users_df = spark.createDataFrame(pd.DataFrame(users))

In [0]:
users_df.count()
users_df.show()

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|1.0|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|NaN|      null|        null|                null|       null|        NaN|         null|               null|
|2.0|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|4.0|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|4.0|     Ashby|   

In [0]:
# Attribute which exposes functions dealing with null records
# drop => users_df.dropna
# fill => users_df.fillna
# replace => users_df.replacena

## note: basically df.na.drop()
## ie df.na === is attribute 
# note: drop(how='any', thresh=None, subset=None)

users_df.na.drop('all').show()
# Drop if any column value is null
users_df.na.drop('any').show()
users_df.na.drop(thresh=2).show()

help(users_df.drop())

## dropDuplicates(self, subset=None)
users_df.na.drop(how='any', subset=['id', 'email']).show()

## 
#  subset : str, tuple or list, optional optional list of column names to consider.
# thresh: int, optiona default None  ie dropna only after thrushhold is briched 
#  If specified, drop rows that have less than `thresh` non-null values.This overwrites the `how` parameter.
users_df.na.drop(how='any', subset=['id', 'email']).show()

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|1.0|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|2.0|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|4.0|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|4.0|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|5.0|      Kurt|   

In [0]:
users_df = spark.createDataFrame(pd.DataFrame(users)).show()

+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+-----------+-----------+-------------+-------------------+
|1.0|    Corrie|Van den Oord|cvandenoord0@etsy...|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|NaN|      null|        null|                null|       null|        NaN|         null|               null|
|2.0|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|3.0|    Orelie|      Penney|openney2@vistapri...|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|4.0|     Ashby|    Maddocks|  amaddocks3@home.pl|      false|        NaN|         null|2021-04-10 17:45:30|
|4.0|     Ashby|   

In [0]:
# Here are the sorting scenarios which we should be familiar with.
# * Sort a data frame using ascending order by a specific column.
# * Sort a data frame using descending order by a specific column.
# * Dealing with nulls while sorting the data (having the null values at the beginning or at the end).
# * Sort a data frame using multiple columns (composite sorting). We also need to be aware of how to sort the data in ascending order by first column and then descending order by second column as well as vice versa.
# * We also need to make sure how to perform prioritized sorting. For example, let's say we want to get USA at the top and rest of the countries in ascending order by their respective names.

In [0]:
import datetime
users = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        "courses": [1, 2],
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "phone_numbers":  Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        "courses": [3],
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        "courses": [2, 4],
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": Row(mobile=None, home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        "phone_numbers": Row(mobile="+1 817 934 7142", home=None),
        "courses": [],
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]
import pandas as pd
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)
users_df = spark.createDataFrame(pd.DataFrame(users))
users_df.show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [0]:
## sort data in ascending order , sort data in descending order 
from pyspark.sql.functions import col
users_df.show()
users_df.sort(col('first_name')).show()
## note : this will sort data in ascending order 
## note : this will sort data in descending order

# Questions : * Sort the **data** in ascending order by **number of enrolled courses**.

##
users_df.select('id', 'courses').withColumn('no_of_courses', size('courses')).sort(size('courses')).show()
## sort(,ascending=False)
users_df.sort('first_name', ascending=False).show()

###########
users_df.sort(users_df['first_name'].desc()).show()

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

In [0]:
## note: * Sort the data in ascending order by **customer_from** with null values coming at the end.


users_df.select('id', 'customer_from').orderBy(users_df['customer_from'].asc_nulls_last()).show()

## asc_nulls_last ==== give null at last 
###desc_nulls_first == give null at fist 
users_df.select('id', 'customer_from'). orderBy(users_df['customer_from'].desc_nulls_first()).show()

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|   2021-01-15|
|  3|   2021-01-21|
|  2|   2021-02-14|
|  4|         null|
|  5|         null|
+---+-------------+

+---+-------------+
| id|customer_from|
+---+-------------+
|  5|         null|
|  4|         null|
|  2|   2021-02-14|
|  3|   2021-01-21|
|  1|   2021-01-15|
+---+-------------+



In [0]:
### sort by and orderBy 
# The difference between "order by" and "sort by" is that the former guarantees total order in the output while the latter only guarantees ordering of the rows within a reducer.

## composite sorting

In [0]:
courses = [{'course_id': 1,
  'course_name': '2020 Complete Python Bootcamp: From Zero to Hero in Python',
  'suitable_for': 'Beginner',
  'enrollment': 1100093,
  'stars': 4.6,
  'number_of_ratings': 318066},
 {'course_id': 4,
  'course_name': 'Angular - The Complete Guide (2020 Edition)',
  'suitable_for': 'Intermediate',
  'enrollment': 422557,
  'stars': 4.6,
  'number_of_ratings': 129984},
 {'course_id': 12,
  'course_name': 'Automate the Boring Stuff with Python Programming',
  'suitable_for': 'Advanced',
  'enrollment': 692617,
  'stars': 4.6,
  'number_of_ratings': 70508},
 {'course_id': 10,
  'course_name': 'Complete C# Unity Game Developer 2D',
  'suitable_for': 'Advanced',
  'enrollment': 364934,
  'stars': 4.6,
  'number_of_ratings': 78989},
 {'course_id': 5,
  'course_name': 'Java Programming Masterclass for Software Developers',
  'suitable_for': 'Advanced',
  'enrollment': 502572,
  'stars': 4.6,
  'number_of_ratings': 123798},
 {'course_id': 15,
  'course_name': 'Learn Python Programming Masterclass',
  'suitable_for': 'Advanced',
  'enrollment': 240790,
  'stars': 4.5,
  'number_of_ratings': 58677},
 {'course_id': 3,
  'course_name': 'Machine Learning A-Z™: Hands-On Python & R In Data Science',
  'suitable_for': 'Intermediate',
  'enrollment': 692812,
  'stars': 4.5,
  'number_of_ratings': 132228},
 {'course_id': 14,
  'course_name': 'Modern React with Redux [2020 Update]',
  'suitable_for': 'Intermediate',
  'enrollment': 203214,
  'stars': 4.7,
  'number_of_ratings': 60835},
 {'course_id': 8,
  'course_name': 'Python for Data Science and Machine Learning Bootcamp',
  'suitable_for': 'Intermediate',
  'enrollment': 387789,
  'stars': 4.6,
  'number_of_ratings': 87403},
 {'course_id': 6,
  'course_name': 'React - The Complete Guide (incl Hooks, React Router, Redux)',
  'suitable_for': 'Intermediate',
  'enrollment': 304670,
  'stars': 4.6,
  'number_of_ratings': 90964},
 {'course_id': 18,
  'course_name': 'Selenium WebDriver with Java -Basics to Advanced+Frameworks',
  'suitable_for': 'Advanced',
  'enrollment': 148562,
  'stars': 4.6,
  'number_of_ratings': 49947},
 {'course_id': 21,
  'course_name': 'Spring & Hibernate for Beginners (includes Spring Boot)',
  'suitable_for': 'Advanced',
  'enrollment': 177053,
  'stars': 4.6,
  'number_of_ratings': 45329},
 {'course_id': 7,
  'course_name': 'The Complete 2020 Web Development Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 270656,
  'stars': 4.7,
  'number_of_ratings': 88098},
 {'course_id': 9,
  'course_name': 'The Complete JavaScript Course 2020: Build Real Projects!',
  'suitable_for': 'Intermediate',
  'enrollment': 347979,
  'stars': 4.6,
  'number_of_ratings': 83521},
 {'course_id': 16,
  'course_name': 'The Complete Node.js Developer Course (3rd Edition)',
  'suitable_for': 'Advanced',
  'enrollment': 202922,
  'stars': 4.7,
  'number_of_ratings': 50885},
 {'course_id': 13,
  'course_name': 'The Complete Web Developer Course 2.0',
  'suitable_for': 'Intermediate',
  'enrollment': 273598,
  'stars': 4.5,
  'number_of_ratings': 63175},
 {'course_id': 11,
  'course_name': 'The Data Science Course 2020: Complete Data Science Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 325047,
  'stars': 4.5,
  'number_of_ratings': 76907},
 {'course_id': 20,
  'course_name': 'The Ultimate MySQL Bootcamp: Go from SQL Beginner to Expert',
  'suitable_for': 'Beginner',
  'enrollment': 203366,
  'stars': 4.6,
  'number_of_ratings': 45382},
 {'course_id': 2,
  'course_name': 'The Web Developer Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 596726,
  'stars': 4.6,
  'number_of_ratings': 182997},
 {'course_id': 19,
  'course_name': 'Unreal Engine C++ Developer: Learn C++ and Make Video Games',
  'suitable_for': 'Advanced',
  'enrollment': 229005,
  'stars': 4.5,
  'number_of_ratings': 45860},
 {'course_id': 17,
  'course_name': 'iOS 13 & Swift 5 - The Complete iOS App Development Bootcamp',
  'suitable_for': 'Advanced',
  'enrollment': 179598,
  'stars': 4.8,
  'number_of_ratings': 49972}]

In [0]:
from pyspark.sql import Row
courses_df = spark.createDataFrame([Row(**course) for course in courses])
courses_df.show(2)

+---------+--------------------+------------+----------+-----+-----------------+
|course_id|         course_name|suitable_for|enrollment|stars|number_of_ratings|
+---------+--------------------+------------+----------+-----+-----------------+
|        1|2020 Complete Pyt...|    Beginner|   1100093|  4.6|           318066|
|        4|Angular - The Com...|Intermediate|    422557|  4.6|           129984|
+---------+--------------------+------------+----------+-----+-----------------+
only showing top 2 rows



In [0]:
courses_df.dtypes
courses_df.sort(courses_df['suitable_for'], courses_df['number_of_ratings'].desc()).show()

+---------+--------------------+------------+----------+-----+-----------------+
|course_id|         course_name|suitable_for|enrollment|stars|number_of_ratings|
+---------+--------------------+------------+----------+-----+-----------------+
|        5|Java Programming ...|    Advanced|    502572|  4.6|           123798|
|       10|Complete C# Unity...|    Advanced|    364934|  4.6|            78989|
|       12|Automate the Bori...|    Advanced|    692617|  4.6|            70508|
|       15|Learn Python Prog...|    Advanced|    240790|  4.5|            58677|
|       16|The Complete Node...|    Advanced|    202922|  4.7|            50885|
|       17|iOS 13 & Swift 5 ...|    Advanced|    179598|  4.8|            49972|
|       18|Selenium WebDrive...|    Advanced|    148562|  4.6|            49947|
|       19|Unreal Engine C++...|    Advanced|    229005|  4.5|            45860|
|       21|Spring & Hibernat...|    Advanced|    177053|  4.6|            45329|
|        1|2020 Complete Pyt

## prioterizing the sorting of DataFrame

In [0]:
courses = [{'course_id': 1,
  'course_name': '2020 Complete Python Bootcamp: From Zero to Hero in Python',
  'suitable_for': 'Beginner',
  'enrollment': 1100093,
  'stars': 4.6,
  'number_of_ratings': 318066},
 {'course_id': 4,
  'course_name': 'Angular - The Complete Guide (2020 Edition)',
  'suitable_for': 'Intermediate',
  'enrollment': 422557,
  'stars': 4.6,
  'number_of_ratings': 129984},
 {'course_id': 12,
  'course_name': 'Automate the Boring Stuff with Python Programming',
  'suitable_for': 'Advanced',
  'enrollment': 692617,
  'stars': 4.6,
  'number_of_ratings': 70508},
 {'course_id': 10,
  'course_name': 'Complete C# Unity Game Developer 2D',
  'suitable_for': 'Advanced',
  'enrollment': 364934,
  'stars': 4.6,
  'number_of_ratings': 78989},
 {'course_id': 5,
  'course_name': 'Java Programming Masterclass for Software Developers',
  'suitable_for': 'Advanced',
  'enrollment': 502572,
  'stars': 4.6,
  'number_of_ratings': 123798},
 {'course_id': 15,
  'course_name': 'Learn Python Programming Masterclass',
  'suitable_for': 'Advanced',
  'enrollment': 240790,
  'stars': 4.5,
  'number_of_ratings': 58677},
 {'course_id': 3,
  'course_name': 'Machine Learning A-Z™: Hands-On Python & R In Data Science',
  'suitable_for': 'Intermediate',
  'enrollment': 692812,
  'stars': 4.5,
  'number_of_ratings': 132228},
 {'course_id': 14,
  'course_name': 'Modern React with Redux [2020 Update]',
  'suitable_for': 'Intermediate',
  'enrollment': 203214,
  'stars': 4.7,
  'number_of_ratings': 60835},
 {'course_id': 8,
  'course_name': 'Python for Data Science and Machine Learning Bootcamp',
  'suitable_for': 'Intermediate',
  'enrollment': 387789,
  'stars': 4.6,
  'number_of_ratings': 87403},
 {'course_id': 6,
  'course_name': 'React - The Complete Guide (incl Hooks, React Router, Redux)',
  'suitable_for': 'Intermediate',
  'enrollment': 304670,
  'stars': 4.6,
  'number_of_ratings': 90964},
 {'course_id': 18,
  'course_name': 'Selenium WebDriver with Java -Basics to Advanced+Frameworks',
  'suitable_for': 'Advanced',
  'enrollment': 148562,
  'stars': 4.6,
  'number_of_ratings': 49947},
 {'course_id': 21,
  'course_name': 'Spring & Hibernate for Beginners (includes Spring Boot)',
  'suitable_for': 'Advanced',
  'enrollment': 177053,
  'stars': 4.6,
  'number_of_ratings': 45329},
 {'course_id': 7,
  'course_name': 'The Complete 2020 Web Development Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 270656,
  'stars': 4.7,
  'number_of_ratings': 88098},
 {'course_id': 9,
  'course_name': 'The Complete JavaScript Course 2020: Build Real Projects!',
  'suitable_for': 'Intermediate',
  'enrollment': 347979,
  'stars': 4.6,
  'number_of_ratings': 83521},
 {'course_id': 16,
  'course_name': 'The Complete Node.js Developer Course (3rd Edition)',
  'suitable_for': 'Advanced',
  'enrollment': 202922,
  'stars': 4.7,
  'number_of_ratings': 50885},
 {'course_id': 13,
  'course_name': 'The Complete Web Developer Course 2.0',
  'suitable_for': 'Intermediate',
  'enrollment': 273598,
  'stars': 4.5,
  'number_of_ratings': 63175},
 {'course_id': 11,
  'course_name': 'The Data Science Course 2020: Complete Data Science Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 325047,
  'stars': 4.5,
  'number_of_ratings': 76907},
 {'course_id': 20,
  'course_name': 'The Ultimate MySQL Bootcamp: Go from SQL Beginner to Expert',
  'suitable_for': 'Beginner',
  'enrollment': 203366,
  'stars': 4.6,
  'number_of_ratings': 45382},
 {'course_id': 2,
  'course_name': 'The Web Developer Bootcamp',
  'suitable_for': 'Beginner',
  'enrollment': 596726,
  'stars': 4.6,
  'number_of_ratings': 182997},
 {'course_id': 19,
  'course_name': 'Unreal Engine C++ Developer: Learn C++ and Make Video Games',
  'suitable_for': 'Advanced',
  'enrollment': 229005,
  'stars': 4.5,
  'number_of_ratings': 45860},
 {'course_id': 17,
  'course_name': 'iOS 13 & Swift 5 - The Complete iOS App Development Bootcamp',
  'suitable_for': 'Advanced',
  'enrollment': 179598,
  'stars': 4.8,
  'number_of_ratings': 49972}]
from pyspark.sql import Row
courses_df = spark.createDataFrame([Row(**course) for course in courses])
from pyspark.sql.functions import col, when

In [0]:
course_level = when(col('suitable_for') == 'Beginner', 0).otherwise(when(col('suitable_for') == 'Intermediate', 1).otherwise(2))

In [0]:
# Using expr and then case and when

from pyspark.sql.functions import expr

course_level = expr("""
    CASE 
        WHEN suitable_for = 'Beginner'
            THEN 0
        WHEN suitable_for = 'Intermediate'
            THEN 1
        ELSE 2
    END
""")

courses_df. \
    sort(course_level, col('number_of_ratings').desc()). \
    show()

+---------+--------------------+------------+----------+-----+-----------------+
|course_id|         course_name|suitable_for|enrollment|stars|number_of_ratings|
+---------+--------------------+------------+----------+-----+-----------------+
|        1|2020 Complete Pyt...|    Beginner|   1100093|  4.6|           318066|
|        2|The Web Developer...|    Beginner|    596726|  4.6|           182997|
|        7|The Complete 2020...|    Beginner|    270656|  4.7|            88098|
|       11|The Data Science ...|    Beginner|    325047|  4.5|            76907|
|       20|The Ultimate MySQ...|    Beginner|    203366|  4.6|            45382|
|        3|Machine Learning ...|Intermediate|    692812|  4.5|           132228|
|        4|Angular - The Com...|Intermediate|    422557|  4.6|           129984|
|        6|React - The Compl...|Intermediate|    304670|  4.6|            90964|
|        8|Python for Data S...|Intermediate|    387789|  4.6|            87403|
|        9|The Complete Java