In [2]:
import findspark

findspark.init()
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, BooleanType, ArrayType, MapType
from pyspark.sql.functions import col, struct, when, lit, sum, expr, array_contains, udf, upper, explode, row_number, rank, dense_rank, lead, current_date, date_format, to_date, datediff, from_json, to_json, json_tuple, get_json_object, collect_set
from pyspark.sql.window import Window
import json
import time

In [4]:
spark = SparkSession.builder.master('local[1]').appName('ali_spark_cond').getOrCreate()

In [8]:
df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
df2.agg(collect_set('age')).show()

+----------------+
|collect_set(age)|
+----------------+
|          [5, 2]|
+----------------+



In [26]:
spark = SparkSession.builder.master('local[1]').appName('appv1').getOrCreate()

In [5]:
datalist = [
    ('ali', 26),
    ('haris', 25)
]
rdd = spark.sparkContext.parallelize(datalist)

In [6]:
rdd = spark.sparkContext.textFile('./aws_command_emr.txt')

In [9]:
data = [('James', '', 'Smith', '1991-04-01', 'M', 3000),
        ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
        ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
        ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
        ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1)]

columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]
df = spark.createDataFrame(data=data, schema=columns)
print(df.printSchema())
df.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

None
+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [10]:
df.withColumn(
    'cond_y',
    collect_set('salary').over(Window.partitionBy('gender'))
).show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|      cond_y|
+---------+----------+--------+----------+------+------+------------+
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|  [-1, 4000]|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  [-1, 4000]|
|    James|          |   Smith|1991-04-01|     M|  3000|[3000, 4000]|
|  Michael|      Rose|        |2000-05-19|     M|  4000|[3000, 4000]|
|   Robert|          |Williams|1978-09-05|     M|  4000|[3000, 4000]|
+---------+----------+--------+----------+------+------+------------+



In [12]:
df_cities = spark.read.csv('./cities.csv', header=True)
print(df_cities.printSchema())
df_cities.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- start_year: string (nullable = true)
 |-- url_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_state: string (nullable = true)

None
+---+----------+--------------------+----------+----------+----------------+-------------+
| id|      name|              coords|start_year|  url_name|         country|country_state|
+---+----------+--------------------+----------+----------+----------------+-------------+
|  5|  Aberdeen|  POINT(-2.15 57.15)|      2017|  aberdeen|        Scotland|         null|
|  6|  Adelaide|POINT(138.6 -34.9...|      2017|  adelaide|       Australia|         null|
|  7|   Algiers|POINT(3 36.83333333)|      2017|   algiers|         Algeria|         null|
|  9|    Ankara|POINT(32.91666667...|      2017|    ankara|          Turkey|         null|
| 16|     Belém|POINT(-48.4833333...|      2017|     belem|          Brazil|  

In [13]:
df_cities.createOrReplaceTempView("CITIES")

In [20]:
df_sql_cities = spark.sql('SELECT * FROM cities')
print(df_sql_cities.printSchema())
df.show(n=3)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- coords: string (nullable = true)
 |-- start_year: string (nullable = true)
 |-- url_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- country_state: string (nullable = true)

None
+---+--------+--------------------+----------+--------+---------+-------------+
| id|    name|              coords|start_year|url_name|  country|country_state|
+---+--------+--------------------+----------+--------+---------+-------------+
|  5|Aberdeen|  POINT(-2.15 57.15)|      2017|aberdeen| Scotland|         null|
|  6|Adelaide|POINT(138.6 -34.9...|      2017|adelaide|Australia|         null|
|  7| Algiers|POINT(3 36.83333333)|      2017| algiers|  Algeria|         null|
+---+--------+--------------------+----------+--------+---------+-------------+
only showing top 3 rows



In [23]:
spark.sql('''
    select 
        country, 
        count(*) as locations_count
    from 
        cities 
    group by 
        country
    order by
        2 desc
''').show(n=5)

+-------------+---------------+
|      country|locations_count|
+-------------+---------------+
|United States|            115|
|       France|             71|
|       Canada|             14|
|        Spain|              8|
|      England|              8|
+-------------+---------------+
only showing top 5 rows



In [25]:
emptyRdd = spark.sparkContext.emptyRDD()
emptyRddParal = spark.sparkContext.parallelize([])
print(emptyRdd)
print(emptyRddParal)

EmptyRDD[102] at emptyRDD at NativeMethodAccessorImpl.java:0
ParallelCollectionRDD[103] at parallelize at PythonRDD.scala:195


In [29]:
# Create an empty rdd but with schema
emptyRdd = spark.sparkContext.emptyRDD()
schema = StructType([
    StructField('Name', StringType(), nullable=False),
    StructField('Age', IntegerType(), nullable=False),
    StructField('isMale', BooleanType(), nullable=False),
])
emptyRdd_students = spark.createDataFrame(emptyRdd, schema)
emptyRdd

EmptyRDD[105] at emptyRDD at NativeMethodAccessorImpl.java:0

In [33]:
# converting rdd to df
emptyRdd = spark.sparkContext.emptyRDD()
schema = StructType([
    StructField('Name', StringType(), nullable=False),
    StructField('Age', IntegerType(), nullable=False),
    StructField('isMale', BooleanType(), nullable=False),
])
df_1 = emptyRdd.toDF(schema)
df_2 = spark.createDataFrame([], schema)
df_2_with_empty_schema = spark.createDataFrame([], StructType([]))
print(df_1.printSchema())
print(df_2.printSchema())
print(df_2_with_empty_schema.printSchema())

root
 |-- Name: string (nullable = false)
 |-- Age: integer (nullable = false)
 |-- isMale: boolean (nullable = false)

None
root
 |-- Name: string (nullable = false)
 |-- Age: integer (nullable = false)
 |-- isMale: boolean (nullable = false)

None
root

None


In [11]:
dept = [
    ('Business Intelligence', 15),
    ('Data Engineering', 35),
    ('Software Engineering', 62),
]
dept_df = spark.sparkContext.parallelize(dept)
# dept_df.collect()
df = dept_df.toDF(['Department', 'no_of_employees'])
print(df.printSchema())
df.show(truncate=False)

root
 |-- Department: string (nullable = true)
 |-- no_of_employees: long (nullable = true)

None
+---------------------+---------------+
|Department           |no_of_employees|
+---------------------+---------------+
|Business Intelligence|15             |
|Data Engineering     |35             |
|Software Engineering |62             |
+---------------------+---------------+



In [12]:
dept = [
    ('Business Intelligence', 15),
    ('Data Engineering', 35),
    ('Software Engineering', 62),
]
dept_rdd = spark.sparkContext.parallelize(dept)
dept_df = spark.createDataFrame(dept_rdd, schema=['Department', 'no_of_employees'])
print(dept_df.printSchema())
dept_df.show(truncate=False)

root
 |-- Department: string (nullable = true)
 |-- no_of_employees: long (nullable = true)

None
+---------------------+---------------+
|Department           |no_of_employees|
+---------------------+---------------+
|Business Intelligence|15             |
|Data Engineering     |35             |
|Software Engineering |62             |
+---------------------+---------------+



In [13]:
dept = [
    ('Business Intelligence', 15),
    ('Data Engineering', 35),
    ('Software Engineering', 62),
]
dept_rdd = spark.sparkContext.parallelize(dept)
schema = StructType([
    StructField('Dept_name', StringType(), nullable=False),
    StructField('No_employees', IntegerType(), nullable=False)
])
dept_df = spark.createDataFrame(dept_rdd, schema)
print(dept_df.printSchema())
dept_df.show(truncate=False)

root
 |-- Dept_name: string (nullable = false)
 |-- No_employees: integer (nullable = false)

None
+---------------------+------------+
|Dept_name            |No_employees|
+---------------------+------------+
|Business Intelligence|15          |
|Data Engineering     |35          |
|Software Engineering |62          |
+---------------------+------------+



In [15]:
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

columns = ["first_name","middle_name","last_name","dob","gender","salary"]
employees_df = spark.createDataFrame(data, columns)
print(employees_df.printSchema())
# employees_df.show(truncate=False)
employees_df.toPandas()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

None


Unnamed: 0,first_name,middle_name,last_name,dob,gender,salary
0,James,,Smith,36636.0,M,60000
1,Michael,Rose,,40288.0,M,70000
2,Robert,,Williams,42114.0,,400000
3,Maria,Anne,Jones,39192.0,F,500000
4,Jen,Mary,Brown,,F,0


In [18]:
dataStruct = [(("James", "", "Smith"), "36636", "M", "3000"),
              (("Michael", "Rose", ""), "40288", "M", "4000"),
              (("Robert", "", "Williams"), "42114", "M", "4000"),
              (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
              (("Jen", "Mary", "Brown"), "", "F", "-1")]
schema = StructType([
    StructField(
        'Name',
        StructType([
            StructField('first_name', StringType()),
            StructField('middle_name', StringType()),
            StructField('last_name', StringType())
        ])),
    StructField(
        'dob',
        StringType(),
    ),
    StructField(
        'gender',
        StringType(),
    ),
    StructField(
        'salary',
        StringType(),
    ),
])
employees_nested_df = spark.createDataFrame(dataStruct, schema)
print(employees_nested_df.printSchema())
employees_nested_df.show(truncate=False)
employees_nested_df.toPandas()

root
 |-- Name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

None
+--------------------+-----+------+------+
|Name                |dob  |gender|salary|
+--------------------+-----+------+------+
|[James, , Smith]    |36636|M     |3000  |
|[Michael, Rose, ]   |40288|M     |4000  |
|[Robert, , Williams]|42114|M     |4000  |
|[Maria, Anne, Jones]|39192|F     |4000  |
|[Jen, Mary, Brown]  |     |F     |-1    |
+--------------------+-----+------+------+



Unnamed: 0,Name,dob,gender,salary
0,"(James, , Smith)",36636.0,M,3000
1,"(Michael, Rose, )",40288.0,M,4000
2,"(Robert, , Williams)",42114.0,M,4000
3,"(Maria, Anne, Jones)",39192.0,F,4000
4,"(Jen, Mary, Brown)",,F,-1


In [19]:
employees_nested_df.show(n=2, truncate=25, vertical=True)

-RECORD 0-------------------
 Name   | [James, , Smith]  
 dob    | 36636             
 gender | M                 
 salary | 3000              
-RECORD 1-------------------
 Name   | [Michael, Rose, ] 
 dob    | 40288             
 gender | M                 
 salary | 4000              
only showing top 2 rows



In [39]:
structureData = [(("Muhammad", "", "Ali"), "36636", "M", 3100, ['swiming', 'gaming'], {'math':92, 'english': 32}),
                 (("Johncena", "Uncle", ""), "40288", "M", 4300, ['cricket'], {'english': 63}),
                 (("Muhammad", "", "Talhah"), "42114", "M", 1400, ['gaming'], {'science': 98}),
                 (("Syeda", "Eraj", "Rizvi"), "39192", "F", 5500, ['movies'], {'computer science': 92, 'math': 99}),
                 (("Jen", "Mary", "Brown"), "", "F", -1, ['--', 'na'], {'super': 120})]
schema = StructType([
    StructField(
        'Name',
        StructType([
            StructField('first_name', StringType()),
            StructField('middle_name', StringType()),
            StructField('last_name', StringType())
        ])),
    StructField(
        'id',
        StringType(),
    ),
    StructField(
        'gender',
        StringType(),
    ),
    StructField(
        'salary',
        IntegerType(),
    ),
    StructField(
        'hobbies',
        ArrayType(StringType())
    ),
    StructField(
        'properties',
        MapType(StringType(), IntegerType())
    )
])
employees_nested_df = spark.createDataFrame(structureData, schema)
employees_nested_df.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



In [40]:
employees_nested_df_updated_df = employees_nested_df.withColumn(
    'moreinfo',
    struct(
        col('id').alias('identifier'),
        col('gender').alias('gender'),
        col('salary').alias('salary'),
        when(col('salary').cast(IntegerType()) < 2000, 'low').when(
            col('salary').cast(IntegerType()) < 4000,
            'medium').otherwise('high').alias('salary_grade'))).drop(
                'id', 'gender', 'salary')
employees_nested_df_updated_df.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- hobbies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)
 |-- moreinfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- salary_grade: string (nullable = false)



In [41]:
employees_nested_df_updated_df.toPandas()

Unnamed: 0,Name,hobbies,properties,moreinfo
0,"(Muhammad, , Ali)","[swiming, gaming]","{'english': 32, 'math': 92}","(36636, M, 3100, medium)"
1,"(Johncena, Uncle, )",[cricket],{'english': 63},"(40288, M, 4300, high)"
2,"(Muhammad, , Talhah)",[gaming],{'science': 98},"(42114, M, 1400, low)"
3,"(Syeda, Eraj, Rizvi)",[movies],"{'computer science': 92, 'math': 99}","(39192, F, 5500, high)"
4,"(Jen, Mary, Brown)","[--, na]",{'super': 120},"(, F, -1, low)"


In [4]:
colobj = lit('mynameisali')
colobj

Column<b'mynameisali'>

In [22]:
data = [('ali', 26), ('faisal', 31)]
df = spark.createDataFrame(data).toDF('name', 'gender')
# df.printSchema()
df.select(df.gender).show()

+------+
|gender|
+------+
|    26|
|    31|
+------+



In [23]:
df.select(col('name')).show()

+------+
|  name|
+------+
|   ali|
|faisal|
+------+



In [31]:
data = [
    Row(name="James", prop=Row(hair="black", eye="blue")),
    Row(name="Ann", prop=Row(hair="grey", eye="black"))
]
df = spark.createDataFrame(data)
df.show()
df.select(col('name'), col('prop.eye')).show()

+-----+-------------+
| name|         prop|
+-----+-------------+
|James|[blue, black]|
|  Ann|[black, grey]|
+-----+-------------+

+-----+-----+
| name|  eye|
+-----+-----+
|James| blue|
|  Ann|black|
+-----+-----+



In [39]:
data = [('math', 92), ('english', 98)]
df = spark.createDataFrame(data, ['subject','marks'])
df.show()
df.select(sum(col('marks'))).show()
df.select((sum(col('marks')) == 90).alias('iam90')).show()

+-------+-----+
|subject|marks|
+-------+-----+
|   math|   92|
|english|   98|
+-------+-----+

+----------+
|sum(marks)|
+----------+
|       190|
+----------+

+-----+
|iam90|
+-----+
|false|
+-----+



In [8]:
data=[("James","Bond","100",None, None),
      ("Ann","Varsa","200",'F', 'Check'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)
# df.select(expr("fname || ' ' || lname").alias('full_name')).show()

In [45]:
df.select(col('fname'), col('id')).sort(col('id').asc()).show()

+----------+---+
|     fname| id|
+----------+---+
|     James|100|
|       Ann|200|
|Tom Cruise|400|
| Tom Brand|400|
+----------+---+



In [50]:
df.select(col('fname'), col('id').cast('int')).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [54]:
# if you don't specify cast it will automatically convert into integer explicitly
df.select(col('fname'), col('lname'), col('id')).filter(col('id').cast('int').between(200, 500)).show()

+----------+-----+---+
|     fname|lname| id|
+----------+-----+---+
|       Ann|Varsa|200|
|Tom Cruise|  XXX|400|
| Tom Brand| null|400|
+----------+-----+---+



In [61]:
df.select(expr("fname || ' ' || coalesce(lname, '')").alias('full_name')).filter(col('fname').contains('Tom')).show()

+--------------+
|     full_name|
+--------------+
|Tom Cruise XXX|
|    Tom Brand |
+--------------+



In [63]:
df.filter(col('lname').isNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| null|400|     M|
+---------+-----+---+------+



In [64]:
df.filter(col('lname').isNotNull()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [76]:
df.filter(df.fname.like("%es")).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  null|
+-----+-----+---+------+



In [77]:
df.select(
    col('fname'), col('lname'),
    when(col('gender') == 'M',
         'Male').when(col('gender') == 'F', 'Female').otherwise(
             col('gender')).alias('genderkanayacolumn')).show()

+----------+-----+------------------+
|     fname|lname|genderkanayacolumn|
+----------+-----+------------------+
|     James| Bond|              null|
|       Ann|Varsa|            Female|
|Tom Cruise|  XXX|                  |
| Tom Brand| null|              Male|
+----------+-----+------------------+



In [78]:
df.filter(df.id.isin([10, 200])).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [3]:
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
        StructField('name', StructType([
            StructField('fname', StringType(), True),
            StructField('lname', StringType(), True)])),
        StructField('languages', ArrayType(StringType()),True),
        StructField('properties', MapType(StringType(),StringType()),True)
     ])
df=spark.createDataFrame(data,schema)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [5]:
df.select(col('name').getField('fname')).show()

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



In [6]:
df.select(col('properties').getField('eye')).show()

+---------------+
|properties[eye]|
+---------------+
|          brown|
|          black|
|           grey|
|           blue|
+---------------+



In [7]:
df.select(col('properties').getItem('eye')).show()

+---------------+
|properties[eye]|
+---------------+
|          brown|
|          black|
|           grey|
|           blue|
+---------------+



In [8]:
df.select(col('languages').getItem(1)).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+



In [10]:
columns = ['name', 'languages', 'properties']
df.select(*columns).show()

+--------------+---------------+--------------------+
|          name|      languages|          properties|
+--------------+---------------+--------------------+
| [James, Bond]|     [Java, C#]|[eye -> brown, ha...|
|  [Ann, Varsa]| [.NET, Python]|[eye -> black, ha...|
|[Tom Cruise, ]|[Python, Scala]|[eye -> grey, hai...|
|  [Tom Brand,]|   [Perl, Ruby]|[eye -> blue, hai...|
+--------------+---------------+--------------------+



In [20]:
df.select([i for i in columns if i == 'name']).show()

+--------------+
|          name|
+--------------+
| [James, Bond]|
|  [Ann, Varsa]|
|[Tom Cruise, ]|
|  [Tom Brand,]|
+--------------+



In [21]:
df.select('*').show()

+--------------+---------------+--------------------+
|          name|      languages|          properties|
+--------------+---------------+--------------------+
| [James, Bond]|     [Java, C#]|[eye -> brown, ha...|
|  [Ann, Varsa]| [.NET, Python]|[eye -> black, ha...|
|[Tom Cruise, ]|[Python, Scala]|[eye -> grey, hai...|
|  [Tom Brand,]|   [Perl, Ruby]|[eye -> blue, hai...|
+--------------+---------------+--------------------+



In [26]:
df.select(df.columns[1:2]).show() # only show languages

+---------------+
|      languages|
+---------------+
|     [Java, C#]|
| [.NET, Python]|
|[Python, Scala]|
|   [Perl, Ruby]|
+---------------+



In [28]:
dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.show(truncate=False)
deptDF.collect() # collect data from all nodes to driver node but used for smaller datasets, kindly avoid on larger datasets

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



[Row(dept_name='Finance', dept_id=10),
 Row(dept_name='Marketing', dept_id=20),
 Row(dept_name='Sales', dept_id=30),
 Row(dept_name='IT', dept_id=40)]

In [30]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]
columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)

In [31]:
df.withColumn('salary', col('salary').cast('int')).printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [32]:
df.withColumn('salary', col('salary') / 10).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M| 300.0|
|  Michael|      Rose|        |2000-05-19|     M| 400.0|
|   Robert|          |Williams|1978-09-05|     M| 400.0|
|    Maria|      Anne|   Jones|1967-12-01|     F| 400.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -0.1|
+---------+----------+--------+----------+------+------+



In [37]:
df = df.withColumn('pf', col('salary') * 0.05)
df.show()

+---------+----------+--------+----------+------+------+----+-----+
|firstname|middlename|lastname|       dob|gender|salary|eobi|   pf|
+---------+----------+--------+----------+------+------+----+-----+
|    James|          |   Smith|1991-04-01|     M|  3000| 250|150.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000| 250|200.0|
|   Robert|          |Williams|1978-09-05|     M|  4000| 250|200.0|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000| 250|200.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1| 250|-0.05|
+---------+----------+--------+----------+------+------+----+-----+



In [39]:
df = df.withColumn('eobi', lit(50))
df.show()

+---------+----------+--------+----------+------+------+----+-----+
|firstname|middlename|lastname|       dob|gender|salary|eobi|   pf|
+---------+----------+--------+----------+------+------+----+-----+
|    James|          |   Smith|1991-04-01|     M|  3000|  50|150.0|
|  Michael|      Rose|        |2000-05-19|     M|  4000|  50|200.0|
|   Robert|          |Williams|1978-09-05|     M|  4000|  50|200.0|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|  50|200.0|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|  50|-0.05|
+---------+----------+--------+----------+------+------+----+-----+



In [40]:
df = df.withColumnRenamed('gender', 'sex')
df.show()

+---------+----------+--------+----------+---+------+----+-----+
|firstname|middlename|lastname|       dob|sex|salary|eobi|   pf|
+---------+----------+--------+----------+---+------+----+-----+
|    James|          |   Smith|1991-04-01|  M|  3000|  50|150.0|
|  Michael|      Rose|        |2000-05-19|  M|  4000|  50|200.0|
|   Robert|          |Williams|1978-09-05|  M|  4000|  50|200.0|
|    Maria|      Anne|   Jones|1967-12-01|  F|  4000|  50|200.0|
|      Jen|      Mary|   Brown|1980-02-17|  F|    -1|  50|-0.05|
+---------+----------+--------+----------+---+------+----+-----+



In [44]:
df.drop('middlename', 'salary', 'pf').show()

+---------+--------+----------+---+----+
|firstname|lastname|       dob|sex|eobi|
+---------+--------+----------+---+----+
|    James|   Smith|1991-04-01|  M|  50|
|  Michael|        |2000-05-19|  M|  50|
|   Robert|Williams|1978-09-05|  M|  50|
|    Maria|   Jones|1967-12-01|  F|  50|
|      Jen|   Brown|1980-02-17|  F|  50|
+---------+--------+----------+---+----+



In [66]:
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]
schema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])
df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [67]:
df.withColumnRenamed('dob', 'Date of Birth').show()

+--------------------+-------------+------+------+
|                name|Date of Birth|gender|salary|
+--------------------+-------------+------+------+
|    [James, , Smith]|   1991-04-01|     M|  3000|
|   [Michael, Rose, ]|   2000-05-19|     M|  4000|
|[Robert, , Williams]|   1978-09-05|     M|  4000|
|[Maria, Anne, Jones]|   1967-12-01|     F|  4000|
|  [Jen, Mary, Brown]|   1980-02-17|     F|    -1|
+--------------------+-------------+------+------+



In [68]:
df.withColumnRenamed('dob', 'Date of Birth').withColumnRenamed('salary', 'salary_amount').show()

+--------------------+-------------+------+-------------+
|                name|Date of Birth|gender|salary_amount|
+--------------------+-------------+------+-------------+
|    [James, , Smith]|   1991-04-01|     M|         3000|
|   [Michael, Rose, ]|   2000-05-19|     M|         4000|
|[Robert, , Williams]|   1978-09-05|     M|         4000|
|[Maria, Anne, Jones]|   1967-12-01|     F|         4000|
|  [Jen, Mary, Brown]|   1980-02-17|     F|           -1|
+--------------------+-------------+------+-------------+



In [69]:
df.select('*', col('name.firstname').alias('fname')).show()

+--------------------+----------+------+------+-------+
|                name|       dob|gender|salary|  fname|
+--------------------+----------+------+------+-------+
|    [James, , Smith]|1991-04-01|     M|  3000|  James|
|   [Michael, Rose, ]|2000-05-19|     M|  4000|Michael|
|[Robert, , Williams]|1978-09-05|     M|  4000| Robert|
|[Maria, Anne, Jones]|1967-12-01|     F|  4000|  Maria|
|  [Jen, Mary, Brown]|1980-02-17|     F|    -1|    Jen|
+--------------------+----------+------+------+-------+



In [3]:
data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]
        
schema = StructType([
     StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
         StructField('lastname', StringType(), True)
     ])),
     StructField('languages', ArrayType(StringType()), True),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
 ])

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Anna, Rose, ]        |[Spark, Java, C++]|NY   |F     |
|[Julia, , Williams]   |[CSharp, VB]      |OH   |F     |
|[Maria, Anne, Jones]  |[CSharp, VB]      |NY   |M     |
|[Jen, Mary, Brown]    |[CSharp, VB]      |NY   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [4]:
df.filter(df.state == 'NY').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [5]:
df.filter(array_contains('languages', 'Java')).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|   OH|     M|
|  [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+



In [7]:
df.filter(df.name.firstname.contains('J')).show()

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|   [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|[Julia, , Williams]|      [CSharp, VB]|   OH|     F|
| [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
+-------------------+------------------+-----+------+



In [8]:
data = [("James", "Sales", 3000), \
    ("Michael", "Sales", 4600), \
    ("Robert", "Sales", 4100), \
    ("Maria", "Finance", 3000), \
    ("James", "Sales", 3000), \
    ("Scott", "Finance", 3300), \
    ("Jen", "Finance", 3900), \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000), \
    ("Saif", "Sales", 4100) \
  ]

# Create DataFrame
columns= ["employee_name", "department", "salary"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [10]:
df.distinct().count()

9

In [11]:
df.distinct().show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|          Jen|   Finance|  3900|
|      Michael|     Sales|  4600|
|        Scott|   Finance|  3300|
|        Kumar| Marketing|  2000|
|        James|     Sales|  3000|
|       Robert|     Sales|  4100|
|         Jeff| Marketing|  3000|
|         Saif|     Sales|  4100|
|        Maria|   Finance|  3000|
+-------------+----------+------+



In [12]:
df.drop_duplicates().show()

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|          Jen|   Finance|  3900|
|      Michael|     Sales|  4600|
|        Scott|   Finance|  3300|
|        Kumar| Marketing|  2000|
|        James|     Sales|  3000|
|       Robert|     Sales|  4100|
|         Jeff| Marketing|  3000|
|         Saif|     Sales|  4100|
|        Maria|   Finance|  3000|
+-------------+----------+------+



In [16]:
df.drop_duplicates(['department', 'salary']).select('department', 'salary').show()

+----------+------+
|department|salary|
+----------+------+
|     Sales|  4600|
|     Sales|  4100|
|   Finance|  3900|
|   Finance|  3000|
|   Finance|  3300|
| Marketing|  2000|
|     Sales|  3000|
| Marketing|  3000|
+----------+------+



In [17]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [18]:
df.sort('salary', 'age').show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+



In [19]:
df.sort(df.salary.desc()).show() # Raman CIVIC WALA LONDA PAISA BOHAT HY 

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|          Jen|   Finance|   NY| 79000| 53|15000|
+-------------+----------+-----+------+---+-----+



In [21]:
df.orderBy(col('salary'), col('age').desc()).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+



In [22]:
df.createOrReplaceTempView('Employees')

In [23]:
spark.sql('''
    SELECT 
        department, 
        count(*) no_employees 
    FROM
        employees
    GROUP BY
        department
    ORDER BY
        2 DESC
''').show()

+----------+------------+
|department|no_employees|
+----------+------------+
|   Finance|           4|
|     Sales|           3|
| Marketing|           2|
+----------+------------+



In [24]:
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [38]:
df.groupBy('department').sum('salary').show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+



In [40]:
df.groupBy('department').avg('salary').show()

+----------+-----------------+
|department|      avg(salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+



In [41]:
df.groupBy('department').mean('salary').show()

+----------+-----------------+
|department|      avg(salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+



In [43]:
df.groupBy('department').mean('salary', 'bonus').show()

+----------+-----------------+------------------+
|department|      avg(salary)|        avg(bonus)|
+----------+-----------------+------------------+
|     Sales|85666.66666666667|17666.666666666668|
|   Finance|          87750.0|           20250.0|
| Marketing|          85500.0|           19500.0|
+----------+-----------------+------------------+



In [47]:
df.groupBy('department').agg(
    sum('salary').alias('salary'),
    sum('bonus').alias('bonus')).orderBy(col('salary').desc()).show()

+----------+------+-----+
|department|salary|bonus|
+----------+------+-----+
|   Finance|351000|81000|
|     Sales|257000|53000|
| Marketing|171000|39000|
+----------+------+-----+



In [51]:
df.groupBy('department').agg(
    sum('salary').alias('salary'),
    sum('bonus').alias('bonus')).where(col('salary') > 200000).orderBy(
        col('salary').desc()).show()

+----------+------+-----+
|department|salary|bonus|
+----------+------+-----+
|   Finance|351000|81000|
|     Sales|257000|53000|
+----------+------+-----+



In [3]:
emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

empDF = spark.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
empDF.show(truncate=False)

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+-----

In [6]:
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id).show(truncate=False) # by default inner join

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [8]:
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id, how='outer').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [9]:
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id, how='left').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [10]:
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id, how='right').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [11]:
# take inner join, select only left side columns
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id, how='leftsemi').show(truncate=False) 

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



In [13]:
# take left join but where right table have null in it's column
empDF.join(deptDF, on=empDF.emp_dept_id == deptDF.dept_id, how='leftanti').show(truncate=False)

+------+-----+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|6     |Brown|2              |2010       |50         |      |-1    |
+------+-----+---------------+-----------+-----------+------+------+



In [16]:
empDF.alias('emp1').join(
    empDF.alias('emp2'),
    on=col('emp1.emp_id') == col('emp2.superior_emp_id')).select(
        col('emp1.emp_id').alias('Emp_id'),
        col('emp1.name').alias('Emp_name'),
        col('emp2.superior_emp_id').alias('Superior_id'),
        col('emp2.name').alias('Superior_name')).show(truncate=False)

+------+--------+-----------+-------------+
|Emp_id|Emp_name|Superior_id|Superior_name|
+------+--------+-----------+-------------+
|1     |Smith   |1          |Rose         |
|1     |Smith   |1          |Williams     |
|2     |Rose    |2          |Jones        |
|2     |Rose    |2          |Brown        |
|2     |Rose    |2          |Brown        |
+------+--------+-----------+-------------+



In [25]:
empDF.createOrReplaceTempView('employees')
deptDF.createOrReplaceTempView('department')

In [33]:
spark.sql('''
    SELECT
        dept_name,
        COUNT(*) no_of_employees
    FROM
        employees e JOIN department d
    ON
        e.emp_dept_id = d.dept_id
    GROUP BY
        dept_name
''').show()

+---------+---------------+
|dept_name|no_of_employees|
+---------+---------------+
|  Finance|              3|
|Marketing|              1|
|       IT|              1|
+---------+---------------+



In [34]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

df2 = spark.createDataFrame(data = simpleData2, schema = columns2)

df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----

In [35]:
df.union(df2).show() # including duplicates

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [36]:
df.union(df2).distinct().show() # excluding duplicates

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [48]:
# Create DataFrame df1 with columns name, and id
data = [("James",34), ("Michael",56), \
        ("Robert",30), ("Maria",24)
       ]

df1 = spark.createDataFrame(data = data, schema=["name","id"])
df1.printSchema()
df1.show()
# Create DataFrame df2 with columns name and id
# data2=[(34,"James", "chacha"),(45,"Maria", "anty"), \
#        (45,"Jen", "khala"),(34,"Jeff", "phupo")]
data2=[(34,"James"),(45,"Maria"), \
       (45,"Jen"),(34,"Jeff")]
# df2 = spark.createDataFrame(data = data2, schema = ["id","name", "relation"])
df2 = spark.createDataFrame(data = data2, schema = ["id","name"])
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)

+-------+---+
|   name| id|
+-------+---+
|  James| 34|
|Michael| 56|
| Robert| 30|
|  Maria| 24|
+-------+---+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+-----+
| id| name|
+---+-----+
| 34|James|
| 45|Maria|
| 45|  Jen|
| 34| Jeff|
+---+-----+



In [49]:
df1.unionByName(df2).show() # allowMissingColumns=True in spark 3.1

+-------+---+
|   name| id|
+-------+---+
|  James| 34|
|Michael| 56|
| Robert| 30|
|  Maria| 24|
|  James| 34|
|  Maria| 45|
|    Jen| 45|
|   Jeff| 34|
+-------+---+



In [3]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- Seqno: string (nullable = true)
 |-- Name: string (nullable = true)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [4]:
convert_case = lambda str: str[0].upper() + str[1:]
convertUdf = udf(convert_case,
                 StringType())  # default return type is string type
df.select(col('Seqno'), convertUdf(col('Name'))).show()

+-----+--------------+
|Seqno|<lambda>(Name)|
+-----+--------------+
|    1|    John jones|
|    2|  Tracey smith|
|    3|   Amy sanders|
+-----+--------------+



In [5]:
upperUdf = udf(str.upper)
df.withColumn('Name in (Upper Case)', upperUdf(col('Name'))).show()

+-----+------------+--------------------+
|Seqno|        Name|Name in (Upper Case)|
+-----+------------+--------------------+
|    1|  john jones|          JOHN JONES|
|    2|tracey smith|        TRACEY SMITH|
|    3| amy sanders|         AMY SANDERS|
+-----+------------+--------------------+



In [6]:
convert_case = lambda str: str[0].upper() + str[1:]
upper_case = lambda x: x.upper()
spark.udf.register('convertUDF', convert_case)
spark.udf.register('to_upper', upper_case)
df.createOrReplaceTempView('customers')
spark.sql('''
    SELECT
        Seqno,
        convertUDF(name) Name_in_title,
        to_upper(name) Name_in_uppercase
    FROM
        customers
''').show()

+-----+-------------+-----------------+
|Seqno|Name_in_title|Name_in_uppercase|
+-----+-------------+-----------------+
|    1|   John jones|       JOHN JONES|
|    2| Tracey smith|     TRACEY SMITH|
|    3|  Amy sanders|      AMY SANDERS|
+-----+-------------+-----------------+



In [7]:
@udf(returnType=StringType())
def convert_every_word_first_lettter_to_capital(s):
    new_split = s.split(' ')
    new_str = ''
    return ' '.join([i[0].upper() + i[1:] for i in new_split])


df.withColumn('Every Letter Is Capital',
              convert_every_word_first_lettter_to_capital(col('name'))).show()

+-----+------------+-----------------------+
|Seqno|        Name|Every Letter Is Capital|
+-----+------------+-----------------------+
|    1|  john jones|             John Jones|
|    2|tracey smith|           Tracey Smith|
|    3| amy sanders|            Amy Sanders|
+-----+------------+-----------------------+



In [8]:
convert_case = lambda str: str[0].upper() + str[1:]
upper_case = lambda x: x.upper()
spark.udf.register('convertUDF', convert_case)
spark.udf.register('to_upper', upper_case)
df.createOrReplaceTempView('customers')
spark.sql('''
    SELECT
        Seqno,
        convertUDF(name) Name_in_title,
        to_upper(name) Name_in_uppercase
    FROM
        customers
    WHERE
        name IS NOT null AND name like '%joh%'
''').show()

+-----+-------------+-----------------+
|Seqno|Name_in_title|Name_in_uppercase|
+-----+-------------+-----------------+
|    1|   John jones|       JOHN JONES|
+-----+-------------+-----------------+



In [9]:
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ('4',None)]

df2 = spark.createDataFrame(data=data,schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE2")
spark.sql('''
    SELECT
        Seqno,
        convertUDF(name) Name_in_title,
        to_upper(name) Name_in_uppercase
    FROM
        NAME_TABLE2
    WHERE
        name IS NOT null AND name like '%joh%'.
''').show() # gives error not handling null in udf

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
|4    |null        |
+-----+------------+



ParseException: "\nmismatched input 'FROM' expecting <EOF>(line 6, pos 4)\n\n== SQL ==\n\n    SELECT\n        Seqno,\n        convertUDF(name) Name_in_title,\n        to_upper(name) Name_in_uppercase\n    FROM\n----^^^\n        NAME_TABLE2\n    WHERE\n        name IS NOT null AND name like '%joh%'.\n"

In [1]:
# handling null in udf
convert_case = lambda str:  'null' if str is None else str[0].upper() + str[1:] 
spark.udf.register('convertUDF', convert_case)
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ('4',None)]

df2 = spark.createDataFrame(data=data,schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE3")
spark.sql('''
    SELECT
        Seqno,
        convertUDF(name) Name_in_title
    FROM
        NAME_TABLE3
    WHERE
        name IS NOT null AND convertUDF(name) like '%Joh%'
''').show() # gives error not handling null in udf

NameError: name 'spark' is not defined

In [28]:
data = ["Project","Gutenberg’s","Alice’s","Adventures",
"in","Wonderland","Project","Gutenberg’s","Adventures",
"in","Wonderland","Project","Gutenberg’s"]

rdd=spark.sparkContext.parallelize(data)

In [31]:
rdd = rdd.map(lambda x: x.upper())
rdd = rdd.map(lambda x: (x, 1))
rdd.collect()

[('PROJECT', 1),
 ('GUTENBERG’S', 1),
 ('ALICE’S', 1),
 ('ADVENTURES', 1),
 ('IN', 1),
 ('WONDERLAND', 1),
 ('PROJECT', 1),
 ('GUTENBERG’S', 1),
 ('ADVENTURES', 1),
 ('IN', 1),
 ('WONDERLAND', 1),
 ('PROJECT', 1),
 ('GUTENBERG’S', 1)]

In [32]:
data = [('James','Smith','M',30),
  ('Anna','Rose','F',41),
  ('Robert','Williams','M',62), 
]

columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [37]:
rdd = df.rdd.map(
    lambda x: (f'{x[0].upper()} {x[1]}', 'Male' if x[2] == 'M' else 'Female', x[3] * 2)
)
rdd.toDF(["name","gender","new_salary"]).show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    JAMES Smith|  Male|        60|
|      ANNA Rose|Female|        82|
|ROBERT Williams|  Male|       124|
+---------------+------+----------+



In [39]:
rdd = df.rdd.map(
    lambda x: (f'{x["firstname"].upper()} {x["lastname"]}', 'Male' if x["gender"] == 'M' else 'Female', x["salary"] * 2)
)
rdd.toDF(["name","gender","new_salary"]).show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    JAMES Smith|  Male|        60|
|      ANNA Rose|Female|        82|
|ROBERT Williams|  Male|       124|
+---------------+------+----------+



In [41]:
def custom_func(x):
    name = x['firstname'] + ' ' + x['lastname']
    gender = 'Male' if x["gender"] == 'M' else 'Female'
    salary = x["salary"] * 2
    return (name, gender, salary)
rdd = df.rdd.map(lambda x: custom_func(x))
rdd.toDF(["name","gender","new_salary"]).show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James Smith|  Male|        60|
|      Anna Rose|Female|        82|
|Robert Williams|  Male|       124|
+---------------+------+----------+



In [42]:
data = ["Project Gutenberg’s",
        "Alice’s Adventures in Wonderland",
        "Project Gutenberg’s",
        "Adventures in Wonderland",
        "Project Gutenberg’s"]
rdd=spark.sparkContext.parallelize(data)
rdd.collect()

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'Project Gutenberg’s',
 'Adventures in Wonderland',
 'Project Gutenberg’s']

In [48]:
print(rdd.flatMap(lambda x: x.lower()).collect()[:10])
rdd.flatMap(lambda x: x.split(' ')).collect()

['p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u']


['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s']

In [49]:
arrayData = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})]
df = spark.createDataFrame(data=arrayData, schema = ['name','knownLanguages','properties'])
df.show()

+----------+--------------+--------------------+
|      name|knownLanguages|          properties|
+----------+--------------+--------------------+
|     James| [Java, Scala]|[eye -> brown, ha...|
|   Michael|[Spark, Java,]|[eye ->, hair -> ...|
|    Robert|    [CSharp, ]|[eye -> , hair ->...|
|Washington|          null|                null|
| Jefferson|        [1, 2]|                  []|
+----------+--------------+--------------------+



In [52]:
df.select('name', explode(df.knownLanguages).alias('programming_languages')).show()

+---------+---------------------+
|     name|programming_languages|
+---------+---------------------+
|    James|                 Java|
|    James|                Scala|
|  Michael|                Spark|
|  Michael|                 Java|
|  Michael|                 null|
|   Robert|               CSharp|
|   Robert|                     |
|Jefferson|                    1|
|Jefferson|                    2|
+---------+---------------------+



In [54]:
# Prepare Data
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

# Create DataFrame
df = spark.createDataFrame(data=data,schema=columns)
df.show()

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



In [3]:
# Prepare Data
columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

# Create DataFrame
df = spark.createDataFrame(data=data,schema=columns)
df.show()

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



In [8]:
def get_seq_no(df):
    print(df.Seqno)
df.foreach(get_seq_no)

In [10]:
accum = spark.sparkContext.accumulator(0)
df.foreach(lambda df: accum.add(int(df.Seqno)))
accum.value

6

In [12]:
accum = spark.sparkContext.accumulator(0)
rdd = spark.sparkContext.parallelize(list(range(1, 4)))
rdd.foreach(lambda r: accum.add(r))
accum.value

6

In [17]:
df = spark.range(1, 101)
df.sample(fraction=0.06).collect() # this will not contain duplicates
df.sample(fraction=0.06, seed=123).collect() # return the same sample every time

[Row(id=35), Row(id=47), Row(id=69), Row(id=87), Row(id=89)]

In [23]:
df.sample(withReplacement=True, fraction=0.06).collect()

[Row(id=23), Row(id=24), Row(id=25), Row(id=32), Row(id=58)]

In [5]:
df = spark.read.csv('./zip_codes.csv', inferSchema=True, header=True)
df.printSchema()
df.show(truncate=False)

root
 |-- id: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- population: integer (nullable = true)

+---+-------+--------+-------------------+-----+----------+
|id |zipcode|type    |city               |state|population|
+---+-------+--------+-------------------+-----+----------+
|1  |704    |STANDARD|null               |PR   |30100     |
|2  |704    |null    |PASEO COSTA DEL SUR|PR   |null      |
|3  |709    |null    |BDA SAN LUIS       |PR   |3700      |
|4  |76166  |UNIQUE  |CINGULAR WIRELESS  |TX   |84000     |
|5  |76177  |STANDARD|null               |TX   |null      |
+---+-------+--------+-------------------+-----+----------+



In [11]:
df.na.fill(value=0).show()
df.na.fill(value=0, subset=['population']).show()
df.na.fill({'type': 'unknown', 'population': 0, 'city': '--'}).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|         0|
+---+-------+--------+-------------------+-----+----------+

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               nu

In [12]:
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [16]:
df.groupBy('Product').pivot('Country').agg(sum('Amount').alias('Total Amount')).fillna(0).show()

+-------+------+-----+------+----+
|Product|Canada|China|Mexico| USA|
+-------+------+-----+------+----+
| Orange|     0| 4000|     0|4000|
|  Beans|     0| 1500|  2000|1600|
| Banana|  2000|  400|     0|1000|
|Carrots|  2000| 1200|     0|1500|
+-------+------+-----+------+----+



In [17]:
# because of performance issue, you can also provide column values as an second argument
columns = ['Canada', 'USA']
df.groupBy('Product').pivot('Country', columns).agg(sum('Amount').alias('Total Amount')).fillna(0).show()

+-------+------+----+
|Product|Canada| USA|
+-------+------+----+
| Orange|     0|4000|
|  Beans|     0|1600|
| Banana|  2000|1000|
|Carrots|  2000|1500|
+-------+------+----+



In [31]:
# your code here
# to improve more performance, do it in 2 phases
df.groupBy('Product', 'Country').sum('Amount').groupBy('Product').pivot(
    'Country').sum('sum(Amount)').fillna(0).show()

+-------+------+-----+------+----+
|Product|Canada|China|Mexico| USA|
+-------+------+-----+------+----+
| Orange|     0| 4000|     0|4000|
|  Beans|     0| 1500|  2000|1600|
| Banana|  2000|  400|     0|1000|
|Carrots|  2000| 1200|     0|1500|
+-------+------+-----+------+----+



In [36]:
df_pivot = df.groupBy('Product', 'Country').sum('Amount').groupBy(
    'Product').pivot('Country').sum('sum(Amount)')
df_pivot.select(
    'Product',
    expr(
        "stack(4, 'Canada', Canada, 'China', China, 'Mexico', Mexico, 'USA', USA) AS (Country, Total)" 
    )).where('Total IS NOT NULL').show()

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
| Orange|  China| 4000|
| Orange|    USA| 4000|
|  Beans|  China| 1500|
|  Beans| Mexico| 2000|
|  Beans|    USA| 1600|
| Banana| Canada| 2000|
| Banana|  China|  400|
| Banana|    USA| 1000|
|Carrots| Canada| 2000|
|Carrots|  China| 1200|
|Carrots|    USA| 1500|
+-------+-------+-----+



In [38]:
df = spark.read.csv('./simle_zip_code.csv', inferSchema=True, header=True)
df.printSchema()
df.show(3)

root
 |-- RecordNumber: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
+------------+-------+-------------------+-------+-----+
only showing top 3 rows



In [41]:
df.write.option('header', True).partitionBy('state').mode('overwrite').csv('./tmp/zipcode_state')

In [42]:
df.write.option('header', True).partitionBy('state', 'city').mode('overwrite').csv('./tmp/zipcode_state')

In [47]:
df.repartition(2).write.option('header', True).partitionBy('state').mode('overwrite').csv('./tmp/zipcode_state')

In [None]:
df.write.option('header', True).option('maxRecordsPerFile', 2).partitionBy('state').mode('overwrite').csv('./tmp/zipcode_state')

In [48]:
# read partition data
parDF = spark.read.option('header', True).csv('./tmp/zipcode_state')
parDF.createOrReplaceTempView('zipcode')
spark.sql('''
    SELECT * FROM zipcode
''').show()

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|state|
+------------+-------+-------------------+-------+-----+
|           1|     US|        PARC PARQUE|    704|   PR|
|           3|     US|      SECT LANAUSSE|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|           4|     US|    URB EUGENE RICE|    704|   PR|
|       61392|     US|         FORT WORTH|  76177|   TX|
|       61393|     US|           FT WORTH|  76177|   TX|
|       61391|     US|  CINGULAR WIRELESS|  76166|   TX|
|       54355|     US|        SPRINGVILLE|  35146|   AL|
|       54356|     US|        SPRUCE PINE|  35585|   AL|
|       49345|     US|           HILLIARD|  32046|   FL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
|       76513|     US|           ASHEBORO|  27204|   NC|
|       76512|     US|           ASHEBORO|  27203|   NC|
|       49347|     US|               HOLT|  32564|   FL|
|       49346|     US|         

In [56]:
simpleData = (("James", "Sales", 3000), \
    ("Michael", "Sales", 4600),  \
    ("Robert", "Sales", 4100),   \
    ("Maria", "Finance", 3000),  \
    ("James", "Sales", 3000),    \
    ("Scott", "Finance", 3300),  \
    ("Jen", "Finance", 3900),    \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000),\
    ("Saif", "Sales", 4100) \
  )
 
columns= ["employee_name", "department", "salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [57]:
WindowSpec = Window.partitionBy('department').orderBy('salary')
df.withColumn('rn', row_number().over(WindowSpec)).show(truncate=False)

+-------------+----------+------+---+
|employee_name|department|salary|rn |
+-------------+----------+------+---+
|James        |Sales     |3000  |1  |
|James        |Sales     |3000  |2  |
|Robert       |Sales     |4100  |3  |
|Saif         |Sales     |4100  |4  |
|Michael      |Sales     |4600  |5  |
|Maria        |Finance   |3000  |1  |
|Scott        |Finance   |3300  |2  |
|Jen          |Finance   |3900  |3  |
|Kumar        |Marketing |2000  |1  |
|Jeff         |Marketing |3000  |2  |
+-------------+----------+------+---+



In [59]:
df.withColumn('rn', rank().over(WindowSpec)).show(truncate=False)

+-------------+----------+------+---+
|employee_name|department|salary|rn |
+-------------+----------+------+---+
|James        |Sales     |3000  |1  |
|James        |Sales     |3000  |1  |
|Robert       |Sales     |4100  |3  |
|Saif         |Sales     |4100  |3  |
|Michael      |Sales     |4600  |5  |
|Maria        |Finance   |3000  |1  |
|Scott        |Finance   |3300  |2  |
|Jen          |Finance   |3900  |3  |
|Kumar        |Marketing |2000  |1  |
|Jeff         |Marketing |3000  |2  |
+-------------+----------+------+---+



In [61]:
df.withColumn('rn', dense_rank().over(WindowSpec)).show(truncate=False)

+-------------+----------+------+---+
|employee_name|department|salary|rn |
+-------------+----------+------+---+
|James        |Sales     |3000  |1  |
|James        |Sales     |3000  |1  |
|Robert       |Sales     |4100  |2  |
|Saif         |Sales     |4100  |2  |
|Michael      |Sales     |4600  |3  |
|Maria        |Finance   |3000  |1  |
|Scott        |Finance   |3300  |2  |
|Jen          |Finance   |3900  |3  |
|Kumar        |Marketing |2000  |1  |
|Jeff         |Marketing |3000  |2  |
+-------------+----------+------+---+



In [64]:
WindowSpec = Window.partitionBy('department').orderBy('salary')
df.withColumn('Cummulative Sum', sum('salary').over(WindowSpec)).show(truncate=False)

+-------------+----------+------+---------------+
|employee_name|department|salary|Cummulative Sum|
+-------------+----------+------+---------------+
|James        |Sales     |3000  |6000           |
|James        |Sales     |3000  |6000           |
|Robert       |Sales     |4100  |14200          |
|Saif         |Sales     |4100  |14200          |
|Michael      |Sales     |4600  |18800          |
|Maria        |Finance   |3000  |3000           |
|Scott        |Finance   |3300  |6300           |
|Jen          |Finance   |3900  |10200          |
|Kumar        |Marketing |2000  |2000           |
|Jeff         |Marketing |3000  |5000           |
+-------------+----------+------+---------------+



In [66]:
WindowSpec = Window.partitionBy('department').orderBy('salary')
df.withColumn('Lead', lead('salary', 2).over(WindowSpec)).show(truncate=False)

+-------------+----------+------+----+
|employee_name|department|salary|Lead|
+-------------+----------+------+----+
|James        |Sales     |3000  |4100|
|James        |Sales     |3000  |4100|
|Robert       |Sales     |4100  |4600|
|Saif         |Sales     |4100  |null|
|Michael      |Sales     |4600  |null|
|Maria        |Finance   |3000  |3900|
|Scott        |Finance   |3300  |null|
|Jen          |Finance   |3900  |null|
|Kumar        |Marketing |2000  |null|
|Jeff         |Marketing |3000  |null|
+-------------+----------+------+----+



In [67]:
data=[["1","2020-02-01"],["2","2019-03-01"],["3","2021-03-01"]]
df=spark.createDataFrame(data,["id","input"])
df.show()

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-03-01|
+---+----------+



In [71]:
df.select(current_date().alias('current_date')).limit(1).show()

+------------+
|current_date|
+------------+
|  2023-03-13|
+------------+



In [74]:
df.select(date_format(col('input'), 'MM-dd-yyyy').alias('date_format')).show()

+-----------+
|date_format|
+-----------+
| 02-01-2020|
| 03-01-2019|
| 03-01-2021|
+-----------+



In [79]:
df.printSchema()
df.select(to_date(col('input')).alias('to_date')).printSchema()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

root
 |-- to_date: date (nullable = true)



In [82]:
df.select(datediff(current_date(), col('input')).alias('difference')).show()

+----------+
|difference|
+----------+
|      1136|
|      1473|
|       742|
+----------+



In [83]:
jsonString="""{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""
df=spark.createDataFrame([(1, jsonString)],["id","value"])
df.show(truncate=False)

+---+--------------------------------------------------------------------------+
|id |value                                                                     |
+---+--------------------------------------------------------------------------+
|1  |{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}|
+---+--------------------------------------------------------------------------+



In [91]:
df2 = df.withColumn('value', from_json(df.value, MapType(StringType(), StringType())))
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [93]:
df2.withColumn('to_json', to_json(col('value'))).show()

+---+--------------------+--------------------+
| id|               value|             to_json|
+---+--------------------+--------------------+
|  1|[Zipcode -> 704, ...|{"Zipcode":"704",...|
+---+--------------------+--------------------+



In [98]:
df.select('id', json_tuple('value', 'Zipcode', 'ZipCodeType', 'City',
                           'State')).toDF('id', 'Zipcode', 'ZipCodeType',
                                          'City', 'State').show()

+---+-------+-----------+-----------+-----+
| id|Zipcode|ZipCodeType|       City|State|
+---+-------+-----------+-----------+-----+
|  1|    704|   STANDARD|PARC PARQUE|   PR|
+---+-------+-----------+-----------+-----+



In [100]:
df.select('id', get_json_object(col('value'), '$.City').alias('HahaCity')).show()

+---+-----------+
| id|   HahaCity|
+---+-----------+
|  1|PARC PARQUE|
+---+-----------+

