In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, BooleanType

In [0]:
data = [
    (1, 'mahendra1', 34, 5600, 'Data Science'),
    (2, 'mahi', 35, 62000, 'Data Engineer'),
    (3, 'am1', 29, 5400, 'Data Science'),
    (4, 'aga', 53, 80000, 'Data Engineer'),
    (5, 'agahow1', 25, 4400, 'Data')
]

In [0]:
spark = SparkSession.builder.appName('first').getOrCreate()

In [0]:
df = spark.createDataFrame(data)

In [0]:
df.show(n=10)

+---+--------+---+-----+-------------+
| _1|      _2| _3|   _4|           _5|
+---+--------+---+-----+-------------+
|  1|mahendra| 34|56000| Data Science|
|  2|    mahi| 35|62000|Data Engineer|
|  3|      am| 29|54000| Data Science|
|  4|     aga| 53|80000|Data Engineer|
|  5|  agahow| 25|44000|         Data|
+---+--------+---+-----+-------------+



In [0]:
df.collect()

Out[14]: [Row(_1=1, _2='mahendra', _3=34, _4=56000, _5='Data Science'),
 Row(_1=2, _2='mahi', _3=35, _4=62000, _5='Data Engineer'),
 Row(_1=3, _2='am', _3=29, _4=54000, _5='Data Science'),
 Row(_1=4, _2='aga', _3=53, _4=80000, _5='Data Engineer'),
 Row(_1=5, _2='agahow', _3=25, _4=44000, _5='Data')]

In [0]:
cols = ['empid', 'name', 'age', 'salary', 'department']

In [0]:
df5 = spark.createDataFrame(data,cols)

In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df2.filter((col('name') == 'mahi') | (col('salary')>60000) & (col('department') == 'Data Engineer')).show()

+-----+----+---+------+-------------+
|empid|name|age|salary|   department|
+-----+----+---+------+-------------+
|    2|mahi| 35| 62000|Data Engineer|
|    4| aga| 53| 80000|Data Engineer|
+-----+----+---+------+-------------+



In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df2.orderBy('name').show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
|    3|      am| 29| 54000| Data Science|
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
+-----+--------+---+------+-------------+



In [0]:
df2.orderBy(col('department').asc(), col('age').desc()).show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    5|  agahow| 25| 44000|         Data|
|    4|     aga| 53| 80000|Data Engineer|
|    2|    mahi| 35| 62000|Data Engineer|
|    1|mahendra| 34| 56000| Data Science|
|    3|      am| 29| 54000| Data Science|
+-----+--------+---+------+-------------+



In [0]:
df2.drop(col('department')).show()

+-----+--------+---+------+
|empid|    name|age|salary|
+-----+--------+---+------+
|    1|mahendra| 34| 56000|
|    2|    mahi| 35| 62000|
|    3|      am| 29| 54000|
|    4|     aga| 53| 80000|
|    5|  agahow| 25| 44000|
+-----+--------+---+------+



In [0]:
df2

Out[37]: DataFrame[empid: bigint, name: string, age: bigint, salary: bigint, department: string]

In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df3 = df2.join(df, df2.empid == df._1).select(df._1.alias('employeeid'),
                                        df._2.alias('employeename'),
                                        df._3.alias('age'),
                                        df._4.alias('salary'),
                                        df._5.alias('department'))

In [0]:
df3.show()

+----------+------------+---+------+-------------+
|employeeid|employeename|age|salary|   department|
+----------+------------+---+------+-------------+
|         1|    mahendra| 34| 56000| Data Science|
|         2|        mahi| 35| 62000|Data Engineer|
|         3|          am| 29| 54000| Data Science|
|         4|         aga| 53| 80000|Data Engineer|
|         5|      agahow| 25| 44000|         Data|
+----------+------------+---+------+-------------+



In [0]:
df4 = df2.unionAll(df3)

In [0]:
df4.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df4.dropDuplicates().show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df4.drop_duplicates().show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
df4.dropDuplicates(['name', 'age']).show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
|    3|      am| 29| 54000| Data Science|
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
+-----+--------+---+------+-------------+



In [0]:
df5.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6 = df2.unionAll(df5)

In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.dropDuplicates().show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.drop_duplicates().show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.drop_duplicates(['age', 'name', 'salary']).show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    5|   agahow| 25| 44000|         Data|
|    5|  agahow1| 25|  4400|         Data|
|    3|       am| 29| 54000| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    1| mahendra| 34| 56000| Data Science|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
+-----+---------+---+------+-------------+



In [0]:
df5.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.where(col('name')=='mahendra1').show()

+-----+---------+---+------+------------+
|empid|     name|age|salary|  department|
+-----+---------+---+------+------------+
|    1|mahendra1| 34|  5600|Data Science|
+-----+---------+---+------+------------+



In [0]:
df6.groupBy('department').agg(sum('salary').alias('total_salary')).show()

+-------------+------------+
|   department|total_salary|
+-------------+------------+
| Data Science|      121000|
|Data Engineer|      284000|
|         Data|       48400|
+-------------+------------+



In [0]:
df6.where((col('department') == 'Data')| (col('salary') > 50000))\
    .groupBy('department').agg(avg('salary').alias('average_salary')).show()

+-------------+--------------+
|   department|average_salary|
+-------------+--------------+
| Data Science|       55000.0|
|Data Engineer|       71000.0|
|         Data|       24200.0|
+-------------+--------------+



In [0]:
df6.count()

Out[90]: 10

In [0]:
df6.dropna()

Out[91]: DataFrame[empid: bigint, name: string, age: bigint, salary: bigint, department: string]

In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.groupBy('department').count().orderBy('count').show()

+-------------+-----+
|   department|count|
+-------------+-----+
|         Data|    2|
| Data Science|    4|
|Data Engineer|    4|
+-------------+-----+



In [0]:
df7 = df6.groupBy('department').agg(collect_list('name').alias('employees_list'))

In [0]:
df7.show()

+-------------+--------------------+
|   department|      employees_list|
+-------------+--------------------+
| Data Science|[mahendra, am, ma...|
|Data Engineer|[mahi, aga, mahi,...|
|         Data|   [agahow, agahow1]|
+-------------+--------------------+



In [0]:
df7.show(truncate=False)

+-------------+------------------------------+
|department   |employees_list                |
+-------------+------------------------------+
|Data Science |[mahendra, am, mahendra1, am1]|
|Data Engineer|[mahi, aga, mahi, aga]        |
|Data         |[agahow, agahow1]             |
+-------------+------------------------------+



In [0]:
df7.withColumn('name', explode(col('employees_list'))).drop(col('employees_list')).show(truncate=False)

+-------------+---------+
|department   |name     |
+-------------+---------+
|Data Science |mahendra |
|Data Science |am       |
|Data Science |mahendra1|
|Data Science |am1      |
|Data Engineer|mahi     |
|Data Engineer|aga      |
|Data Engineer|mahi     |
|Data Engineer|aga      |
|Data         |agahow   |
|Data         |agahow1  |
+-------------+---------+



In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.write.mode('overwrite').format('csv').save('dbfs:/FileStore/tables/employees_info', header = True)

In [0]:
d = spark.read.csv('dbfs:/FileStore/tables/employees_info', header=True, inferSchema=True)

In [0]:
d.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    2|     mahi| 35| 62000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    5|   agahow| 25| 44000|         Data|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
s= spark.read.format('csv').option('header', True)\
    .load('dbfs:/FileStore/tables/employees_info')

In [0]:
s.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    2|     mahi| 35| 62000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    5|   agahow| 25| 44000|         Data|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
s.write.format('parquet').option('header',True).save('dbfs:/FileStore/tables/employees_info1')

In [0]:
spark.read.parquet('dbfs:/FileStore/tables/employees_info1', header = True).show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    2|     mahi| 35| 62000|Data Engineer|
|    2|     mahi| 35| 62000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    4|      aga| 53| 80000|Data Engineer|
|    1| mahendra| 34| 56000| Data Science|
|    1|mahendra1| 34|  5600| Data Science|
|    3|       am| 29| 54000| Data Science|
|    3|      am1| 29|  5400| Data Science|
|    5|   agahow| 25| 44000|         Data|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df6.groupby(col('department')).agg(min(col('salary')).alias('minimum_salary'),
                                   max(col('salary')).alias('maximum_salary'), 
                                   avg('salary').alias('average_salary'),
                                   count('empid').alias('number_of_employees')).show()

+-------------+--------------+--------------+--------------+-------------------+
|   department|minimum_salary|maximum_salary|average_salary|number_of_employees|
+-------------+--------------+--------------+--------------+-------------------+
| Data Science|          5400|         56000|       30250.0|                  4|
|Data Engineer|         62000|         80000|       71000.0|                  4|
|         Data|          4400|         44000|       24200.0|                  2|
+-------------+--------------+--------------+--------------+-------------------+



In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df8 = df6.withColumn('age_group', when(col('age')<=30, 'Young')\
    .when((col('age')>30) & (col('age')<50), 'middle').otherwise('Old'))

In [0]:
df8.show()



+-----+---------+---+------+-------------+---------+
|empid|     name|age|salary|   department|age_group|
+-----+---------+---+------+-------------+---------+
|    1| mahendra| 34| 56000| Data Science|   middle|
|    2|     mahi| 35| 62000|Data Engineer|   middle|
|    3|       am| 29| 54000| Data Science|    Young|
|    4|      aga| 53| 80000|Data Engineer|      Old|
|    5|   agahow| 25| 44000|         Data|    Young|
|    1|mahendra1| 34|  5600| Data Science|   middle|
|    2|     mahi| 35| 62000|Data Engineer|   middle|
|    3|      am1| 29|  5400| Data Science|    Young|
|    4|      aga| 53| 80000|Data Engineer|      Old|
|    5|  agahow1| 25|  4400|         Data|    Young|
+-----+---------+---+------+-------------+---------+



In [0]:
from pyspark.sql.window import Window

In [0]:
wind_sp = Window.orderBy('employeeid')
df3.withColumn('cumilative_total', sum('salary').over(wind_sp)).show()

+----------+------------+---+------+-------------+----------------+
|employeeid|employeename|age|salary|   department|cumilative_total|
+----------+------------+---+------+-------------+----------------+
|         1|    mahendra| 34| 56000| Data Science|           56000|
|         2|        mahi| 35| 62000|Data Engineer|          118000|
|         3|          am| 29| 54000| Data Science|          172000|
|         4|         aga| 53| 80000|Data Engineer|          252000|
|         5|      agahow| 25| 44000|         Data|          296000|
+----------+------------+---+------+-------------+----------------+



In [0]:
df3.agg(sum(col('salary'))).show()

+-----------+
|sum(salary)|
+-----------+
|     296000|
+-----------+



In [0]:
wind_sp1 = Window.partitionBy('department').orderBy('employeeid')
df3.withColumn('dept_total',sum(col('salary')).over(wind_sp1)).show()

+----------+------------+---+------+-------------+----------+
|employeeid|employeename|age|salary|   department|dept_total|
+----------+------------+---+------+-------------+----------+
|         5|      agahow| 25| 44000|         Data|     44000|
|         2|        mahi| 35| 62000|Data Engineer|     62000|
|         4|         aga| 53| 80000|Data Engineer|    142000|
|         1|    mahendra| 34| 56000| Data Science|     56000|
|         3|          am| 29| 54000| Data Science|    110000|
+----------+------------+---+------+-------------+----------+



In [0]:
df6.show()

+-----+---------+---+------+-------------+
|empid|     name|age|salary|   department|
+-----+---------+---+------+-------------+
|    1| mahendra| 34| 56000| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|       am| 29| 54000| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|   agahow| 25| 44000|         Data|
|    1|mahendra1| 34|  5600| Data Science|
|    2|     mahi| 35| 62000|Data Engineer|
|    3|      am1| 29|  5400| Data Science|
|    4|      aga| 53| 80000|Data Engineer|
|    5|  agahow1| 25|  4400|         Data|
+-----+---------+---+------+-------------+



In [0]:
df8.show()

+-----+---------+---+------+-------------+---------+
|empid|     name|age|salary|   department|age_group|
+-----+---------+---+------+-------------+---------+
|    1| mahendra| 34| 56000| Data Science|   middle|
|    2|     mahi| 35| 62000|Data Engineer|   middle|
|    3|       am| 29| 54000| Data Science|    Young|
|    4|      aga| 53| 80000|Data Engineer|      Old|
|    5|   agahow| 25| 44000|         Data|    Young|
|    1|mahendra1| 34|  5600| Data Science|   middle|
|    2|     mahi| 35| 62000|Data Engineer|   middle|
|    3|      am1| 29|  5400| Data Science|    Young|
|    4|      aga| 53| 80000|Data Engineer|      Old|
|    5|  agahow1| 25|  4400|         Data|    Young|
+-----+---------+---+------+-------------+---------+



In [0]:
sec_h = Window.orderBy(col('salary').desc())
df6.withColumn('rnk', dense_rank().over(sec_h)).where(col('rnk')<=2).show(n=10)

+-----+----+---+------+-------------+---+
|empid|name|age|salary|   department|rnk|
+-----+----+---+------+-------------+---+
|    4| aga| 53| 80000|Data Engineer|  1|
|    4| aga| 53| 80000|Data Engineer|  1|
|    2|mahi| 35| 62000|Data Engineer|  2|
|    2|mahi| 35| 62000|Data Engineer|  2|
+-----+----+---+------+-------------+---+



In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
dfy =df2.withColumnRenamed('empid', 'employeeid').withColumnRenamed('name', 
                                                               'employeename')\
                                                                   .withColumnRenamed('age', 'employeeage')\
                                                                       .withColumnRenamed('salary', 'employee_salary')\
                                                                           .withColumnRenamed('department', 'department_name')

In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
dfx = df2.withColumn('mgrid', when(col('department')=='Data Engineer', 2)\
    .when(col('department')=='Data Science', 1).otherwise(3))

In [0]:
dfx.show()

+-----+--------+---+------+-------------+-----+
|empid|    name|age|salary|   department|mgrid|
+-----+--------+---+------+-------------+-----+
|    1|mahendra| 34| 56000| Data Science|    1|
|    2|    mahi| 35| 62000|Data Engineer|    2|
|    3|      am| 29| 54000| Data Science|    1|
|    4|     aga| 53| 80000|Data Engineer|    2|
|    5|  agahow| 25| 44000|         Data|    3|
+-----+--------+---+------+-------------+-----+



In [0]:
dfx.show()

+-----+--------+---+------+-------------+-----+
|empid|    name|age|salary|   department|mgrid|
+-----+--------+---+------+-------------+-----+
|    1|mahendra| 34| 56000| Data Science|    1|
|    2|    mahi| 35| 62000|Data Engineer|    2|
|    3|      am| 29| 54000| Data Science|    1|
|    4|     aga| 53| 80000|Data Engineer|    2|
|    5|  agahow| 25| 44000|         Data|    3|
+-----+--------+---+------+-------------+-----+



In [0]:
df2.show()

+-----+--------+---+------+-------------+
|empid|    name|age|salary|   department|
+-----+--------+---+------+-------------+
|    1|mahendra| 34| 56000| Data Science|
|    2|    mahi| 35| 62000|Data Engineer|
|    3|      am| 29| 54000| Data Science|
|    4|     aga| 53| 80000|Data Engineer|
|    5|  agahow| 25| 44000|         Data|
+-----+--------+---+------+-------------+



In [0]:
dfx.join(df2, dfx.mgrid == df2.empid)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2755170561989769>:1[0m
[0;32m----> 1[0m [43mdfx[49m[38;5;241;43m.[39;49m[43mjoin[49m[43m([49m[43mdf2[49m[43m,[49m[43m [49m[43mdfx[49m[38;5;241;43m.[39;49m[43mmgrid[49m[43m [49m[38;5;241;43m==[39;49m[43m [49m[43mdf2[49m[38;5;241;43m.[39;49m[43mempid[49m[43m)[49m[38;5;241m.[39mselect(dfx[38;5;241m.[39mempid)[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;2

In [0]:
dfx.show()

+-----+--------+---+------+-------------+-----+
|empid|    name|age|salary|   department|mgrid|
+-----+--------+---+------+-------------+-----+
|    1|mahendra| 34| 56000| Data Science|    1|
|    2|    mahi| 35| 62000|Data Engineer|    2|
|    3|      am| 29| 54000| Data Science|    1|
|    4|     aga| 53| 80000|Data Engineer|    2|
|    5|  agahow| 25| 44000|         Data|    3|
+-----+--------+---+------+-------------+-----+



In [0]:
a = dfx.alias('a')
b = dfx.alias('b')
a.join(b, a.mgrid == b.empid).select(a['*'], b['name'].alias('manager_name')).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2755170561989771>:3[0m
[1;32m      1[0m a [38;5;241m=[39m dfx[38;5;241m.[39malias([38;5;124m'[39m[38;5;124ma[39m[38;5;124m'[39m)
[1;32m      2[0m b [38;5;241m=[39m dfx[38;5;241m.[39malias([38;5;124m'[39m[38;5;124mb[39m[38;5;124m'[39m)
[0;32m----> 3[0m a[38;5;241m.[39mjoin(b, a[38;5;241m.[39mmgrid [38;5;241m==[39m b[38;5;241m.[39mempid)[38;5;241m.[39mselect(a[[38;5;124m'[39m[38;5;124m*[39m[38;5;124m'[39m], b[[38;5;124m'[39m[38;5;124mname[39m[38;5;124m'[39m][38;5;241m.[39malias([38;5;124m'[39m[38;5;124mmanager_name[39m[38;5;124m'[39m))[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2755170561989772>:4[0m
[1;32m      1[0m a [38;5;241m=[39m dfx[38;5;241m.[39malias([38;5;124m'[39m[38;5;124ma[39m[38;5;124m'[39m)
[1;32m      2[0m b [38;5;241m=[39m dfx[38;5;241m.[39malias([38;5;124m'[39m[38;5;124mb[39m[38;5;124m'[39m)
[0;32m----> 4[0m result [38;5;241m=[39m a[38;5;241m.[39mjoin(b, a[38;5;241m.[39mmgrid [38;5;241m==[39m b[38;5;241m.[39mempid) \
[1;32m      5[0m           [38;5;241m.[39mselect([38;5;241m*[39m[a[col] [38;5;28;01mfor[39;00m col [38;5;129;01min[39;00m dfx[38;5;241m.[39mcolumns], b[[38;5;124m'[39m[38;5;124mname[39m[38;5;124m'[39m][38;5;241m.[39malias([38;5;124m'[39m[38;5;124mmanager_name[39m[38;5;124m'[39m))
[1;32m      7[0m result[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/ins