In [0]:
data1 = [
    (1, 'hari', ['Data', 50000], 45), 
    (2, 'gani', ['Software', 60000], 34),
    (3, 'das', ['Data', 43000], 29),
    (4, 'bosh', ['Finance', 35000], 56),
    (5, 'teju', ['Software', 55000], 45)
]
sch1 = ['id', 'name', 'department_salary', 'age']

In [0]:
data1_df = spark.createDataFrame(data1, schema=sch1)
data1_df.show()

+---+----+-----------------+---+
| id|name|department_salary|age|
+---+----+-----------------+---+
|  1|hari|    [Data, 50000]| 45|
|  2|gani|[Software, 60000]| 34|
|  3| das|    [Data, 43000]| 29|
|  4|bosh| [Finance, 35000]| 56|
|  5|teju|[Software, 55000]| 45|
+---+----+-----------------+---+



In [0]:
from pyspark.sql.functions import *
data1_df = data1_df.withColumn('mgr_id', when(col('age')>50, 2)\
                        .when(col('age')<30, 1)\
                        .otherwise(3))

In [0]:
data1_df.show()

+---+----+-----------------+---+------+
| id|name|department_salary|age|mgr_id|
+---+----+-----------------+---+------+
|  1|hari|    [Data, 50000]| 45|     3|
|  2|gani|[Software, 60000]| 34|     3|
|  3| das|    [Data, 43000]| 29|     1|
|  4|bosh| [Finance, 35000]| 56|     2|
|  5|teju|[Software, 55000]| 45|     3|
+---+----+-----------------+---+------+



In [0]:
data1_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- department_salary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: long (nullable = true)
 |-- mgr_id: integer (nullable = false)



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
data1_df=data1_df.withColumn('salary', col=(col('department_salary')[1]*1.1).cast(IntegerType()))\
    .withColumn('department', col=col('department_salary')[0])
data1_df.show()

+---+----+-----------------+---+------+------+----------+
| id|name|department_salary|age|mgr_id|salary|department|
+---+----+-----------------+---+------+------+----------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|
+---+----+-----------------+---+------+------+----------+



In [0]:
data1_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- department_salary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: long (nullable = true)
 |-- mgr_id: integer (nullable = false)
 |-- salary: integer (nullable = true)
 |-- department: string (nullable = true)



In [0]:
data1_df.withColumn('salary_dept', col= array(['salary', 'department'])).show()

+---+----+-----------------+---+------+------+----------+-----------------+
| id|name|department_salary|age|mgr_id|salary|department|      salary_dept|
+---+----+-----------------+---+------+------+----------+-----------------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|    [55000, Data]|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|[66000, Software]|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|    [47300, Data]|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance| [38500, Finance]|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|[60500, Software]|
+---+----+-----------------+---+------+------+----------+-----------------+



In [0]:
data1_df.select('id', 'name', explode('department_salary').alias('dept&salary')).show()

+---+----+-----------+
| id|name|dept&salary|
+---+----+-----------+
|  1|hari|       Data|
|  1|hari|      50000|
|  2|gani|   Software|
|  2|gani|      60000|
|  3| das|       Data|
|  3| das|      43000|
|  4|bosh|    Finance|
|  4|bosh|      35000|
|  5|teju|   Software|
|  5|teju|      55000|
+---+----+-----------+



In [0]:
data1_df.select('id', 'name', 'salary', 'department')\
    .where(col('id')>4).show()

+---+----+------+----------+
| id|name|salary|department|
+---+----+------+----------+
|  5|teju| 60500|  Software|
+---+----+------+----------+



In [0]:
data1_df.select('id', 'name', 'salary')\
    .where((col('salary')>50000) & (col('department')=='Data')).show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|hari| 55000|
+---+----+------+



In [0]:
data1_df.select('id', 'name', 'salary')\
    .where((col('salary')>50000) | (col('department')=='Data')).show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|hari| 55000|
|  2|gani| 66000|
|  3| das| 47300|
|  5|teju| 60500|
+---+----+------+



In [0]:
data1_df.groupBy('department').count().show()

+----------+-----+
|department|count|
+----------+-----+
|      Data|    2|
|  Software|    2|
|   Finance|    1|
+----------+-----+



In [0]:
data1_df.groupBy('department').sum('salary').show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|      Data|     102300|
|  Software|     126500|
|   Finance|      38500|
+----------+-----------+



In [0]:
data1_df.groupBy(['department', 'name']).sum('salary').show()

+----------+----+-----------+
|department|name|sum(salary)|
+----------+----+-----------+
|      Data|hari|      55000|
|  Software|gani|      66000|
|      Data| das|      47300|
|   Finance|bosh|      38500|
|  Software|teju|      60500|
+----------+----+-----------+



In [0]:
data1_df.groupBy('department')\
    .agg(count('id').alias('number_of_emps'), 
         sum('salary').alias('total_salary'),
         avg('salary').alias('avg_salary')).show()

+----------+--------------+------------+----------+
|department|number_of_emps|total_salary|avg_salary|
+----------+--------------+------------+----------+
|      Data|             2|      102300|   51150.0|
|  Software|             2|      126500|   63250.0|
|   Finance|             1|       38500|   38500.0|
+----------+--------------+------------+----------+



In [0]:
data1_df.show()

+---+----+-----------------+---+------+------+----------+
| id|name|department_salary|age|mgr_id|salary|department|
+---+----+-----------------+---+------+------+----------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|
+---+----+-----------------+---+------+------+----------+



In [0]:
df1 = data1_df.alias("emp")
df2 = data1_df.alias("mgr")

result = df1.join(df2, col("emp.mgr_id") == col("mgr.id")).select('emp.id', 'emp.name', 'emp.salary', 'emp.department','emp.mgr_id', "mgr.salary")
result.show()

+---+----+------+----------+------+------+
| id|name|salary|department|mgr_id|salary|
+---+----+------+----------+------+------+
|  3| das| 47300|      Data|     1| 55000|
|  4|bosh| 38500|   Finance|     2| 66000|
|  1|hari| 55000|      Data|     3| 47300|
|  2|gani| 66000|  Software|     3| 47300|
|  5|teju| 60500|  Software|     3| 47300|
+---+----+------+----------+------+------+



In [0]:
result.orderBy(col('id')).show()

+---+----+------+----------+------+------+
| id|name|salary|department|mgr_id|salary|
+---+----+------+----------+------+------+
|  1|hari| 55000|      Data|     3| 47300|
|  2|gani| 66000|  Software|     3| 47300|
|  3| das| 47300|      Data|     1| 55000|
|  4|bosh| 38500|   Finance|     2| 66000|
|  5|teju| 60500|  Software|     3| 47300|
+---+----+------+----------+------+------+



In [0]:
result.orderBy(col('id').desc()).show()

+---+----+------+----------+------+------+
| id|name|salary|department|mgr_id|salary|
+---+----+------+----------+------+------+
|  5|teju| 60500|  Software|     3| 47300|
|  4|bosh| 38500|   Finance|     2| 66000|
|  3| das| 47300|      Data|     1| 55000|
|  2|gani| 66000|  Software|     3| 47300|
|  1|hari| 55000|      Data|     3| 47300|
+---+----+------+----------+------+------+



In [0]:
result.orderBy(col('id').desc(), col('name').asc()).show()

+---+----+------+----------+------+------+
| id|name|salary|department|mgr_id|salary|
+---+----+------+----------+------+------+
|  5|teju| 60500|  Software|     3| 47300|
|  4|bosh| 38500|   Finance|     2| 66000|
|  3| das| 47300|      Data|     1| 55000|
|  2|gani| 66000|  Software|     3| 47300|
|  1|hari| 55000|      Data|     3| 47300|
+---+----+------+----------+------+------+



In [0]:
data2 = [
    (2, 'maha', {'dept':'Data', 'salary':45000}),
    (3, 'hara', {'dept':'DataScience', 'salary': 55000}),
    (4, 'guru', {'dept': 'Data', 'salary': 34000})
]
sch2 = ['id', 'name', 'dept_salary']

In [0]:
data2_df = spark.createDataFrame(data2, schema=sch2)

In [0]:
data2_df.show(truncate=False)

+---+----+--------------------------------------+
|id |name|dept_salary                           |
+---+----+--------------------------------------+
|2  |maha|{salary -> 45000, dept -> Data}       |
|3  |hara|{salary -> 55000, dept -> DataScience}|
|4  |guru|{salary -> 34000, dept -> Data}       |
+---+----+--------------------------------------+



In [0]:
data2_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dept_salary: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
data2_df.withColumn('salary', col=col('dept_salary')['salary'])\
    .withColumn('department', col=col('dept_salary')['dept']).show()

+---+----+--------------------+------+-----------+
| id|name|         dept_salary|salary| department|
+---+----+--------------------+------+-----------+
|  2|maha|{salary -> 45000,...| 45000|       Data|
|  3|hara|{salary -> 55000,...| 55000|DataScience|
|  4|guru|{salary -> 34000,...| 34000|       Data|
+---+----+--------------------+------+-----------+



In [0]:
data = [
    (1, 'mahendra', ['data, 45000']),
    (2, 'mahi', ['analytics, 50000']),
    (3, 'hari',['data, 56000'])
]
sch = ['id', 'name', 'department_salary']

In [0]:
spark

In [0]:
data_df = spark.createDataFrame(data, schema= sch)

In [0]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- department_salary: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
data_df.show()

+---+--------+------------------+
| id|    name| department_salary|
+---+--------+------------------+
|  1|mahendra|     [data, 45000]|
|  2|    mahi|[analytics, 50000]|
|  3|    hari|     [data, 56000]|
+---+--------+------------------+



In [0]:
from pyspark.sql.functions import split, col, explode
from pyspark.sql.types import *
data_df = data_df.withColumn('department', split(col('department_salary')[0], ',')[0])\
    .withColumn('salary', col= (((split(col('department_salary')[0], ',')[1])*1.2).cast(IntegerType())))
data_df.show()

+---+--------+------------------+----------+------+
| id|    name| department_salary|department|salary|
+---+--------+------------------+----------+------+
|  1|mahendra|     [data, 45000]|      data| 54000|
|  2|    mahi|[analytics, 50000]| analytics| 60000|
|  3|    hari|     [data, 56000]|      data| 67200|
+---+--------+------------------+----------+------+



In [0]:
data_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- department_salary: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:
data_df.select('id', 'name', 'department', 'salary').show()

+---+--------+----------+------+
| id|    name|department|salary|
+---+--------+----------+------+
|  1|mahendra|      data| 54000|
|  2|    mahi| analytics| 60000|
|  3|    hari|      data| 67200|
+---+--------+----------+------+



In [0]:
data1_df.show()

+---+----+-----------------+---+------+------+----------+
| id|name|department_salary|age|mgr_id|salary|department|
+---+----+-----------------+---+------+------+----------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|
+---+----+-----------------+---+------+------+----------+



In [0]:
from pyspark.sql.window import Window

In [0]:
from pyspark.sql.functions import row_number

In [0]:
a = Window.orderBy(col('salary').desc())
b = Window.partitionBy(col('department')).orderBy(col('salary').desc())
data1_df.withColumn('rnk', rank().over(a))\
    .withColumn('rn', row_number().over(a))\
        .withColumn('drnk', dense_rank().over(a)).show()

+---+----+-----------------+---+------+------+----------+---+---+----+
| id|name|department_salary|age|mgr_id|salary|department|rnk| rn|drnk|
+---+----+-----------------+---+------+------+----------+---+---+----+
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|  1|  1|   1|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|  2|  2|   2|
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|  3|  3|   3|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|  4|  4|   4|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|  5|  5|   5|
+---+----+-----------------+---+------+------+----------+---+---+----+



In [0]:
data1_df.withColumn('dept_wise_rnk', rank().over(b))\
    .withColumn('dept_wise_drnk', dense_rank().over(b))\
        .withColumn('dept_wise_rn', row_number().over(b)).show()

+---+----+-----------------+---+------+------+----------+-------------+--------------+------------+
| id|name|department_salary|age|mgr_id|salary|department|dept_wise_rnk|dept_wise_drnk|dept_wise_rn|
+---+----+-----------------+---+------+------+----------+-------------+--------------+------------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|            1|             1|           1|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|            2|             2|           2|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|            1|             1|           1|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|            1|             1|           1|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|            2|             2|           2|
+---+----+-----------------+---+------+------+----------+-------------+--------------+------------+



In [0]:
data1_df.withColumn('prev_emp_salary', lag(col('salary')).over(Window.orderBy(col('id'))))\
    .withColumn('next_emp_salary', lead(col('salary')).over(Window.orderBy(col('id')))).show()

+---+----+-----------------+---+------+------+----------+---------------+---------------+
| id|name|department_salary|age|mgr_id|salary|department|prev_emp_salary|next_emp_salary|
+---+----+-----------------+---+------+------+----------+---------------+---------------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|           null|          66000|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|          55000|          47300|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|          66000|          38500|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|          47300|          60500|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|          38500|           null|
+---+----+-----------------+---+------+------+----------+---------------+---------------+



In [0]:
data1_df.withColumn('prev_emp_salary', lag(col('salary')).over(Window.partitionBy(col('department')).orderBy(col('id'))))\
    .withColumn('next_emp_salary', lead(col('salary')).over(Window.partitionBy(col('department')).orderBy(col('id')))).show()

+---+----+-----------------+---+------+------+----------+---------------+---------------+
| id|name|department_salary|age|mgr_id|salary|department|prev_emp_salary|next_emp_salary|
+---+----+-----------------+---+------+------+----------+---------------+---------------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|           null|          47300|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|          55000|           null|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|           null|           null|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|           null|          60500|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|          66000|           null|
+---+----+-----------------+---+------+------+----------+---------------+---------------+



In [0]:
data1_df.show()

+---+----+-----------------+---+------+------+----------+
| id|name|department_salary|age|mgr_id|salary|department|
+---+----+-----------------+---+------+------+----------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|
+---+----+-----------------+---+------+------+----------+



In [0]:
from pyspark.sql.functions import avg, sum, count, min, max

In [0]:
x = Window.orderBy(col('id'))
data1_df.withColumn('rolling_sum', sum('salary').over(x))\
    .withColumn('rollingavg', avg('salary').over(x)).show()

+---+----+-----------------+---+------+------+----------+-----------+----------+
| id|name|department_salary|age|mgr_id|salary|department|rolling_sum|rollingavg|
+---+----+-----------------+---+------+------+----------+-----------+----------+
|  1|hari|    [Data, 50000]| 45|     3| 55000|      Data|      55000|   55000.0|
|  2|gani|[Software, 60000]| 34|     3| 66000|  Software|     121000|   60500.0|
|  3| das|    [Data, 43000]| 29|     1| 47300|      Data|     168300|   56100.0|
|  4|bosh| [Finance, 35000]| 56|     2| 38500|   Finance|     206800|   51700.0|
|  5|teju|[Software, 55000]| 45|     3| 60500|  Software|     267300|   53460.0|
+---+----+-----------------+---+------+------+----------+-----------+----------+



In [0]:
data1_df.select(avg('salary')).show()

+-----------+
|avg(salary)|
+-----------+
|    53460.0|
+-----------+

