In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit

In [2]:
# Obtain the SparkSession used by every cell below. getOrCreate() returns the
# session that is already running, if any, and otherwise starts a new one.
session_builder = SparkSession.builder.appName("Python Spark basic example")
spark = session_builder.getOrCreate()
spark  # last expression of the cell -> shown as the cell output

In [4]:
## Load the CSV file into a Spark DataFrame

# header=True      -> the first row of the file supplies the column names
# inferSchema=True -> column types are inferred from the data instead of
#                     defaulting every column to string
csv_reader = spark.read.options(header=True, inferSchema=True)
df_pyspark = csv_reader.csv('./data/test3.csv')

df_pyspark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [5]:
# Print the DataFrame's schema as a tree: column name, type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [6]:
### Total salary per person: group the rows by name and sum the numeric columns
# Note: this computes a SUM per name, not a maximum.
# 'name' resolves to the `Name` column (Spark column lookup is case-insensitive by default).

df_pyspark.groupBy('name').sum().show()

+---------+-----------+
|     name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [12]:
# Total salary per department (sum() aggregates every numeric column; here that is only `salary`).
df_pyspark.groupBy('Departments').sum().show()

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [13]:
# Number of rows in each department.
df_pyspark.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [14]:
# Grand total of the salary column over the whole DataFrame (no grouping).
# The dict form maps column name -> aggregate function name.
# 'Salary' resolves to the `salary` column (Spark column lookup is case-insensitive by default).
df_pyspark.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [16]:
# Re-display the source DataFrame; the aggregations above did not modify it.
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [29]:
### Adding columns to a DataFrame
# Syntax: withColumn(new_column_name, column_expression). Each call returns a
# NEW DataFrame; df_pyspark itself is left unchanged.
# `concat`, `col` and `lit` are imported from pyspark.sql.functions in the
# notebook's first (imports) cell.

df_updated = (
    df_pyspark
    # Numeric column: salary multiplied by 1.5 (int * float yields a double).
    .withColumn('salary after 2 years', df_pyspark['salary'] * 1.5)
    # String column: literal prefix + value of Name + literal suffix.
    .withColumn(
        'Status After 2 year',
        concat(lit('Senior_Dev_'), col('Name'), lit(' is promoted !!')),
    )
)

# Illustrative only (not executed): the same pattern applied to a Unix
# timestamp column, assuming `from pyspark.sql import functions as F`:
#   some_df.withColumn("timestamp_column", F.from_unixtime(F.col("timestamp")))

df_updated.show()

+---------+------------+------+--------------------+--------------------+
|     Name| Departments|salary|salary after 2 years| Status After 2 year|
+---------+------------+------+--------------------+--------------------+
|    Krish|Data Science| 10000|             15000.0|Senior_Dev_Krish ...|
|    Krish|         IOT|  5000|              7500.0|Senior_Dev_Krish ...|
|   Mahesh|    Big Data|  4000|              6000.0|Senior_Dev_Mahesh...|
|    Krish|    Big Data|  4000|              6000.0|Senior_Dev_Krish ...|
|   Mahesh|Data Science|  3000|              4500.0|Senior_Dev_Mahesh...|
|Sudhanshu|Data Science| 20000|             30000.0|Senior_Dev_Sudhan...|
|Sudhanshu|         IOT| 10000|             15000.0|Senior_Dev_Sudhan...|
|Sudhanshu|    Big Data|  5000|              7500.0|Senior_Dev_Sudhan...|
|    Sunny|Data Science| 10000|             15000.0|Senior_Dev_Sunny ...|
|    Sunny|    Big Data|  2000|              3000.0|Senior_Dev_Sunny ...|
+---------+------------+------+--------------------+--------------------+

In [30]:
## Drop the projected-salary column, then rename the status column
# Both steps are chained and the result is bound back to df_updated.

df_updated = (
    df_updated
    .drop('salary after 2 years')
    .withColumnRenamed('Status After 2 year', 'Progression status')
)

df_updated.show()

+---------+------------+------+--------------------+
|     Name| Departments|salary|  Progression status|
+---------+------------+------+--------------------+
|    Krish|Data Science| 10000|Senior_Dev_Krish ...|
|    Krish|         IOT|  5000|Senior_Dev_Krish ...|
|   Mahesh|    Big Data|  4000|Senior_Dev_Mahesh...|
|    Krish|    Big Data|  4000|Senior_Dev_Krish ...|
|   Mahesh|Data Science|  3000|Senior_Dev_Mahesh...|
|Sudhanshu|Data Science| 20000|Senior_Dev_Sudhan...|
|Sudhanshu|         IOT| 10000|Senior_Dev_Sudhan...|
|Sudhanshu|    Big Data|  5000|Senior_Dev_Sudhan...|
|    Sunny|Data Science| 10000|Senior_Dev_Sunny ...|
|    Sunny|    Big Data|  2000|Senior_Dev_Sunny ...|
+---------+------------+------+--------------------+



In [31]:
## Select: project a single column
# select() returns a new DataFrame holding only the named column(s).
df_updated.select('Departments').show()


+------------+
| Departments|
+------------+
|Data Science|
|         IOT|
|    Big Data|
|    Big Data|
|Data Science|
|Data Science|
|         IOT|
|    Big Data|
|Data Science|
|    Big Data|
+------------+

