In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook under the app name 'dataframe'.
spark = (
    SparkSession.builder
    .appName('dataframe')
    .getOrCreate()
)

In [6]:
# Load test1.csv: the first row supplies the column names and Spark infers the
# column types from the data.
df_pyspark = spark.read.csv(
    "test1.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Print the DataFrame's rows as a text table.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
# describe() only builds a new summary DataFrame; as a bare expression the cell
# shows its repr (column names and types), not the statistics themselves.
# Chain .show() to print the actual summary rows.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [12]:
# List the column names of the loaded DataFrame.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [17]:
# Filter with a SQL-style condition string: keep rows whose Salary is at least 30000.
(
    df_pyspark
    .filter("Salary>=30000")
    .show()
)

+-----+---+----------+------+
| Name|age|Experience|Salary|
+-----+---+----------+------+
|Krish| 31|        10| 30000|
+-----+---+----------+------+



In [18]:
# Keep rows with Salary >= 20000, then project just the name and age columns.
# 'Age' resolves to the 'age' column (the lookup is case-insensitive here) and
# the output header uses the spelling passed to select().
(
    df_pyspark
    .filter("Salary>=20000")
    .select(['Name','Age'])
    .show()
)

+---------+---+
|     Name|Age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
+---------+---+



In [21]:
# Same output as the string-condition filter above, but the condition is written
# as a column expression, which reads much like pandas boolean indexing.
(
    df_pyspark
    .filter(df_pyspark["Salary"] >= 20000)
    .select(['Name','Age'])
    .show()
)

+---------+---+
|     Name|Age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
+---------+---+



In [24]:
# Combine two column conditions with & (logical AND). Each comparison needs its
# own parentheses because & binds tighter than > in Python.
(
    df_pyspark
    .filter(
        (df_pyspark['Salary'] > 20000)
        & (df_pyspark['Age'] > 24)
    )
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



In [26]:
# Negation (~) of the combined condition from the previous cell: keeps every row
# that does NOT have both Salary > 20000 and Age > 24.
(
    df_pyspark
    .filter(
        ~(
            (df_pyspark['Salary'] > 20000)
            & (df_pyspark['Age'] > 24)
        )
    )
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [27]:
# GroupBy and aggregate functions using PySpark

In [28]:
import pyspark

In [29]:
from pyspark.sql import SparkSession 

In [30]:
# NOTE(review): a session was already created earlier in this notebook with
# appName 'dataframe'; getOrCreate() presumably hands back that same session,
# so the "dataframe1" name may not take effect — confirm.
spark = (
    SparkSession.builder
    .appName("dataframe1")
    .getOrCreate()
)

In [31]:
# Bare expression: the notebook renders the session object's repr as this cell's output.
spark

In [36]:
# Load test3.csv (name / department / salary records): header row gives the
# column names and Spark infers the column types.
df_spark = spark.read.csv(
    "test3.csv",
    header=True,
    inferSchema=True,
)

In [37]:
# Print the DataFrame's rows as a text table.
df_spark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [38]:
# Print the schema tree: each column's name, inferred type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [39]:
# Called without an argument, head() returns the first record as a single Row object.
df_spark.head()

Row(Name='Krish', Departments='Data Science', salary=10000)

In [40]:
# List the column names of the loaded DataFrame.
df_spark.columns

['Name', 'Departments', 'salary']

In [42]:
df_spark.groupby("Name").sum().show() #grouped by Name to find each person's total salary (a sum, not a maximum)

+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [43]:
# Highest salary within each department.
(
    df_spark
    .groupby("Departments")
    .max()
    .show()
)

+------------+-----------+
| Departments|max(salary)|
+------------+-----------+
|         IOT|      10000|
|    Big Data|       5000|
|Data Science|      20000|
+------------+-----------+



In [44]:
# Average salary per department (reported in a column named avg(salary)).
(
    df_spark
    .groupby("Departments")
    .mean()
    .show()
)

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [45]:
# Number of rows recorded for each department.
(
    df_spark
    .groupby("Departments")
    .count()
    .show()
)

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [52]:
# Aggregate over the whole DataFrame with no grouping: the dict maps a column
# name to the aggregate to apply. 'Salary' matches the 'salary' column from the
# schema (case-insensitive lookup here), giving a single-row total.
(
    df_spark
    .agg({'Salary':'sum'})
    .show()
)

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [54]:
# Lowest single salary entry recorded for each person.
(
    df_spark
    .groupby("Name")
    .min()
    .show()
)

+---------+-----------+
|     Name|min(salary)|
+---------+-----------+
|Sudhanshu|       5000|
|    Sunny|       2000|
|    Krish|       4000|
|   Mahesh|       3000|
+---------+-----------+

