# Pyspark Introduction and Installation

https://spark.apache.org/

In [1]:
# install pyspark library
# !pip install pyspark





In [3]:
# load library
import pyspark as ps
import pandas as pd

In [45]:
# read data file using pandas
pd_df = pd.read_excel("test.xlsx")
pd_df.to_csv("test.csv")
pd_df

Unnamed: 0,Name,Sname,Age,Gender,Email_ID,Adderess,Profile,Experience
0,kiran,mungkar,29,m,kiran@gmail.com,virar,data analyst,1.8
1,kapil,nargund,27,m,kapil@gmail.com,vasai,data analyst,3.2
2,vinod,kadam,31,m,vinod#gmail.com,borivali,teacher,2.8
3,samira,sha,28,f,samira@yahoo.com,borivali,teacher,2.8
4,vidhan,wani,27,m,vidhan@hotmail.com,bhayandar,automation developer,1.5
5,abhijit,panpatil,27,m,abhijit@gmail.com,palghar,shell scripting engineer,4.5
6,namrata,deshamukh,28,f,manrata@yahoo.com,borivali,front end developer,2.6
7,neha,raut,27,f,neha@hotmail.com,virar,teacher,4.8
8,shubham,neve,26,m,shubham@hotmail.com,palghar,automation developer,3.5
9,darshan,sha,27,m,darshan@gmail.com,malad,.net devloper,3.4


# To work with spark 1st start the spark session

In [8]:
# load SparkSession
from pyspark.sql import SparkSession

In [9]:
spark = SparkSession.builder.appName('practice').getOrCreate() 
# give session a name ie. 'practice'
# and then create spark session
spark

when you working on local you can see only one cluster nut when you working on cloud you can create multiple cluster and instances

In [46]:
# read data file using sparks
ps_df = spark.read.csv('test.csv')
ps_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string]

In [47]:
ps_df.show()

+----+--------+---------+---+------+-------------------+----------+--------------------+----------+
| _c0|     _c1|      _c2|_c3|   _c4|                _c5|       _c6|                 _c7|       _c8|
+----+--------+---------+---+------+-------------------+----------+--------------------+----------+
|null|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
|   0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|   1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|   2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|   3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|   4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|   5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|


In [48]:
# to show / select the 1st row as header
ps_df = spark.read.option("header", "true").csv("test.csv")
ps_df

DataFrame[_c0: string, Name: string, Sname: string, Age: string, Gender: string, Email_ID: string, Adderess: string, Profile: string, Experience: string]

In [49]:
ps_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    n

In [50]:
# check the data type 
type(ps_df), type(pd_df) 

(pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame)

In [51]:
# to show few / 1st row information
ps_df.head()

Row(_c0='0', Name='kiran', Sname='mungkar', Age='29', Gender='m', Email_ID='kiran@gmail.com', Adderess='virar', Profile='data analyst', Experience='1.8')

In [52]:
# to show information about collumns
ps_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sname: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Email_ID: string (nullable = true)
 |-- Adderess: string (nullable = true)
 |-- Profile: string (nullable = true)
 |-- Experience: string (nullable = true)



# Pyspark DataFrames

- Pyspark DataFrames
- Reading the dataset
- Checking the datatypes of the collumns (schema)
- Selecting columns and indexing
- Check describe option as similar to pandas
- Adding collumns
- Droping collumns
- Rename the column

In [53]:
# create sparks session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('practice').getOrCreate() 
spark

In [54]:
# read dataset
ps_df = spark.read.option("header", "true").csv('test.csv')
ps_df

DataFrame[_c0: string, Name: string, Sname: string, Age: string, Gender: string, Email_ID: string, Adderess: string, Profile: string, Experience: string]

In [55]:
ps_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    n

In [56]:
# check schema
ps_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sname: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Email_ID: string (nullable = true)
 |-- Adderess: string (nullable = true)
 |-- Profile: string (nullable = true)
 |-- Experience: string (nullable = true)



by default all collumns data type are string to set it use .option("inferSchema", "true") or inferSchema=True.

In [74]:
# read dataset
ps_df = spark.read.csv('test.csv', header = True,  inferSchema = True)
ps_df

DataFrame[_c0: int, Name: string, Sname: string, Age: int, Gender: string, Email_ID: string, Adderess: string, Profile: string, Experience: double]

In [75]:
# check schema
ps_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sname: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Email_ID: string (nullable = true)
 |-- Adderess: string (nullable = true)
 |-- Profile: string (nullable = true)
 |-- Experience: double (nullable = true)



In [40]:
# check type
type(ps_df)

pyspark.sql.dataframe.DataFrame

- DataFrame : is a data structures and inside that we can perform different operations.
- DataFrame : is a data structure that organizes data into a 2-dimensional table of rows and columns, much like a spreadsheet.

In [41]:
# selecting collumns and indexing 
# how to get all the collumns
ps_df.columns

['_c0',
 'Name',
 'Sname',
 'Age',
 'Gender',
 'Email_ID',
 'Adderess',
 'Profile',
 'Experience']

In [42]:
ps_df.head() # default 1 record

Row(_c0=0, Name='kiran', Sname='mungkar', Age=29, Gender='m', Email_ID='kiran@gmail.com', Adderess='virar', Profile='data analyst', Experience=1.8)

In [43]:
ps_df.head(3) # show 3 records

[Row(_c0=0, Name='kiran', Sname='mungkar', Age=29, Gender='m', Email_ID='kiran@gmail.com', Adderess='virar', Profile='data analyst', Experience=1.8),
 Row(_c0=1, Name='kapil', Sname='nargund', Age=27, Gender='m', Email_ID='kapil@gmail.com', Adderess='vasai', Profile='data analyst', Experience=3.2),
 Row(_c0=2, Name='samira', Sname='sha', Age=28, Gender='f', Email_ID='samira@yahoo.com', Adderess='borivali', Profile='teacher', Experience=2.8)]

In [57]:
# to select the Name collumn
ps_df.select("Name")

DataFrame[Name: string]

In [58]:
ps_df.select("Name").show()

+--------+
|    Name|
+--------+
|   kiran|
|   kapil|
|   vinod|
|  samira|
|  vidhan|
| abhijit|
| namrata|
|    neha|
| shubham|
| darshan|
|    anuj|
|harshali|
|  nitesh|
|  nitesh|
|   vidit|
|shradhha|
|   jinal|
|   rohit|
|siddhesh|
|   nidhi|
+--------+
only showing top 20 rows



In [61]:
name_df = ps_df.select("Name")
name_df.show()

+--------+
|    Name|
+--------+
|   kiran|
|   kapil|
|   vinod|
|  samira|
|  vidhan|
| abhijit|
| namrata|
|    neha|
| shubham|
| darshan|
|    anuj|
|harshali|
|  nitesh|
|  nitesh|
|   vidit|
|shradhha|
|   jinal|
|   rohit|
|siddhesh|
|   nidhi|
+--------+
only showing top 20 rows



In [60]:
type(name_df)

pyspark.sql.dataframe.DataFrame

In [66]:
# select multiple collumns
cols_df = ps_df.select(["Name", "Age"])
cols_df.show()

+--------+---+
|    Name|Age|
+--------+---+
|   kiran| 29|
|   kapil| 27|
|   vinod| 31|
|  samira| 28|
|  vidhan| 27|
| abhijit| 27|
| namrata| 28|
|    neha| 27|
| shubham| 26|
| darshan| 27|
|    anuj| 28|
|harshali| 27|
|  nitesh| 26|
|  nitesh| 26|
|   vidit| 29|
|shradhha| 28|
|   jinal| 30|
|   rohit| 28|
|siddhesh| 28|
|   nidhi| 28|
+--------+---+
only showing top 20 rows



In [71]:
# dtypes
ps_df.dtypes

[('_c0', 'int'),
 ('Name', 'string'),
 ('Sname', 'string'),
 ('Age', 'int'),
 ('Gender', 'string'),
 ('Email_ID', 'string'),
 ('Adderess', 'string'),
 ('Profile', 'string'),
 ('Experience', 'double')]

In [78]:
# Check describe option as similar to pandas
# ps_df.describe().show()

In [80]:
# Adding colloumns in dataframe
EXP_after_2year_df = ps_df.withColumn("Exp. after 2 year", ps_df["Experience"]+2)

In [81]:
EXP_after_2year_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+-----------------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|Exp. after 2 year|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+-----------------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|              3.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|              5.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|              4.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|              4.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|              3.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palgha

In [82]:
# drop the columns
drop_df = EXP_after_2year_df.drop("Exp. after 2 year")
drop_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    n

In [83]:
# Rename the column
rename_col_df = drop_df.withColumnRenamed("Name", "F_Name")
rename_col_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|  F_Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    n

# Pyspark Handling Missing Values

- Dropping columns
- Droping rows
- Various parameter in dropping functionalities
- Handelingmissing values by mean, median and mode

In [2]:
# read excel and convert to csv
import pandas as pd
# read .xlsx
df = pd.read_excel('test1.xlsx')
# to save in .csv
df.to_csv('test1.csv')

In [7]:
# connect seseeion
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practrice').getOrCreate()

# read .csv file 
ps_df = spark.read.csv('test1.csv', header=True, inferSchema=True)
ps_df.show()

+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|    null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|      null|
|  4|  vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil|null|     m|  abhijit@gmail.com|   palghar|                null|       4.5|
|  6| namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|


In [8]:
# drop the column
drop_column_df = ps_df.drop('Experience')
drop_column_df.show()

+---+--------+---------+----+------+-------------------+----------+--------------------+
|_c0|    Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|
+---+--------+---------+----+------+-------------------+----------+--------------------+
|  0|   kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|
|  1|   kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|
|  2|   vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|
|  3|    null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|
|  4|  vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|
|  5| abhijit| panpatil|null|     m|  abhijit@gmail.com|   palghar|                null|
|  6| namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|
|  7|    neha|     raut|27.0|     f|   neha@hotmail.com|     virar|             teacher|
|  8| shubham|     ne

In [9]:
# drop rows based on null values 
ps_df.na.drop().show()

+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|     Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|    kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|    kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|    vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  6|  namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|     neha|     raut|27.0|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8|  shubham|     neve|26.0|     m|shubham@hotmail.com|   palghar|automation developer|       3.5|
| 10|     anuj|    kumar|28.0|     m|     anuj@yahoo.com|nalasopara|   backend developer|  

In [10]:
# ps_df.na.drop(how='any', thresh=None, subset=None)
# how: 'any' or 'all'.
# If 'any', drop a row if it contains any nulls.
# If 'all', drop a row only if all its values are null.
# drop when how == all OR how == any
ps_df.na.drop(how='any').show()

+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|     Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|    kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|    kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|    vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  6|  namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|     neha|     raut|27.0|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8|  shubham|     neve|26.0|     m|shubham@hotmail.com|   palghar|automation developer|       3.5|
| 10|     anuj|    kumar|28.0|     m|     anuj@yahoo.com|nalasopara|   backend developer|  

In [11]:
ps_df.na.drop(how='all').show()

+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|    null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|      null|
|  4|  vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil|null|     m|  abhijit@gmail.com|   palghar|                null|       4.5|
|  6| namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|


In [12]:
# thresh: int, default None
# If specified, drop rows that have less than `thresh` non-null values.
# This overwrites the `how` parameter.
ps_df.na.drop(how='any', thresh=2).show()

+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|    null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|      null|
|  4|  vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil|null|     m|  abhijit@gmail.com|   palghar|                null|       4.5|
|  6| namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|


In [17]:
ps_df.na.drop(how='any', thresh=2).show(40)

+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|     Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|    kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|    kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|    vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|     null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|      null|
|  4|   vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5|  abhijit| panpatil|null|     m|  abhijit@gmail.com|   palghar|                null|       4.5|
|  6|  namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|  

In [21]:
# subset: optional list of column names to consider
ps_df.na.drop(subset=['Email_ID', 'Profile', 'Experience']).show(40)

+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|_c0|     Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+---------+---------+----+------+-------------------+----------+--------------------+----------+
|  0|    kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|    kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|    vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  4|   vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  6|  namrata|deshamukh|28.0|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|     neha|     raut|27.0|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8|  shubham|     neve|26.0|     m|shubham@hotmail.com|   palghar|automation developer|  

In [22]:
# fill the missing values
ps_df.na.fill('Missing Value').show(40)

+---+-------------+-------------+----+-------------+-------------------+-------------+--------------------+----------+
|_c0|         Name|        Sname| Age|       Gender|           Email_ID|     Adderess|             Profile|Experience|
+---+-------------+-------------+----+-------------+-------------------+-------------+--------------------+----------+
|  0|        kiran|      mungkar|29.0|            m|    kiran@gmail.com|        virar|        data analyst|       1.8|
|  1|        kapil|      nargund|27.0|            m|    kapil@gmail.com|        vasai|        data analyst|       3.2|
|  2|        vinod|        kadam|31.0|            m|    vinod#gmail.com|     borivali|             teacher|       2.8|
|  3|Missing Value|          sha|28.0|            f|   samira@yahoo.com|     borivali|             teacher|      null|
|  4|       vidhan|         wani|null|            m| vidhan@hotmail.com|    bhayandar|automation developer|       1.5|
|  5|      abhijit|     panpatil|null|          

In [24]:
ps_df.na.fill('Missing Value', ['Name', 'Gender']).show(40)

+---+-------------+---------+----+-------------+-------------------+----------+--------------------+----------+
|_c0|         Name|    Sname| Age|       Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+-------------+---------+----+-------------+-------------------+----------+--------------------+----------+
|  0|        kiran|  mungkar|29.0|            m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|        kapil|  nargund|27.0|            m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|        vinod|    kadam|31.0|            m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|Missing Value|      sha|28.0|            f|   samira@yahoo.com|  borivali|             teacher|      null|
|  4|       vidhan|     wani|null|            m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5|      abhijit| panpatil|null|            m|  abhijit@gmail.com|   palghar|                null|    

In [30]:
# fill null value by using imputer function
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols = ['Age', 'Experience'], 
                  outputCols = [c+'_Impu' for c in ['Age', 'Experience']]
                 ).setStrategy('median') # 'mean' 'median' 'mode'

# add imputer columns to database
imputer.fit(ps_df).transform(ps_df).show(40)

# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.Imputer.html#pyspark.ml.feature.Imputer.strategy

+---+---------+---------+----+------+-------------------+----------+--------------------+----------+--------+---------------+
|_c0|     Name|    Sname| Age|Gender|           Email_ID|  Adderess|             Profile|Experience|Age_Impu|Experience_Impu|
+---+---------+---------+----+------+-------------------+----------+--------------------+----------+--------+---------------+
|  0|    kiran|  mungkar|29.0|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|    29.0|            1.8|
|  1|    kapil|  nargund|27.0|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|    27.0|            3.2|
|  2|    vinod|    kadam|31.0|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|    31.0|            2.8|
|  3|     null|      sha|28.0|     f|   samira@yahoo.com|  borivali|             teacher|      null|    28.0|            3.2|
|  4|   vidhan|     wani|null|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|    28.0|         

# Pyspark Filter Operations

- Filter Opration
- &, |, ==
- ~

In [33]:
ps_df = spark.read.csv('test.csv', header=True, inferSchema=True)
ps_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    n

In [35]:
# apply filter operation
# 1. find data who's age is less than and equal to 28
ps_df.filter("Age<=28").show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    neha|     raut| 27|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8| shubham|     neve| 26|     m|shubham@hotmail.com|   palghar|automation developer|       3.5|
|  9| dars

In [36]:
# show only name and age columns
ps_df.filter("Age<=28").select(['Name', 'Age']).show()

+--------+---+
|    Name|Age|
+--------+---+
|   kapil| 27|
|  samira| 28|
|  vidhan| 27|
| abhijit| 27|
| namrata| 28|
|    neha| 27|
| shubham| 26|
| darshan| 27|
|    anuj| 28|
|harshali| 27|
|  nitesh| 26|
|  nitesh| 26|
|shradhha| 28|
|   rohit| 28|
|siddhesh| 28|
|   nidhi| 28|
|  snehal| 27|
+--------+---+



In [37]:
ps_df.filter(ps_df["Age"]<=28).show() # same output in diffrent way

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    neha|     raut| 27|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8| shubham|     neve| 26|     m|shubham@hotmail.com|   palghar|automation developer|       3.5|
|  9| dars

In [39]:
# multiple condition in one filter
# 2. find data who's age is less than and equal to 28 and experience is greater than 3 year.
ps_df.filter((ps_df["Age"]<=28) & (ps_df["Experience"]>3)).show()

+---+--------+--------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|   Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+--------+---+------+-------------------+----------+--------------------+----------+
|  1|   kapil| nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  5| abhijit|panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  7|    neha|    raut| 27|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8| shubham|    neve| 26|     m|shubham@hotmail.com|   palghar|automation developer|       3.5|
|  9| darshan|     sha| 27|     m|  darshan@gmail.com|     malad|       .net devloper|       3.4|
| 10|    anuj|   kumar| 28|     m|     anuj@yahoo.com|nalasopara|   backend developer|       4.7|
| 11|harshali|   desai| 27|     f| harshali@gmail.com|  borivali|      java developer|       4.1|
| 15|shradhha|  maha

In [40]:
# 3. find data who's age is less than and equal to 28 or greater than 30.
ps_df.filter((ps_df["Age"]<=28) | (ps_df["Age"]>30)).show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|   vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8|
|  3|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  4|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  5| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  6| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  7|    neha|     raut| 27|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  8| shub

In [41]:
# 3. find data who's age is not less than and equal to 28 or greater than 30.
ps_df.filter(~((ps_df["Age"]<=28) | (ps_df["Age"]>30))).show()

+---+-----+-------+---+------+---------------+--------+-----------------+----------+
|_c0| Name|  Sname|Age|Gender|       Email_ID|Adderess|          Profile|Experience|
+---+-----+-------+---+------+---------------+--------+-----------------+----------+
|  0|kiran|mungkar| 29|     m|kiran@gmail.com|   virar|     data analyst|       1.8|
| 14|vidit|    sha| 29|     m|vidit@yahoo.com|borivali| business analyst|       4.0|
| 16|jinal|   jain| 30|     f|jinal@gmail.com|   malad|  software tester|       3.2|
| 20|harsh|divecha| 30|     m|harsh@yahoo.com|   vasai|software security|       4.1|
| 23|sonal|gondane| 30|     f|sonal@gmail.com|borivali|          teacher|       4.3|
+---+-----+-------+---+------+---------------+--------+-----------------+----------+



# Pyspark GroupBy And Aggregate Functions

In [1]:
# https://www.analyticsvidhya.com/blog/2022/05/data-preprocessing-using-pyspark-handling-missing-values/
# read excel and convert to csv
import pandas as pd
# read .xlsx
df = pd.read_excel('test2.xlsx')
# to save in .csv
df.to_csv('test2.csv')

In [2]:
# connect seseeion
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practrice').getOrCreate()

# read .csv file 
ps_df = spark.read.csv('test2.csv', header=True, inferSchema=True)
ps_df.show(40)

+---+---------+---------+---+------+-------------------+----------+--------------------+----------+------+
|_c0|     Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|Salary|
+---+---------+---------+---+------+-------------------+----------+--------------------+----------+------+
|  0|    kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8| 94892|
|  1|    kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2| 93239|
|  2|    vinod|    kadam| 31|     m|    vinod#gmail.com|  borivali|             teacher|       2.8| 83308|
|  3|   samira|     shah| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.7| 39953|
|  4|   vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5| 70240|
|  5|  abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|automation developer|       4.5| 88133|
|  6|  namrata|deshamukh| 28|     f| 

In [3]:
ps_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sname: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Email_ID: string (nullable = true)
 |-- Adderess: string (nullable = true)
 |-- Profile: string (nullable = true)
 |-- Experience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
# groupby
# sum od salary by profile
ps_df.groupBy('Profile').sum('Salary').show()

+--------------------+-----------+
|             Profile|sum(Salary)|
+--------------------+-----------+
|             teacher|     574898|
|     software tester|     130721|
|       .net devloper|     109495|
|   software security|      69140|
|automation developer|     221434|
|    business analyst|      84386|
|        data analyst|     256141|
|        bi developer|      86432|
|      java developer|      31332|
|   backend developer|     138135|
|             maneger|     167068|
| front end developer|     164374|
+--------------------+-----------+



In [8]:
ps_df.groupBy('Profile').mean('Salary').show()

+--------------------+------------------+
|             Profile|       avg(Salary)|
+--------------------+------------------+
|             teacher|           57489.8|
|     software tester|           65360.5|
|       .net devloper|           54747.5|
|   software security|           69140.0|
|automation developer| 73811.33333333333|
|    business analyst|           42193.0|
|        data analyst| 85380.33333333333|
|        bi developer|           86432.0|
|      java developer|           31332.0|
|   backend developer|           69067.5|
|             maneger|55689.333333333336|
| front end developer|54791.333333333336|
+--------------------+------------------+



In [9]:
ps_df.groupBy('Profile').count().show()

+--------------------+-----+
|             Profile|count|
+--------------------+-----+
|             teacher|   10|
|     software tester|    2|
|       .net devloper|    2|
|   software security|    1|
|automation developer|    3|
|    business analyst|    2|
|        data analyst|    3|
|        bi developer|    1|
|      java developer|    1|
|   backend developer|    2|
|             maneger|    3|
| front end developer|    3|
+--------------------+-----+



In [11]:
ps_df.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|    2033556|
+-----------+



In [17]:
ps_df.agg({'Salary':'max'}).show()

+-----------+
|max(Salary)|
+-----------+
|      94892|
+-----------+



In [22]:
ps_df.groupBy('Profile').max('Salary').show()

+--------------------+-----------+
|             Profile|max(Salary)|
+--------------------+-----------+
|             teacher|      88794|
|     software tester|      78918|
|       .net devloper|      54845|
|   software security|      69140|
|automation developer|      88133|
|    business analyst|      43783|
|        data analyst|      94892|
|        bi developer|      86432|
|      java developer|      31332|
|   backend developer|      89113|
|             maneger|      72535|
| front end developer|      71443|
+--------------------+-----------+



In [27]:
ps_df.groupBy('Profile').agg({'Salary':'Avg'}).show()

+--------------------+------------------+
|             Profile|       avg(Salary)|
+--------------------+------------------+
|             teacher|           57489.8|
|     software tester|           65360.5|
|       .net devloper|           54747.5|
|   software security|           69140.0|
|automation developer| 73811.33333333333|
|    business analyst|           42193.0|
|        data analyst| 85380.33333333333|
|        bi developer|           86432.0|
|      java developer|           31332.0|
|   backend developer|           69067.5|
|             maneger|55689.333333333336|
| front end developer|54791.333333333336|
+--------------------+------------------+



In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("LoanStats_2018Q4.csv")
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000.0,10000.0,10000.0,36 months,10.33%,324.23,B,B1,...,,,DirectPay,N,,,,,,
1,,,2500.0,2500.0,2500.0,36 months,13.56%,84.92,C,C1,...,,,Cash,N,,,,,,
2,,,12000.0,12000.0,12000.0,60 months,13.56%,276.49,C,C1,...,,,Cash,N,,,,,,
3,,,15000.0,15000.0,14975.0,60 months,14.47%,352.69,C,C2,...,,,Cash,N,,,,,,
4,,,16000.0,16000.0,16000.0,60 months,17.97%,406.04,D,D1,...,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128411,,,10000.0,10000.0,10000.0,36 months,16.91%,356.08,C,C5,...,,,Cash,N,,,,,,
128412,,,,,,,,,,,...,,,,,,,,,,
128413,,,,,,,,,,,...,,,,,,,,,,
128414,Total amount funded in policy code 1: 2050909275,,,,,,,,,,...,,,,,,,,,,


In [3]:
len(df)*0.4

51366.4

In [6]:
df.iloc[:10000].to_csv("demo10000.csv", index=False)