# PySpark Introduction and Installation

https://spark.apache.org/

In [1]:
# install pyspark library
!pip install pyspark





In [3]:
# load library
import pyspark as ps
import pandas as pd

In [19]:
# Load the Excel workbook with pandas and save a CSV copy of it.
# NOTE: to_csv runs with its defaults here, so the row index is written as an
# extra, unnamed first column of test.csv.
pd_df = pd.read_excel(io="test.xlsx")
pd_df.to_csv(path_or_buf="test.csv")
pd_df

Unnamed: 0,Name,Sname,Age,Gender,Email_ID,Adderess,Profile,Experience
0,kiran,mungkar,29,m,kiran@gmail.com,virar,data analyst,1.8
1,kapil,nargund,27,m,kapil@gmail.com,vasai,data analyst,3.2
2,samira,sha,28,f,samira@yahoo.com,borivali,teacher,2.8
3,vidhan,wani,27,m,vidhan@hotmail.com,bhayandar,automation developer,1.5
4,abhijit,panpatil,27,m,abhijit@gmail.com,palghar,shell scripting engineer,4.5
5,namrata,deshamukh,28,f,manrata@yahoo.com,borivali,front end developer,2.6
6,neha,raut,27,f,neha@hotmail.com,virar,teacher,4.8
7,shubham,neve,26,m,shubham@hotmail.com,palghar,automation developer,3.5
8,darshan,sha,27,m,darshan@gmail.com,malad,.net devloper,3.4
9,anuj,kumar,28,m,anuj@yahoo.com,nalasopara,backend developer,4.7


# To work with Spark, first start a Spark session

In [8]:
# load SparkSession
from pyspark.sql import SparkSession

In [9]:
spark = (
    SparkSession.builder
    .appName('practice')  # give the session a name, i.e. 'practice'
    .getOrCreate()        # then create the Spark session
)
spark

When you work locally you see only a single cluster, but when you work in the cloud you can create multiple clusters and instances.

In [20]:
# Load test.csv into a Spark DataFrame. No reader options are set, so the
# columns get the generated names _c0, _c1, ... and are all typed as string.
ps_df = spark.read.csv(path='test.csv')
ps_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string]

In [21]:
# Print the DataFrame rows as a text table. Because the file was read without
# the header option, the CSV's header line appears as the first data row.
ps_df.show()

+----+--------+---------+---+------+-------------------+----------+--------------------+----------+
| _c0|     _c1|      _c2|_c3|   _c4|                _c5|       _c6|                 _c7|       _c8|
+----+--------+---------+---+------+-------------------+----------+--------------------+----------+
|null|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
|   0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|   1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|   2|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|   3|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|   4| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|   5| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|


In [23]:
# Read the CSV again, this time using its first line as the column names.
ps_df = spark.read.csv("test.csv", header=True)
ps_df

DataFrame[_c0: string, Name: string, Sname: string, Age: string, Gender: string, Email_ID: string, Adderess: string, Profile: string, Experience: string]

In [25]:
# Print the rows again: the first CSV line is now used for the column names.
ps_df.show()

+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|_c0|    Name|    Sname|Age|Gender|           Email_ID|  Adderess|             Profile|Experience|
+---+--------+---------+---+------+-------------------+----------+--------------------+----------+
|  0|   kiran|  mungkar| 29|     m|    kiran@gmail.com|     virar|        data analyst|       1.8|
|  1|   kapil|  nargund| 27|     m|    kapil@gmail.com|     vasai|        data analyst|       3.2|
|  2|  samira|      sha| 28|     f|   samira@yahoo.com|  borivali|             teacher|       2.8|
|  3|  vidhan|     wani| 27|     m| vidhan@hotmail.com| bhayandar|automation developer|       1.5|
|  4| abhijit| panpatil| 27|     m|  abhijit@gmail.com|   palghar|shell scripting e...|       4.5|
|  5| namrata|deshamukh| 28|     f|  manrata@yahoo.com|  borivali| front end developer|       2.6|
|  6|    neha|     raut| 27|     f|   neha@hotmail.com|     virar|             teacher|       4.8|
|  7| shub

In [27]:
# Compare the Python classes of the Spark DataFrame and the pandas DataFrame.
(type(ps_df), type(pd_df))

(pyspark.sql.dataframe.DataFrame, pandas.core.frame.DataFrame)

In [28]:
# head() called without an argument returns only the first row, as a Row object
ps_df.head()

Row(_c0='0', Name='kiran', Sname='mungkar', Age='29', Gender='m', Email_ID='kiran@gmail.com', Adderess='virar', Profile='data analyst', Experience='1.8')

In [29]:
# Print the schema: each column's name, data type and nullability.
# Every column is a string here, since the CSV was read without supplying or
# inferring a schema.
ps_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sname: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Email_ID: string (nullable = true)
 |-- Adderess: string (nullable = true)
 |-- Profile: string (nullable = true)
 |-- Experience: string (nullable = true)

