<a href="https://colab.research.google.com/github/pallavichandan/PySpark_Tutorial/blob/basics/Pyspark_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing and importing PySpark

In [1]:
# Uncomment the next line on a fresh runtime (e.g. Colab) where PySpark is not installed yet.
# !pip install pyspark
import pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5faf03079da5b708269b1fe3d9cee395d411103130885c67177ee32896ebbe49
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [10]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session for this notebook; the application is named "practice".
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

Creating a PySpark DataFrame from a list of tuples

In [None]:
# One tuple per row: (Id, Name, Location).
data = [
    (1, "John", "NY"),
    (2, "Alan", "DC"),
    (3, "Dabby", "NJ"),
    (4, "Joey", "NY"),
]

In [None]:
# Column names, listed in the order the values appear in each row tuple.
columns = [
    "Id",
    "Name",
    "Location",
]

In [None]:
# Build a DataFrame from the row tuples in `data`, taking the column names from `columns`.
df = spark.createDataFrame(data, schema=columns)

In [None]:
# Print the rows of `df` as a text table.
df.show()

+---+-----+--------+
| Id| Name|Location|
+---+-----+--------+
|  1| John|      NY|
|  2| Alan|      DC|
|  3|Dabby|      NJ|
|  4| Joey|      NY|
+---+-----+--------+



In [None]:
# describe() returns a new DataFrame (a `summary` column plus one string column per
# input column). As the last expression of the cell only its repr is displayed,
# i.e. the column names and types, not the statistics themselves.
# NOTE(review): chain .show() here if the summary values should be printed.
df.describe()

DataFrame[summary: string, Id: string, Name: string, Location: string]

In [None]:
# Print the schema as a tree: each column's name, type and nullability.
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Location: string (nullable = true)



Creating a PySpark DataFrame from multiple lists

In [None]:
# Three parallel lists, one per column; position i across the lists forms row i.
Id, Name, Location = (
    [1, 2, 3, 4],
    ["John", "Alan", "Dabby", "Joey"],
    ["NY", "DC", "NJ", "NY"],
)

Zipping the lists into a single DataFrame

In [None]:
# zip() pairs the i-th entries of the three lists into one row each;
# the second argument supplies the column names.
df1 = spark.createDataFrame(
    zip(Id, Name, Location),
    schema=["Id", "Name", "Location"],
)

In [None]:
# Print the rows of `df1` as a text table.
df1.show()

+---+-----+--------+
| Id| Name|Location|
+---+-----+--------+
|  1| John|      NY|
|  2| Alan|      DC|
|  3|Dabby|      NJ|
|  4| Joey|      NY|
+---+-----+--------+



Creating a DataFrame by defining an explicit schema

In [None]:
from pyspark.sql.types import *

In [None]:
# One list per customer, holding the Id, Name and Location values in that order.
customer1 = [1, "John", "NY"]
customer2 = [2, "Alan", "DC"]
customer3 = [3, "Dabby", "NJ"]
customer4 = [4, "Joey", "NY"]

In [None]:
# Explicit schema: an integer `Id` column followed by string `Name` and `Location`
# columns; every column is declared nullable.
schema = StructType(
    [
        StructField(name="Id", dataType=IntegerType(), nullable=True),
        StructField(name="Name", dataType=StringType(), nullable=True),
        StructField(name="Location", dataType=StringType(), nullable=True),
    ]
)

Converting each list to a tuple

In [None]:
# Turn each customer list into a tuple so that every row of `data` is a tuple.
data = [
    tuple(customer)
    for customer in (customer1, customer2, customer3, customer4)
]

In [None]:
# Build the DataFrame from the row tuples, using the explicit schema
# instead of a plain list of column names.
df2 = spark.createDataFrame(
    data=data,
    schema=schema,
)

In [None]:
# Print the rows of `df2` as a text table.
df2.show()

+---+-----+--------+
| Id| Name|Location|
+---+-----+--------+
|  1| John|      NY|
|  2| Alan|      DC|
|  3|Dabby|      NJ|
|  4| Joey|      NY|
+---+-----+--------+



Creating a DataFrame with a dictionary (map) column

In [4]:
# Each row is (name, properties), where properties is a dict with 'hair' and 'eyes' keys.
# Note the two kinds of "missing" eye colour: None for Sammy and an empty string for John.
data = [
    ('James', {'hair': 'brown', 'eyes': 'black'}),
    ('Joey', {'hair': 'blonde', 'eyes': 'brown'}),
    ('Sammy', {'hair': 'red', 'eyes': None}),
    ('Rachel', {'hair': 'black', 'eyes': 'black'}),
    ('John', {'hair': 'black', 'eyes': ''}),
]

In [5]:
# Only column names are given here, so the column types are left for Spark to infer.
dict_df = spark.createDataFrame(
    data=data,
    schema=['Name', 'Properties'],
)

In [6]:
# Print the rows of `dict_df` as a text table; long cell values are cut off with "...".
dict_df.show()

+------+--------------------+
|  Name|          Properties|
+------+--------------------+
| James|{eyes -> black, h...|
|  Joey|{eyes -> brown, h...|
| Sammy|{eyes -> NULL, ha...|
|Rachel|{eyes -> black, h...|
|  John|{eyes -> , hair -...|
+------+--------------------+



In [8]:
# Print the schema tree; `Properties` shows up as a map with string keys and string values.
dict_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Creating the same DataFrame by defining an explicit schema

In [13]:
# Import only the type classes this schema needs rather than using a wildcard,
# so it is clear where each name comes from.
from pyspark.sql.types import MapType, StringType, StructField, StructType

# Explicit schema: a string `Name` column and a `Properties` map column with
# string keys and string values; both columns are declared nullable.
schema = StructType([
    StructField('Name', StringType(), True),
    StructField('Properties', MapType(StringType(), StringType()), True),
])


In [19]:
# Same rows as before, but with the explicit schema instead of inferred column types.
dict_df1 = spark.createDataFrame(
    data=data,
    schema=schema,
)

In [20]:
# Print the rows of `dict_df1` as a text table; long cell values are cut off with "...".
dict_df1.show()

+------+--------------------+
|  Name|          Properties|
+------+--------------------+
| James|{eyes -> black, h...|
|  Joey|{eyes -> brown, h...|
| Sammy|{eyes -> NULL, ha...|
|Rachel|{eyes -> black, h...|
|  John|{eyes -> , hair -...|
+------+--------------------+



In [21]:
# Print the schema tree; `Properties` shows up as a map with string keys and string values.
dict_df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Extracting map values into separate DataFrame columns

In [27]:
# Flatten the map column: copy the 'hair' and 'eyes' entries of `Properties`
# into their own columns, then drop the map column itself.
#
# The guard keeps this cell safe to re-run. It reassigns `dict_df1`, and after
# the first run `Properties` no longer exists, so an unguarded second run would
# fail when it looks up `dict_df1.Properties`.
if "Properties" in dict_df1.columns:
    dict_df1 = (
        dict_df1
        .withColumn('hair', dict_df1.Properties.getItem('hair'))
        .withColumn('eyes', dict_df1.Properties.getItem('eyes'))
        .drop("Properties")
    )

In [28]:
# Print the flattened rows: `Name` plus the extracted `hair` and `eyes` columns.
dict_df1.show()

+------+------+-----+
|  Name|  hair| eyes|
+------+------+-----+
| James| brown|black|
|  Joey|blonde|brown|
| Sammy|   red| NULL|
|Rachel| black|black|
|  John| black|     |
+------+------+-----+



In [29]:
# Print the schema tree of the flattened DataFrame; all three columns are strings.
dict_df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- hair: string (nullable = true)
 |-- eyes: string (nullable = true)

