### In This Video We Will Cover

* PySpark Dataframe
* Reading The Dataset
* Checking the Datatypes of the Columns (Schema)
* Selecting Columns And Indexing
* Checking the Describe Option (Similar to Pandas)
* Adding columns
* Dropping columns
* Renaming columns

In [1]:
import pyspark

In [2]:
import pandas as pd

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build the SparkSession (or reuse one that already exists) for this notebook
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [5]:
# Evaluate the session object so the notebook displays it as the cell's output
spark

In [6]:
# PySpark DataFrame: read the dataset
# The 'header' option is set so the CSV's first line supplies the column names

csv_reader = spark.read.option('header', 'true')
df = csv_reader.csv('test1_edit.csv')
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [7]:
# Print schema: every column is reported as string for this read

df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [8]:
# By default, every column is read as a string unless inferSchema is enabled while reading

# With inferring schema

In [9]:
# Same CSV file, now with schema inference switched on as a reader option
df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test1_edit.csv')
)
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [10]:
# Schema after reading with inferSchema: Age and Experience are now integer
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# The same read in a single call: pass the options to csv() as keyword arguments
df2 = spark.read.csv(
    'test1_edit.csv',
    header=True,
    inferSchema=True,
)
df2.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [12]:
# df2 has the same inferred schema: Name is string, Age and Experience are integer
df2.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# The object returned by the reader is a pyspark.sql DataFrame
type(df2)

pyspark.sql.dataframe.DataFrame

In [14]:
# List of column names
df2.columns

['Name', 'Age', 'Experience']

In [15]:
# head(3) gives the first 3 rows back as a list of Row objects
df2.head(3)

[Row(Name='Krish', Age=31, Experience=10),
 Row(Name='Sudhanshu', Age=30, Experience=8),
 Row(Name='Sunny', Age=29, Experience=4)]

In [16]:
# show() prints the rows as a formatted text table
df2.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [17]:
# Selecting columns and indexing
# select() with a single column name
name_only = df2.select('Name')
name_only.show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
+---------+



In [18]:
# select() returns another DataFrame, even for a single column
type(df2.select('Name'))

pyspark.sql.dataframe.DataFrame

In [19]:
# Select several columns at once by passing their names as a list
df2.select(['Name', 'Age']).show()

+---------+---+
|     Name|Age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
+---------+---+



In [20]:
# Indexing by column name yields a Column object rather than a DataFrame
df2['Name']

Column<'Name'>

In [21]:
# Checking the datatypes of the columns (schema)
# dtypes is a list of (column name, type name) tuples
df2.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [22]:
# Check the describe option, similar to pandas
# describe() itself only returns a DataFrame of summary strings; nothing is printed yet
df2.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [23]:
# Show the summary: count, mean, stddev, min and max for each column
df2.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



In [24]:
# Adding columns

In [25]:
# Build the new column's expression first, then attach it under its new name;
# withColumn hands back the result as a new DataFrame, stored here as df3
experience_plus_two = df2['Experience'] + 2
df3 = df2.withColumn('Experience After 2 Years', experience_plus_two)

In [26]:
# The new 'Experience After 2 Years' column appears alongside the original three
df3.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|    Krish| 31|        10|                      12|
|Sudhanshu| 30|         8|                      10|
|    Sunny| 29|         4|                       6|
+---------+---+----------+------------------------+



In [27]:
# Dropping Columns

In [28]:
# drop() removes the named column; the result is kept in df4
df4 = df3.drop('Experience After 2 Years')

In [29]:
# Back to the three original columns
df4.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [30]:
# Renaming columns

In [31]:
# Rename the 'Name' column; the result is stored as df5 and displayed
old_name, new_name = 'Name', 'New Name'
df5 = df4.withColumnRenamed(old_name, new_name)
df5.show()

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+

