In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [3]:
# Display the session object to confirm it was created.
spark

In [10]:
# Read the dataset, treating the first line of the CSV as the column names.
# inferSchema=True asks Spark to detect column types instead of loading every column as string.
# NOTE: the path below is absolute and machine-specific; adjust it when running elsewhere.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv(
        r"C:\Users\pavan\OneDrive\Desktop\prep\datasets\test1.csv",
        inferSchema=True,
    )
)

In [11]:
# Check the schema: column names, inferred types and nullability.

df_pyspark.printSchema()

root
 |-- Id : integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [13]:
# Another way to read the dataset: pass the reader options directly to csv() as keyword arguments.
df_pyspark = spark.read.csv(
    path=r"C:\Users\pavan\OneDrive\Desktop\prep\datasets\test1.csv",
    header='True',
    inferSchema=True,
)

In [14]:
# Display the rows of the DataFrame as a text table.
df_pyspark.show()

+---+-------+---+
|Id |   Name|Age|
+---+-------+---+
|  1|   john| 22|
|  2|Stephen| 22|
|  3|  Stacy| 24|
|  4|   paul| 23|
+---+-------+---+



In [15]:
# Check the schema again after re-reading the file with the keyword-argument form.

df_pyspark.printSchema()

root
 |-- Id : integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [16]:
# Confirm that the reader produced a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
# List all column names.
# Note: the first column is named 'Id ' with a trailing space (visible in the output).

df_pyspark.columns

['Id ', 'Name', 'Age']

In [19]:
# Fetch the first 3 rows.
df_pyspark.head(3)

# Unlike pandas, the rows are displayed as a list of Row objects instead of a DataFrame.

[Row(Id =1, Name='john', Age=22),
 Row(Id =2, Name='Stephen', Age=22),
 Row(Id =3, Name='Stacy', Age=24)]

In [20]:
# Pick only one column; select() returns a DataFrame, so show() can print it.

df_pyspark.select('Name').show()

+-------+
|   Name|
+-------+
|   john|
|Stephen|
|  Stacy|
|   paul|
+-------+



In [21]:
# Selecting a single column still yields a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [24]:
# Pick multiple columns by passing each column name as its own argument.
df_pyspark.select('Name', 'Age').show()

+-------+---+
|   Name|Age|
+-------+---+
|   john| 22|
|Stephen| 22|
|  Stacy| 24|
|   paul| 23|
+-------+---+



In [25]:
# Indexing with a column name returns a Column object, not a DataFrame.
# NOTE(review): the lookup uses 'name' while the column is listed as 'Name'; it resolves here,
# presumably because Spark's column resolution is case-insensitive by default -- confirm.
df_pyspark['name']

# This cannot be used to show the rows, since it is of type Column.

Column<'name'>

In [26]:
# Data types of every column, as (column name, type) pairs.

df_pyspark.dtypes

[('Id ', 'int'), ('Name', 'string'), ('Age', 'int')]

In [27]:
# describe() returns a new DataFrame of summary statistics; evaluated on its own,
# only the DataFrame's repr (its column names and types) is displayed, not the values.

df_pyspark.describe()

DataFrame[summary: string, Id : string, Name: string, Age: string]

In [28]:
df_pyspark.describe().show()

# Gives count, mean, stddev, min and max, including for string columns.
# Mean and stddev cannot be computed for string columns, so they are shown as NULL.

+-------+------------------+-----+-----------------+
|summary|               Id | Name|              Age|
+-------+------------------+-----+-----------------+
|  count|                 4|    4|                4|
|   mean|               2.5| NULL|            22.75|
| stddev|1.2909944487358056| NULL|0.957427107756338|
|    min|                 1|Stacy|               22|
|    max|                 4| paul|               24|
+-------+------------------+-----+-----------------+



In [31]:
# Add a column derived from an existing one. withColumn returns a new DataFrame,
# so this cell only displays the result without keeping it.
(
    df_pyspark
    .withColumn('Age after 2 years', df_pyspark['Age'] + 2)
    .show()
)

+---+-------+---+-----------------+
|Id |   Name|Age|Age after 2 years|
+---+-------+---+-----------------+
|  1|   john| 22|               24|
|  2|Stephen| 22|               24|
|  3|  Stacy| 24|               26|
|  4|   paul| 23|               25|
+---+-------+---+-----------------+



In [30]:
# The original DataFrame is not changed: the DataFrame returned by withColumn was never assigned.
df_pyspark.show()

+---+-------+---+
|Id |   Name|Age|
+---+-------+---+
|  1|   john| 22|
|  2|Stephen| 22|
|  3|  Stacy| 24|
|  4|   paul| 23|
+---+-------+---+



In [32]:
# Keep the derived column by rebinding df_pyspark to the DataFrame that withColumn returns.
df_pyspark = df_pyspark.withColumn(
    colName='Age after 2 years',
    col=df_pyspark['Age'] + 2,
)

In [33]:
# The reassigned DataFrame now includes the 'Age after 2 years' column.
df_pyspark.show()

+---+-------+---+-----------------+
|Id |   Name|Age|Age after 2 years|
+---+-------+---+-----------------+
|  1|   john| 22|               24|
|  2|Stephen| 22|               24|
|  3|  Stacy| 24|               26|
|  4|   paul| 23|               25|
+---+-------+---+-----------------+



In [34]:
# Drop a column; drop() returns a new DataFrame, so this cell only displays the result.

df_pyspark.drop('Age after 2 years').show()

+---+-------+---+
|Id |   Name|Age|
+---+-------+---+
|  1|   john| 22|
|  2|Stephen| 22|
|  3|  Stacy| 24|
|  4|   paul| 23|
+---+-------+---+



In [36]:
# df_pyspark itself still has the column, because the result of drop() was not assigned.
df_pyspark.show()

+---+-------+---+-----------------+
|Id |   Name|Age|Age after 2 years|
+---+-------+---+-----------------+
|  1|   john| 22|               24|
|  2|Stephen| 22|               24|
|  3|  Stacy| 24|               26|
|  4|   paul| 23|               25|
+---+-------+---+-----------------+



In [37]:
# Assign the result back so the column is actually removed from df_pyspark.
df_pyspark = df_pyspark.drop('Age after 2 years')

In [38]:
# Verify that the column is gone.
df_pyspark.show()

+---+-------+---+
|Id |   Name|Age|
+---+-------+---+
|  1|   john| 22|
|  2|Stephen| 22|
|  3|  Stacy| 24|
|  4|   paul| 23|
+---+-------+---+



In [41]:
# Rename a column; the result is assigned back so the new name is kept.
df_pyspark = df_pyspark.withColumnRenamed(
    existing='Name',
    new='New Name',
)

In [42]:
# Verify that 'Name' now appears as 'New Name'.
df_pyspark.show()

+---+--------+---+
|Id |New Name|Age|
+---+--------+---+
|  1|    john| 22|
|  2| Stephen| 22|
|  3|   Stacy| 24|
|  4|    paul| 23|
+---+--------+---+

