In [3]:
#Starting (or reusing) the SparkSession used by the rest of the notebook
from pyspark.sql import SparkSession

#Name the application on the builder first, then obtain the session from it
session_builder = SparkSession.builder.appName("Dataframe Practice")
spark = session_builder.getOrCreate()

#Leave the session as the last expression so the notebook echoes it
spark

In [8]:
#Reading in the dataset
#The file is read a single time and the resulting dataframe is reused for display,
#instead of reading the CSV once for .show() and a second time for the assignment.
#The 'header' option is set to 'true'; no schema inference is requested in this cell.
df_pyspark = spark.read.option('header','true').csv('test1-1.csv')
df_pyspark.show()


+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|krish| 31|        10|
|  sam| 30|         8|
| prim| 29|         1|
+-----+---+----------+



In [13]:
#Schema of the dataframe exactly as it is currently loaded
#NOTE(review): the saved output lists integer columns in BOTH schemas, which suggests
#this cell was last run after an inferSchema read -- on a fresh kernel the first
#schema would presumably show strings. Confirm with Restart & Run All.
df_pyspark.printSchema()

#Read the dataset again, this time asking Spark to infer a type per column
#so that the columns are not all read in as strings
header_reader = spark.read.option('header','true')
df_pyspark = header_reader.csv('test1-1.csv',inferSchema=True)

#Print the schema a second time to see the inferred column datatypes
df_pyspark.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [36]:
#Alternative way of reading the data: pass header and inferSchema
#directly to csv() as keyword arguments instead of chaining .option()
df_pyspark = spark.read.csv(
    'test1-1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|krish| 31|        10|
|  sam| 30|         8|
| prim| 29|         1|
+-----+---+----------+



In [16]:
#Checking the Python type of the dataframe object itself
#(this is the type of the container, not the datatypes of its columns)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [18]:
#Get the column names -- comes back as a plain Python list of strings
df_pyspark.columns


['name', 'age', 'experience']

In [19]:
#Head of the dataframe -- top 3 rows
#The result is a Python list of Row objects, not a rendered table;
#pandas would normally show a dataframe format here
df_pyspark.head(3)

[Row(name='krish', age=31, experience=10),
 Row(name='sam', age=30, experience=8),
 Row(name='prim', age=29, experience=1)]

In [23]:
#How to select a column and see the elements

#Showing all of the columns
df_pyspark.show()

#select() hands back a new dataframe rather than printing anything.
#A bare `df_pyspark.select('name')` in the middle of a cell displays nothing,
#because the notebook only echoes the last expression of a cell -- so the
#selection is stored once and reused below instead of being rebuilt three times.
name_df = df_pyspark.select('name')

#Showing only the name column plus elements
name_df.show()

#Showing the type of the selection on the 'name' column (last expression, so it is echoed)
type(name_df)



+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|krish| 31|        10|
|  sam| 30|         8|
| prim| 29|         1|
+-----+---+----------+

+-----+
| name|
+-----+
|krish|
|  sam|
| prim|
+-----+



pyspark.sql.dataframe.DataFrame

In [24]:
#Selecting multiple columns and viewing the elements

#select() also accepts a list of column names. The selection is stored once and
#reused: a bare `df_pyspark.select([...])` in the middle of a cell displays nothing,
#because the notebook only echoes the last expression of a cell.
name_experience_df = df_pyspark.select(['name','experience'])

#Showing the name and experience columns plus elements
name_experience_df.show()

#Showing the type of the selection on the 'name' and 'experience' columns
type(name_experience_df)


#Note: slice will not work here

+-----+----------+
| name|experience|
+-----+----------+
|krish|        10|
|  sam|         8|
| prim|         1|
+-----+----------+



pyspark.sql.dataframe.DataFrame

In [25]:
#Showing what the feature 'name' is when the dataframe is indexed by column name:
#the result is a Column object, not the values stored in that column
df_pyspark['name']

#Note: the show() function will not work in this instance. To get the column and view the contents, use .select()

Column<'name'>

In [26]:
#Checking the datatypes of each column -- a list of (column name, type name) pairs
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [28]:
#Noting similarities in .describe() with pandas. Basically the "summary" function in R
#describe() on its own prints nothing when it is not the last expression of the cell,
#so the summary statistics are requested once and printed directly with .show()
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| name| age|       experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|30.0|6.333333333333333|
| stddev| null| 1.0|4.725815626252609|
|    min|krish|  29|                1|
|    max|  sam|  31|               10|
+-------+-----+----+-----------------+



In [37]:
#Adding and dropping columns in the dataframe

#Adding columns
#dataframe.withColumn('new column name',what the column contains or an operation with an existing column)
#The result gets its own name so that df_pyspark itself is never rebound:
#re-running this cell always starts from the same three-column dataframe.
df_with_future_experience = df_pyspark.withColumn('experience after 2 years',df_pyspark['experience']+2)
df_with_future_experience.show()

#Dropping columns
#dataframe.drop('one','or','more','column names')
#Dropping the added column gives back the original columns; df_pyspark still holds
#exactly those, so the dropped result is only displayed and not computed a second time.
df_with_future_experience.drop('experience after 2 years').show()

+-----+---+----------+------------------------+
| name|age|experience|experience after 2 years|
+-----+---+----------+------------------------+
|krish| 31|        10|                      12|
|  sam| 30|         8|                      10|
| prim| 29|         1|                       3|
+-----+---+----------+------------------------+

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|krish| 31|        10|
|  sam| 30|         8|
| prim| 29|         1|
+-----+---+----------+

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|krish| 31|        10|
|  sam| 30|         8|
| prim| 29|         1|
+-----+---+----------+



In [39]:
#Renaming the columns

#dataframe.withColumnRenamed('existing column name','new column name')
#The renamed dataframe is only displayed here, not assigned back,
#so df_pyspark keeps its original 'name' column after this cell
df_pyspark.withColumnRenamed('name','new name').show()

+--------+---+----------+
|new name|age|experience|
+--------+---+----------+
|   krish| 31|        10|
|     sam| 30|         8|
|    prim| 29|         1|
+--------+---+----------+

