In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build a SparkSession for the application named "spark_df", or reuse an existing one.
spark_session = (
    SparkSession.builder
    .appName("spark_df")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/08 10:48:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/08 10:48:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/08 10:48:41 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Bare expression as the last line of the cell: the notebook displays the session object.
spark_session

In [5]:
# Load the sample CSV, treating its first line as the column names.
pii_df = (
    spark_session.read
    .option("header", "true")
    .csv("./data/pii_sample_data.csv")
)

In [6]:
# Print the DataFrame as a text table; long cell values are cut off with "..." in the output.
pii_df.show()

+---------------+------+---+--------------------+--------------------+------------+
|           Name|   Sex|Age|             Address|               Email|Phone Number|
+---------------+------+---+--------------------+--------------------+------------+
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|
|      Sarah Liu|Female| 33|432 Redwood St, C...|sarah.liu@example...|666-777-8888|
|Michael Johnson|  Male| 55|  765 Oak St, City I|michael.johnson@e...|123-98

In [7]:
# Confirm that the reader returned a PySpark DataFrame.
type(pii_df)

pyspark.sql.dataframe.DataFrame

In [8]:
# Inspect the data type of each column.
# Without schema inference every column is read as a string (see the output below).
# To get proper types, pass one extra parameter when reading the dataset:
# inferSchema=True
pii_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone Number: string (nullable = true)



In [9]:
# Re-read the CSV with schema inference enabled so column types are detected from the data.
pii_df = (
    spark_session.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("./data/pii_sample_data.csv")
)

In [10]:
pii_df.printSchema() # Now Age is imported as integer and the remaining columns as string

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone Number: string (nullable = true)



In [11]:
# Alternative: pass the reader settings as keyword arguments to csv() instead of .option()
pii_df = spark_session.read.csv(
    "./data/pii_sample_data.csv",
    inferSchema=True,
    header=True,
)

In [12]:
# Same schema as the option-based read: Age is integer, every other column is string.
pii_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone Number: string (nullable = true)



### Changing data types

In [16]:
# Cast the "Age" column from integer to string, keeping the result in a separate DataFrame
from pyspark.sql.functions import col

age_as_string = col("Age").cast("string")
df_temp = pii_df.withColumn("Age", age_as_string)

In [17]:
df_temp.printSchema() # The data type of Age is now string

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone Number: string (nullable = true)



### Select rows

In [17]:
# Display only the first n rows as a table (here n = 5)
pii_df.show(5)

+----------+------+---+--------------------+--------------------+------------+
|      Name|   Sex|Age|             Address|               Email|Phone Number|
+----------+------+---+--------------------+--------------------+------------+
|  John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
| David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
+----------+------+---+--------------------+--------------------+------------+
only showing top 5 rows



In [18]:
# head(n) returns the first n rows as a list of Row objects rather than printing a table.
pii_df.head(5)

[Row(Name='John Doe', Sex='Male', Age=35, Address='123 Main St, City A', Email='john.doe@example.com', Phone Number='123-456-7890'),
 Row(Name='Jane Smith', Sex='Female', Age=28, Address='456 Elm St, City B', Email='jane.smith@example.com', Phone Number='987-654-3210'),
 Row(Name='David Lee', Sex='Male', Age=45, Address='789 Oak St, City C', Email='david.lee@example.com', Phone Number='555-123-4567'),
 Row(Name='Mary Brown', Sex='Female', Age=22, Address='321 Birch St, City D', Email='mary.brown@example.com', Phone Number='777-888-9999'),
 Row(Name='James Wang', Sex='Male', Age=32, Address='567 Pine St, City E', Email='james.wang@example.com', Phone Number='111-222-3333')]

### Select columns

In [21]:
# Select a single column by name.
# select() returns a new DataFrame containing only that column (see the repr below).
name_df = pii_df.select("Name")
name_df

DataFrame[Name: string]

In [23]:
# Preview the first two rows of the single-column selection.
name_df.show(2)

+----------+
|      Name|
+----------+
|  John Doe|
|Jane Smith|
+----------+
only showing top 2 rows



In [34]:
# Indexing the DataFrame with a column name is a related form, but note the difference:
# it yields a Column object (see the output), not a DataFrame as select() does.
pii_df["Name"]

Column<'Name'>

In [25]:
# Select several columns at once by passing their names as separate arguments.
# NOTE: the schema spells the second column "Sex"; the lowercase "sex" still resolves here,
# and the result column keeps the casing used in this call (see the output).
name_df = pii_df.select("Name", "sex")
name_df

DataFrame[Name: string, sex: string]

In [28]:
# Preview the first two rows of the two-column selection.
name_df.show(2)

+----------+------+
|      Name|   sex|
+----------+------+
|  John Doe|  Male|
|Jane Smith|Female|
+----------+------+
only showing top 2 rows



In [31]:
# Check the data types: dtypes gives a list of (column name, type name) pairs
pii_df.dtypes

[('Name', 'string'),
 ('Sex', 'string'),
 ('Age', 'int'),
 ('Address', 'string'),
 ('Email', 'string'),
 ('Phone Number', 'string')]

### Find Statistics on Data

In [32]:
# Get summary statistics in PySpark (pandas offers a similar .describe() method).
# The result has one row each for count, mean, stddev, min and max;
# mean and stddev are NULL for the string columns (see the output).
pii_df.describe().show(5)

23/10/07 15:35:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+--------------+------+-----------------+--------------------+--------------------+------------+
|summary|          Name|   Sex|              Age|             Address|               Email|Phone Number|
+-------+--------------+------+-----------------+--------------------+--------------------+------------+
|  count|            31|    31|               31|                  31|                  31|          31|
|   mean|          NULL|  NULL|33.41935483870968|                NULL|                NULL|        NULL|
| stddev|          NULL|  NULL|7.522296606526437|                NULL|                NULL|        NULL|
|    min|Abigail Wilson|Female|               22| 123 Main St, City A|abigail.wilson@ex...|111-222-3333|
|    max|   Zoe Johnson|  Male|               55|987 Pine St, City JJ|zoe.johnson@examp...|999-777-8888|
+-------+--------------+------+-----------------+--------------------+--------------------+------------+



### Adding columns

In [38]:
# Add a derived "Age Normal" column holding Age divided by 100, then display the result.
age_scaled = pii_df["Age"] / 100
pii_df = pii_df.withColumn("Age Normal", age_scaled)
pii_df.show()

+---------------+------+---+--------------------+--------------------+------------+----------+
|           Name|   Sex|Age|             Address|               Email|Phone Number|Age Normal|
+---------------+------+---+--------------------+--------------------+------------+----------+
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|      0.35|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|      0.28|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|      0.45|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|      0.22|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|      0.32|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|       0.4|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|      0.27|
|      Sarah Liu|Female| 33|432 Redwood St, C...|s

### Dropping columns

In [42]:
# Drop a single column; the result is assigned back to pii_df so the change persists
pii_df = pii_df.drop("Age Normal")

In [44]:
# Verify that the "Age Normal" column is no longer present.
pii_df.show()

+---------------+------+---+--------------------+--------------------+------------+
|           Name|   Sex|Age|             Address|               Email|Phone Number|
+---------------+------+---+--------------------+--------------------+------------+
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|
|      Sarah Liu|Female| 33|432 Redwood St, C...|sarah.liu@example...|666-777-8888|
|Michael Johnson|  Male| 55|  765 Oak St, City I|michael.johnson@e...|123-98

In [51]:
# Drop multiple columns by listing their names as separate arguments.
# The result is only displayed, not assigned, so pii_df itself keeps both columns.
pii_df.drop("Email", "Phone Number").show(3)

+----------+------+---+-------------------+
|      Name|   Sex|Age|            Address|
+----------+------+---+-------------------+
|  John Doe|  Male| 35|123 Main St, City A|
|Jane Smith|Female| 28| 456 Elm St, City B|
| David Lee|  Male| 45| 789 Oak St, City C|
+----------+------+---+-------------------+
only showing top 3 rows



### Renaming columns

In [55]:
# Rename the "Sex" column to "Gender" (arguments: existing name, new name).
pii_df = pii_df.withColumnRenamed("Sex","Gender")

In [57]:
# Display the first 10 rows to confirm the column header now reads "Gender".
pii_df.show(10)

+---------------+------+---+--------------------+--------------------+------------+
|           Name|Gender|Age|             Address|               Email|Phone Number|
+---------------+------+---+--------------------+--------------------+------------+
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|
|      Sarah Liu|Female| 33|432 Redwood St, C...|sarah.liu@example...|666-777-8888|
|Michael Johnson|  Male| 55|  765 Oak St, City I|michael.johnson@e...|123-98