### Read Data 

In [1]:
import pyspark

In [47]:
import pandas as pd

# Load the sample CSV with pandas so it can be compared with the Spark DataFrame later on.
pd_df = pd.read_csv(filepath_or_buffer="data/pii_sample_data.csv")
pd_df.head(n=5)  # preview the first five rows

Unnamed: 0,Name,Sex,Age,Address,Email,Phone Number
0,John Doe,Male,35,"123 Main St, City A",john.doe@example.com,123-456-7890
1,Jane Smith,Female,28,"456 Elm St, City B",jane.smith@example.com,987-654-3210
2,David Lee,Male,45,"789 Oak St, City C",david.lee@example.com,555-123-4567
3,Mary Brown,Female,22,"321 Birch St, City D",mary.brown@example.com,777-888-9999
4,James Wang,Male,32,"567 Pine St, City E",james.wang@example.com,111-222-3333


In [9]:
# import spark session
from pyspark.sql import SparkSession

In [13]:
# Build a SparkSession for the application named "big_data";
# getOrCreate() reuses an already-running session instead of starting a second one.
spark_session = (
    SparkSession.builder
    .appName("big_data")
    .getOrCreate()
)

In [14]:
# The Spark version used here is 3.5.0 (per the original author's note).
# Running locally, so there is only a single master node.
# Leaving the session as the cell's last expression makes the notebook render it.
spark_session

In [36]:
# Read the CSV file without treating the first line as a header.
# Spark then assigns generic column names (_c0, _c1, ...) and keeps the
# header line as an ordinary data row.
pii_df = spark_session.read.csv("data/pii_sample_data.csv", header=False)

In [37]:
# Print up to the first 20 rows; note the _c0.._c5 column names and the
# header line showing up as the first data row.
pii_df.show(n=20)

+---------------+------+---+--------------------+--------------------+------------+
|            _c0|   _c1|_c2|                 _c3|                 _c4|         _c5|
+---------------+------+---+--------------------+--------------------+------------+
|           Name|   Sex|Age|             Address|               Email|Phone Number|
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|
|      Sarah Liu|Female| 33|432 Redwood St, C...|sarah.liu@example...|666-77

In [38]:
# Read the same CSV again, this time using its first line as the column names.
# No schema inference is requested, so every column is loaded as a string.
spark_df = spark_session.read.csv("data/pii_sample_data.csv", header=True)

### Display Data

In [48]:
# show() prints at most the first 20 rows by default, so larger DataFrames are cut off there.
# Long cell values are truncated with "..." in the printed table.
# It plays a role similar to the .head() method in pandas.
spark_df.show(n=20)

+---------------+------+---+--------------------+--------------------+------------+
|           Name|   Sex|Age|             Address|               Email|Phone Number|
+---------------+------+---+--------------------+--------------------+------------+
|       John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|     Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
|      David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|     Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|     James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
|      Lisa Chen|Female| 40|654 Cedar St, City F|lisa.chen@example...|999-777-8888|
|       Mark Kim|  Male| 27|890 Willow St, Ci...|mark.kim@example.com|333-444-5555|
|      Sarah Liu|Female| 33|432 Redwood St, C...|sarah.liu@example...|666-777-8888|
|Michael Johnson|  Male| 55|  765 Oak St, City I|michael.johnson@e...|123-98

In [49]:
# Limit the printed table to the first 5 rows.
spark_df.show(n=5)

+----------+------+---+--------------------+--------------------+------------+
|      Name|   Sex|Age|             Address|               Email|Phone Number|
+----------+------+---+--------------------+--------------------+------------+
|  John Doe|  Male| 35| 123 Main St, City A|john.doe@example.com|123-456-7890|
|Jane Smith|Female| 28|  456 Elm St, City B|jane.smith@exampl...|987-654-3210|
| David Lee|  Male| 45|  789 Oak St, City C|david.lee@example...|555-123-4567|
|Mary Brown|Female| 22|321 Birch St, City D|mary.brown@exampl...|777-888-9999|
|James Wang|  Male| 32| 567 Pine St, City E|james.wang@exampl...|111-222-3333|
+----------+------+---+--------------------+--------------------+------------+
only showing top 5 rows



In [41]:
# Fetch the first 5 rows; unlike show(), this returns a list of Row objects.
spark_df.head(n=5)

[Row(Name='John Doe', Sex='Male', Age='35', Address='123 Main St, City A', Email='john.doe@example.com', Phone Number='123-456-7890'),
 Row(Name='Jane Smith', Sex='Female', Age='28', Address='456 Elm St, City B', Email='jane.smith@example.com', Phone Number='987-654-3210'),
 Row(Name='David Lee', Sex='Male', Age='45', Address='789 Oak St, City C', Email='david.lee@example.com', Phone Number='555-123-4567'),
 Row(Name='Mary Brown', Sex='Female', Age='22', Address='321 Birch St, City D', Email='mary.brown@example.com', Phone Number='777-888-9999'),
 Row(Name='James Wang', Sex='Male', Age='32', Address='567 Pine St, City E', Email='james.wang@example.com', Phone Number='111-222-3333')]

In [53]:
# Fetch the last 5 rows as a list of Row objects.
spark_df.tail(num=5)

[Row(Name='Charlotte Turner', Sex='Female', Age='27', Address='987 Pine St, City JJ', Email='charlotte.turner@example.com', Phone Number='333-666-7777'),
 Row(Name='Michael Walker', Sex='Male', Age='40', Address='876 Redwood St, City KK', Email='michael.walker@example.com', Phone Number='777-444-5555'),
 Row(Name='Harper King', Sex='Female', Age='24', Address='543 Elm St, City LL', Email='harper.king@example.com', Phone Number='666-555-4444'),
 Row(Name='Elijah Adams', Sex='Male', Age='38', Address='234 Cedar St, City MM', Email='elijah.adams@example.com', Phone Number='555-444-3333'),
 Row(Name='Lily Hall', Sex='Female', Age='33', Address='432 Pine St, City NN', Email='lily.hall@example.com', Phone Number='333-555-6666')]

### Data types

In [43]:
# Check the Python type of the pandas-loaded data (a pandas DataFrame).
type(pd_df)

pandas.core.frame.DataFrame

In [42]:
# Check the Python type of the Spark-loaded data: it is a
# pyspark.sql DataFrame, a different class from the pandas DataFrame.
type(spark_df)

pyspark.sql.dataframe.DataFrame

In [44]:
# Summarise the pandas DataFrame: row count, per-column non-null counts,
# dtypes and approximate memory usage.
pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          31 non-null     object
 1   Sex           31 non-null     object
 2   Age           31 non-null     int64 
 3   Address       31 non-null     object
 4   Email         31 non-null     object
 5   Phone Number  31 non-null     object
dtypes: int64(1), object(5)
memory usage: 1.6+ KB


In [45]:
# Print the Spark DataFrame's schema: each column's name, type and nullability.
# Note: every column, including Age, is reported as string here, because the
# read that created spark_df only set the header option and did not request
# schema inference.
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone Number: string (nullable = true)

