In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=9123503b3eab89440adee30b9aca9f59df12dce87242bafb08cd4a9135914679
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [41]:
#spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [42]:
spark = SparkSession.builder.appName("data_clean_person").getOrCreate()

In [43]:
# Define the schema so it matches with the Big Query later
schema = StructType([
    StructField("BusinessEntityID", IntegerType(), nullable=False),
    StructField("PersonType", StringType(), nullable=True),
    StructField("NameStyle", IntegerType(), nullable=True),
    StructField("Title", StringType(), nullable=True),
    StructField("FirstName", StringType(), nullable=True),
    StructField("MiddleName", StringType(), nullable=True),
    StructField("LastName", StringType(), nullable=True),
    StructField("Suffix", StringType(), nullable=True),
    StructField("EmailPromotion", IntegerType(), nullable=True),
    StructField("AdditionalContactInfo", StringType(), nullable=True),
    StructField("Demographics", StringType(), nullable=True),
    StructField("rowguid", StringType(), nullable=True),
    StructField("ModifiedDate", TimestampType(), nullable=True)
])

# Read the CSV file using the defined schema
df = spark.read.csv('./raw_data/Person.Person.csv', sep=';', encoding='utf-8', header=True, schema=schema)

In [44]:
# Defining a UDF (User Defined Function)
def clean_non_ascii(value):
    if value is None:
        return None
    return value.encode('ascii', 'ignore').decode('ascii')

clean_udf = udf(clean_non_ascii, StringType())

In [45]:
# Apply the UDF to all string columns of the dataframe
for col_name in df.columns:
    if isinstance(df.schema[col_name].dataType, StringType):
        df = df.withColumn(col_name, clean_udf(col_name))

df.show()

+----------------+----------+---------+-----+---------+----------+----------+------+--------------+---------------------+--------------------+--------------------+-------------------+
|BusinessEntityID|PersonType|NameStyle|Title|FirstName|MiddleName|  LastName|Suffix|EmailPromotion|AdditionalContactInfo|        Demographics|             rowguid|       ModifiedDate|
+----------------+----------+---------+-----+---------+----------+----------+------+--------------+---------------------+--------------------+--------------------+-------------------+
|               1|        EM|        0| NULL|      Ken|         J|    Snchez|  NULL|             0|                 NULL|"<IndividualSurve...|92C4279F-1207-48A...|2009-01-07 00:00:00|
|               2|        EM|        0| NULL|    Terri|       Lee|     Duffy|  NULL|             1|                 NULL|"<IndividualSurve...|D8763459-8AA8-47C...|2008-01-24 00:00:00|
|               3|        EM|        0| NULL|  Roberto|      NULL|Tamburello|  N

In [46]:
df.printSchema()

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- PersonType: string (nullable = true)
 |-- NameStyle: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- MiddleName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Suffix: string (nullable = true)
 |-- EmailPromotion: integer (nullable = true)
 |-- AdditionalContactInfo: string (nullable = true)
 |-- Demographics: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [47]:
# Writing the transformed dataframe to a new CSV file
df.coalesce(1).write.csv('table_person.csv', header=True, mode='overwrite')

In [49]:
# Stopping the Spark session
spark.stop()