In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=4aa9639b374aec1728da0542b1440868995a6008381a9e24022b7e21c0a24cb2
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import col, regexp_replace, trim

# Start (or reuse) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("data_clean_customers")
    .getOrCreate()
)

# Explicit column layout, chosen so that it matches the BigQuery table loaded later.
# NOTE: Please adjust the column names and types according to your CSV's columns.
# NOTE(review): CustomerID is declared non-nullable, yet the schema printed
# after the read reports every column as nullable -- the flag does not appear
# to be enforced by the CSV reader; confirm before relying on it.
schema = (
    StructType()
    .add("CustomerID", IntegerType(), nullable=False)
    .add("PersonID", StringType(), nullable=True)
    .add("StoreID", FloatType(), nullable=True)
    .add("TerritoryID", IntegerType(), nullable=True)
    .add("AccountNumber", StringType(), nullable=True)
    .add("rowguid", StringType(), nullable=True)
    .add("ModifiedDate", TimestampType(), nullable=True)
)

# Load the semicolon-separated export (first line is the header) with that schema.
df = (
    spark.read
    .schema(schema)
    .options(sep=';', encoding='utf-8', header=True)
    .csv('Sales.Customer.csv')
)
df.printSchema()



root
 |-- CustomerID: integer (nullable = true)
 |-- PersonID: string (nullable = true)
 |-- StoreID: float (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [54]:
# Remove leading and trailing white space from all column names.
# toDF renames positionally, so the existing names never have to be resolved
# through col().
df = df.toDF(*[name.strip() for name in df.columns])

# StoreID is read as a float (see the schema), so cast it straight to an
# integer.  Calling trim() on it first implicitly converts the float to text
# and the integer cast then has to re-parse that text; Spark renders large
# floats in scientific notation (e.g. "1.0E7"), which the string -> integer
# cast turns into null, so the detour through a string is avoided.
# Rows that have no StoreID are then dropped.
df = (
    df
    .withColumn('StoreID', col('StoreID').cast(IntegerType()))
    .filter(col('StoreID').isNotNull())
)



In [55]:

# Blank out string cells whose entire content is a textual NULL marker
# (any letter case, optionally padded with white space).
#
# * The pattern is anchored to the whole cell, so a value that merely contains
#   the letters "null" somewhere inside it is left untouched.
# * Only string columns are rewritten: the other columns cannot hold the text
#   "NULL", and pushing them through regexp_replace would only cast them to a
#   string and back again.
# A single select keeps the original column order and types.
df = df.select([
    regexp_replace(col(field.name), r"(?i)^\s*null\s*$", "").alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df.schema.fields
])

# Show top rows and confirm the column types are unchanged
df.show()
df.printSchema()



+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|CustomerID|PersonID|StoreID|TerritoryID|AccountNumber|             rowguid|        ModifiedDate|
+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|         1|        |    934|          1|   AW00000001|3F5AE95E-B87D-4AE...|2014-09-12 11:15:...|
|         2|        |   1028|          1|   AW00000002|E552F657-A9AF-4A7...|2014-09-12 11:15:...|
|         3|        |    642|          4|   AW00000003|130774B1-DB21-4EF...|2014-09-12 11:15:...|
|         4|        |    932|          4|   AW00000004|FF862851-1DAA-404...|2014-09-12 11:15:...|
|         5|        |   1026|          4|   AW00000005|83905BDC-6F5E-4F7...|2014-09-12 11:15:...|
|         6|        |    644|          4|   AW00000006|1A92DF88-BFA2-467...|2014-09-12 11:15:...|
|         7|        |    930|          1|   AW00000007|03E9273E-B193-448...|2014-09-12 11:15:...|
|         8|        

In [57]:
# Preview the cleaned rows once more before exporting them.
df.show()

# Export the DataFrame as CSV with a header line, replacing any earlier export
# at the same path.
# NOTE(review): Spark normally writes a directory of part files at this path
# rather than one single .csv file -- confirm the downstream load expects that.
(
    df.write
    .mode('overwrite')
    .option('header', True)
    .csv('table_customer.csv')
)



+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|CustomerID|PersonID|StoreID|TerritoryID|AccountNumber|             rowguid|        ModifiedDate|
+----------+--------+-------+-----------+-------------+--------------------+--------------------+
|         1|        |    934|          1|   AW00000001|3F5AE95E-B87D-4AE...|2014-09-12 11:15:...|
|         2|        |   1028|          1|   AW00000002|E552F657-A9AF-4A7...|2014-09-12 11:15:...|
|         3|        |    642|          4|   AW00000003|130774B1-DB21-4EF...|2014-09-12 11:15:...|
|         4|        |    932|          4|   AW00000004|FF862851-1DAA-404...|2014-09-12 11:15:...|
|         5|        |   1026|          4|   AW00000005|83905BDC-6F5E-4F7...|2014-09-12 11:15:...|
|         6|        |    644|          4|   AW00000006|1A92DF88-BFA2-467...|2014-09-12 11:15:...|
|         7|        |    930|          1|   AW00000007|03E9273E-B193-448...|2014-09-12 11:15:...|
|         8|        

In [51]:
# Print each column's name, type and nullability for the current DataFrame.
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- PersonID: string (nullable = true)
 |-- StoreID: integer (nullable = true)
 |-- TerritoryID: integer (nullable = true)
 |-- AccountNumber: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [52]:
# Close the Spark session created at the top of the notebook.
# Run this last: `spark` is stopped once this call returns.
spark.stop()