In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=019995b6ca84b02e82fc59a8f04ec5123c0ef0a35471f3e15862238a505e3a58
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [40]:
# Start a Spark session named "table_products", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("table_products")
    .getOrCreate()
)

# Explicit schema for Production.Product.csv.
#
# StandardCost and ListPrice are deliberately read as strings. A later cell
# swaps "," for "." in these two columns, i.e. the file stores them with a
# decimal comma. Spark's CSV reader cannot parse such text as FloatType: in the
# default PERMISSIVE mode the values would be read as null, and the comma
# replacement would then have nothing left to fix. Reading them as text keeps
# the raw values so they can be cleaned first and cast to double afterwards.
#
# NOTE(review): Weight is still FloatType. If the file also uses a decimal comma
# for Weight it needs the same string-then-cast treatment -- confirm against the data.
schema = StructType([
    StructField("ProductID", IntegerType()),
    StructField("Name", StringType()),
    StructField("ProductNumber", StringType()),
    StructField("MakeFlag", IntegerType()),
    StructField("FinishedGoodsFlag", IntegerType()),
    StructField("Color", StringType()),
    StructField("SafetyStockLevel", IntegerType()),
    StructField("ReorderPoint", IntegerType()),
    StructField("StandardCost", StringType()),  # decimal comma; cleaned and cast later
    StructField("ListPrice", StringType()),     # decimal comma; cleaned and cast later
    StructField("Size", StringType()),
    StructField("SizeUnitMeasureCode", StringType()),
    StructField("WeightUnitMeasureCode", StringType()),
    StructField("Weight", FloatType()),
    StructField("DaysToManufacture", IntegerType()),
    StructField("ProductLine", StringType()),
    StructField("Class", StringType()),
    StructField("Style", StringType()),
    StructField("ProductSubcategoryID", IntegerType()),
    StructField("ProductModelID", IntegerType()),
    StructField("SellStartDate", TimestampType()),
    StructField("SellEndDate", TimestampType()),
    StructField("DiscontinuedDate", TimestampType()),
    StructField("rowguid", StringType()),
    StructField("ModifiedDate", TimestampType())
])


In [41]:
# Load the semicolon-delimited, UTF-8 encoded product file with the explicit
# schema defined above; the first line of the file is treated as a header row.
df = (
    spark.read
    .schema(schema)
    .option("sep", ";")
    .option("encoding", "utf-8")
    .option("header", True)
    .csv("Production.Product.csv")
)

In [42]:
# Blank out cells whose value is exactly the text "NULL" or "null".
# Other spellings (e.g. "Null") are not matched and are left as they are.
df = df.replace(["NULL", "null"], "")

In [43]:
# Turn the decimal comma into a decimal point in the two price columns.
# regexp_replace yields string columns; a later cell casts them to a numeric type.
for price_column in ("StandardCost", "ListPrice"):
    df = df.withColumn(price_column, regexp_replace(col(price_column), ",", "."))

df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- MakeFlag: integer (nullable = true)
 |-- FinishedGoodsFlag: integer (nullable = true)
 |-- Color: string (nullable = true)
 |-- SafetyStockLevel: integer (nullable = true)
 |-- ReorderPoint: integer (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- SizeUnitMeasureCode: string (nullable = true)
 |-- WeightUnitMeasureCode: string (nullable = true)
 |-- Weight: float (nullable = true)
 |-- DaysToManufacture: integer (nullable = true)
 |-- ProductLine: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- ProductSubcategoryID: integer (nullable = true)
 |-- ProductModelID: integer (nullable = true)
 |-- SellStartDate: timestamp (nullable = true)
 |-- SellEndDate: timestamp (nullable = true)
 |-- DiscontinuedDate: time

In [44]:
# 3. Convert the cleaned price columns from string to double
for price_column in ("StandardCost", "ListPrice"):
    df = df.withColumn(price_column, col(price_column).cast("double"))

df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- ProductNumber: string (nullable = true)
 |-- MakeFlag: integer (nullable = true)
 |-- FinishedGoodsFlag: integer (nullable = true)
 |-- Color: string (nullable = true)
 |-- SafetyStockLevel: integer (nullable = true)
 |-- ReorderPoint: integer (nullable = true)
 |-- StandardCost: double (nullable = true)
 |-- ListPrice: double (nullable = true)
 |-- Size: string (nullable = true)
 |-- SizeUnitMeasureCode: string (nullable = true)
 |-- WeightUnitMeasureCode: string (nullable = true)
 |-- Weight: float (nullable = true)
 |-- DaysToManufacture: integer (nullable = true)
 |-- ProductLine: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- ProductSubcategoryID: integer (nullable = true)
 |-- ProductModelID: integer (nullable = true)
 |-- SellStartDate: timestamp (nullable = true)
 |-- SellEndDate: timestamp (nullable = true)
 |-- DiscontinuedDate: time

In [45]:
# Persist the cleaned table as CSV with a header row, replacing any previous
# output at the same path. Note: Spark writes a directory of part files at this
# path rather than a single CSV file.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("table_product.csv")
)

In [46]:
# Shut down the Spark session now that the output has been written.
spark.stop()