In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=bfd7dca782664ba1b26575e69f6a48a86d10bebad200b7899678466de9d83254
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, TimestampType
from pyspark.sql import functions as F

# 1. Initialize a Spark session.
#    getOrCreate() reuses an already-running session, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .appName("clean_table_specialofferproduct")
    .getOrCreate()
)

# 2. Read the raw semicolon-delimited CSV export into a DataFrame.
#    The first line is the header; column types are inferred and then pinned in step 4.
df = (
    spark.read
    .option("delimiter", ";")
    .csv(
        './raw_data/Sales.SpecialOfferProduct.csv',
        header=True,
        encoding='utf-8',
        inferSchema=True,
    )
)

# 3. Drop duplicate rows.
#    Sales.SpecialOfferProduct is a link table whose key is the pair
#    (SpecialOfferID, ProductID): the same product legitimately appears under
#    several special offers. De-duplicating on 'ProductID' alone would throw
#    those valid rows away (and keep an arbitrary one per product), so the
#    composite key is used instead.
df_without_duplicates = df.dropDuplicates(['SpecialOfferID', 'ProductID'])

# 4. Cast columns to match the desired schema.
df_with_schema = (
    df_without_duplicates
    .withColumn("SpecialOfferID", F.col("SpecialOfferID").cast(IntegerType()))
    .withColumn("ProductID", F.col("ProductID").cast(IntegerType()))
    .withColumn("rowguid", F.col("rowguid").cast(StringType()))
    .withColumn("ModifiedDate", F.col("ModifiedDate").cast(TimestampType()))
)

# Preview the cleaned DataFrame (show() prints the first 20 rows by default).
df_with_schema.show()



+--------------+---------+--------------------+-------------------+
|SpecialOfferID|ProductID|             rowguid|       ModifiedDate|
+--------------+---------+--------------------+-------------------+
|             1|      680|BB30B868-D86C-455...|2011-04-01 00:00:00|
|             1|      706|B3C9A4B1-2AE6-4CB...|2011-04-01 00:00:00|
|             1|      707|27B711FE-0B77-4EA...|2011-04-01 00:00:00|
|             1|      708|46CBB78B-246E-4D6...|2011-04-01 00:00:00|
|             1|      709|CF102AA0-055F-4D2...|2011-04-01 00:00:00|
|             1|      710|63718DA1-464B-432...|2011-04-01 00:00:00|
|             1|      711|457EB971-D1C9-48C...|2011-04-01 00:00:00|
|             1|      712|5B948448-BAE5-4F2...|2011-04-01 00:00:00|
|             1|      713|07768F40-6E46-430...|2011-04-01 00:00:00|
|             1|      714|85004BCE-C74A-4D4...|2011-04-01 00:00:00|
|             1|      715|BE469616-B279-452...|2011-04-01 00:00:00|
|             1|      716|96D141FD-1D8E-431...|2

In [17]:
# Optional debugging aid: uncomment to inspect the schema inferred for the raw DataFrame.
#df.printSchema()

In [25]:
# Persist the *cleaned* DataFrame (de-duplicated, columns cast to the target
# schema) rather than the raw `df`, which would discard the cleaning steps.
# coalesce(1) collapses the data to one partition so a single part file is
# produced; note that Spark creates a directory named
# 'table_specialofferproduct.csv' containing that part file.
df_with_schema.coalesce(1).write.csv('table_specialofferproduct.csv', header=True, mode='overwrite')

In [19]:
# Stop the Spark session created in the cleaning cell. Run this last:
# DataFrames tied to this session can no longer be evaluated once it is stopped.
spark.stop()