In [1]:
import pyspark 
sc = pyspark.SparkContext('local[*]')
# do something to prove it works
rdd = sc.parallelize(range(1000))
rdd.takeSample(False, 5)


[720, 21, 67, 437, 238]

In [12]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Read CSV File") \
    .getOrCreate()

# Read the CSV file into a DataFrame
df = spark.read.csv("work/clean_me_out.csv", header=True, sep=",", quote = "\"")
print(f"#df: {df.count()}")


#df: 499999


In [14]:
df.show(truncate=False)

+--------+----------------+--------+------+------------+--------------------------------+
|order_id|delivery_company|quantity|price |ordered_date|address                         |
+--------+----------------+--------+------+------------+--------------------------------+
|1       |delivery_comp_1 |1       |245,52|9-2-2022    |Cedar Lane Houston,CA 90001     |
|2       |delivery_comp_2 |2       |114,77|null        |Main Street,New York CA 60601   |
|3       |delivery_comp_3 |null    |739,43|14-3-2022   |Main Street,Chicago TX 10001    |
|4       |delivery_comp_0 |1       |878.93|20/4/2022   |Oak Avenue,Los Angeles FL 90001 |
|5       |delivery_comp_1 |2       |481,44|null        |Maple Drive Chicago,FL 60601    |
|6       |delivery_comp_2 |#NA     |78,13 |null        |Main Street,Houston NY 77001    |
|7       |delivery_comp_3 |1       |832.17|20-2-2022   |Oak Avenue New York,CA 10001    |
|8       |delivery_comp_0 |2       |687,8 |1/4/2022    |Maple Drive,Los Angeles,CA,10001|
|9       |

In [18]:
def unify_price(price):
    # replace comma with a dot
    return float(price.replace(",","."))


    

In [107]:
def unify_date(date):
    if date == 'null':
        return None
    #replace slahs with dash
    words = date.replace("/","-").split("-")
    #add leading zeroes
    return f"{('0' + words[0])[-2:]}-{('0'+ words[1])[-2:]}-{words[2]}"
    


In [113]:
def unify_address(address):
    """
        traverse from end to begging of a string to get zipcode, state, city, street
    """
    words = address.split(",")
    if words[-1].isdigit():
        zipCode = words[-1]
        words.remove(words[-1])
    else:
        zwords = words[-1].split(" ")
        zipCode = zwords[-1]
        words.remove(words[-1])
        zwords.remove(zwords[-1])
        words.append(" ".join(zwords))

    if len(words[-1]) == 2:
        state = words[-1]
        words.remove(words[-1])
    else:
        state = words[-1][-2:]
        words[-1] = words[-1][:-2].strip()

    if len(words) == 2:
        city = words[-1]
        street = words[0]
    else:
        city = words[0].split(' ')[-1]
        street = " ".join(words[0].split(' ')[:-1])
    return f"{street}, {city}, {state}, {zipCode}"
    
#print(unify_address('Maple Drive,Miami,IL 77001'))
#print(unify_address('Elm Street,Miami TX 90001'))
#print(unify_address('Cedar Lane Houston,CA 90001'))
#print(unify_address('Main Street,New York CA 60601'))

In [108]:
from pyspark.sql.functions import udf
udf_unify_price = udf(unify_price)
udf_unify_date = udf(unify_date)
udf_unify_address = udf(unify_address)

In [112]:
df.select(udf_unify_price('price'), udf_unify_date('ordered_date'), udf_unify_address('address')).show(50, truncate = False)

+------------------+------------------------+-----------------------------------+
|unify_price(price)|unify_date(ordered_date)|unify_address(address)             |
+------------------+------------------------+-----------------------------------+
|245.52            |09-02-2022              |Cedar Lane, Houston, CA, 90001     |
|114.77            |null                    |Main Street, New York, CA, 60601   |
|739.43            |14-03-2022              |Main Street, Chicago, TX, 10001    |
|878.93            |20-04-2022              |Oak Avenue, Los Angeles, FL, 90001 |
|481.44            |null                    |Maple Drive, Chicago, FL, 60601    |
|78.13             |null                    |Main Street, Houston, NY, 77001    |
|832.17            |20-02-2022              |Oak Avenue New, York, CA, 10001    |
|687.8             |01-04-2022              |Maple Drive, Los Angeles, CA, 10001|
|338.44            |13-04-2022              |Cedar Lane, Miami, NY, 77001       |
|461.33         