In [1]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import input_file_name
import re

In [2]:
# Echo the session object so the notebook shows which Spark session is in use.
# NOTE(review): `spark` is never created in this notebook — presumably the
# kernel pre-defines a SparkSession; confirm before running elsewhere.
spark

In [2]:
%%time
# Load every CSV under gs://lz-prices-csv/ into one DataFrame: the first row is
# the header, fields are ';'-separated, and Spark infers the column types.
sdf_prices = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ";")
    .csv("gs://lz-prices-csv/*.csv")
)

                                                                                

CPU times: user 56.1 ms, sys: 6.99 ms, total: 63.1 ms
Wall time: 42.2 s


In [3]:
# Define a UDF to extract the date from the filename
def extract_date_from_filename(filename):
    """Extract the first YYYYMMDD-shaped digit run from a file name.

    Args:
        filename: File name or full path to scan. ``None`` is tolerated so a
            null value reaching the UDF does not raise and abort the job.

    Returns:
        A ``"YYYY-MM-DD"`` string built from the first eight consecutive
        digits found in ``filename``, or ``None`` when ``filename`` is
        ``None`` or contains no such run. The digits are not validated as a
        real calendar date.
    """
    # re.search raises TypeError on None; a missing path simply has no date.
    if filename is None:
        return None

    match = re.search(r'(\d{4})(\d{2})(\d{2})', filename)
    if match is None:
        return None

    # The three captured groups are year, month and day, in that order.
    return "-".join(match.groups())

# Wrap the Python helper as a Spark UDF whose result column is a string
extract_date_udf = udf(extract_date_from_filename, returnType=StringType())

# Add a "date" column computed by applying the UDF to input_file_name()
sdf_prices = sdf_prices.withColumn(
    colName="date",
    col=extract_date_udf(input_file_name()),
)

In [53]:
# Preview each product name next to the date parsed from its source file name.
# NOTE(review): toPandas() collects every selected row to the driver; fine for
# a spot check, but consider limiting the frame first.
sdf_prices.select("name", "date").toPandas()

                                                                                

Unnamed: 0,name,date
0,uCLEAN ADL-1420 EHP,2023-06-24
1,siaspeed1950,2023-06-24
2,metaBOX 215,2023-06-24
3,metaBOX 118 Organizer,2023-06-24
4,iPulse H-1635,2023-06-24
...,...,...
45475,3M 51370 737U Hookit disc P120 150 mm,2023-07-17
45476,3M 31651 Cubitron 150mm P120 - 50 stuks,2023-07-17
45477,"19-TLG. HSS-CO SPIRALBOHRERSET, PROBOX",2023-07-17
45478,1230 M AFC EU,2023-07-17


In [4]:
# Columns projected from sdf_prices by the select() cells that follow.
columns = [
    "brand",
    "name",
    "date",
    "Median (Price)",
    "contorion",
    "werkzeugstore24",
    "Min shop",
]

In [49]:
# Print the column names and types of the loaded frame.
sdf_prices.printSchema()

root
 |--  key: string (nullable = true)
 |-- EAN: string (nullable = true)
 |-- name: string (nullable = true)
 |-- group: string (nullable = true)
 |-- category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- PI product link: string (nullable = true)
 |-- Median (Price): string (nullable = true)
 |-- cheapest shop name: string (nullable = true)
 |-- cheapest shop shipping time: string (nullable = true)
 |-- cheapest shop availability: string (nullable = true)
 |-- info: string (nullable = true)
 |-- total number of offers: string (nullable = true)
 |-- price difference: string (nullable = true)
 |-- enableMarketplacesColumn: string (nullable = true)
 |-- google.de Name: string (nullable = true)
 |-- google.de Url: string (nullable = true)
 |-- idealo.de Name: string (nullable = true)
 |-- idealo.de Url: string (nullable = true)
 |-- amazon.de Name: string (nullable = true)
 |-- amazon.de Url: string (nullable = true)
 |-- kaufland.de Name: string (nullable = true)

In [61]:
# Show only the columns listed in `columns`, collected to the driver as pandas.
# NOTE(review): the saved output below has a "contorion Delta" column that is
# not in `columns` — the output looks stale; re-run to confirm.
sdf_prices.select(columns).toPandas()

                                                                                

Unnamed: 0,brand,name,date,Median (Price),contorion Delta
0,Starmix,uCLEAN ADL-1420 EHP,2023-06-24,36926,
1,SIA,siaspeed1950,2023-06-24,6234,
2,Metabo,metaBOX 215,2023-06-24,4153,
3,Metabo,metaBOX 118 Organizer,2023-06-24,5942,
4,Starmix,iPulse H-1635,2023-06-24,105099,
...,...,...,...,...,...
45475,3M,3M 51370 737U Hookit disc P120 150 mm,2023-07-17,,
45476,3M,3M 31651 Cubitron 150mm P120 - 50 stuks,2023-07-17,4479,
45477,Bosch,"19-TLG. HSS-CO SPIRALBOHRERSET, PROBOX",2023-07-17,3767,
45478,Mirka,1230 M AFC EU,2023-07-17,120580,


In [7]:
# Project the columns of interest and keep only rows whose brand is "Festool".
sdf_prices_filtered = (
    sdf_prices
    .select(columns)
    .where(sdf_prices["brand"] == "Festool")
)

In [None]:
# Collect the Festool-only subset to the driver as a pandas frame for inspection.
sdf_prices_filtered.toPandas()

In [11]:
# Destination for the filtered export.
# NOTE(review): Spark's CSV writer treats this path as an output directory of
# part files, despite the ".csv" suffix — confirm downstream readers expect that.
output_path = "gs://lz-gcs/prices/prices_2.csv"

# Collapse to a single partition before writing.
sdf_prices_filtered = sdf_prices_filtered.coalesce(1)

# Semicolon-delimited, with a header row, replacing any previous export.
(
    sdf_prices_filtered.write
    .mode("overwrite")
    .option("header", True)
    .option("sep", ";")
    .csv(output_path)
)

                                                                                