**Initial import & configuration**

In [None]:
pip install pyarrow

In [23]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pyarrow as pa
import pyarrow.parquet as pq

In [24]:
# Get the Spark session for this notebook (getOrCreate reuses an active one if present).
session_builder = SparkSession.builder.appName("PysparkCustomerSegmentation")
spark = session_builder.getOrCreate()

**JSON configuration**

In [None]:
# Read the JSON configuration file; "multiline" is enabled so a single JSON
# document may span several lines of the file.
config_reader = spark.read.option("multiline", "true")
config = config_reader.json("C:/Your/Configuration/Path/config_file.json")
config.printSchema()

In [None]:
# Explode the "sources" column so each entry gets its own row, then promote
# the fields of each entry to top-level columns.
exploded_sources = config.withColumn("sources", explode(col("sources")))
source_config = exploded_sources.select("sources.*")
source_config.show()

# From here on `source_config` is a list of Row objects collected to the driver
# (this is the form load_data() iterates over).
source_config = source_config.collect()

**Ingest data from CSV files using the JSON configuration**

In [27]:
# Lookup from the type names used in the JSON configuration to Spark SQL type classes.
_SPARK_TYPES = {
    "integer": IntegerType,
    "string": StringType,
    "date": DateType,
    "timestamp": TimestampType,
    "float": FloatType,
    "boolean": BooleanType,
}


def _build_schema(config_schema):
    """Build a StructType with one nullable field per entry of ``config_schema['fields']``."""
    return StructType([
        # Type names that are not in the lookup fall back to StringType.
        StructField(field['name'], _SPARK_TYPES.get(field['type'], StringType)(), True)
        for field in config_schema['fields']
    ])


def load_data():
    """Load every source listed in ``source_config`` into a Spark DataFrame.

    For each entry of the module-level ``source_config`` list, the CSV file at
    ``source['path']`` is read (header row, ';' delimiter) with a schema built
    from ``source['schema']``. The resulting DataFrame is published as the
    global variable ``df_<table_name>``, which is what the following cells use.

    Returns:
        dict: the DataFrames loaded by this call, keyed by their global
        variable name (``df_<table_name>``).
    """
    loaded = {}

    # Iterate over the entries directly instead of indexing with range(len(...)).
    for source in source_config:
        df_name = f"df_{source['table_name']}"

        # .csv() already selects the CSV format, so no separate .format("csv") is needed.
        frame = (
            spark.read
            .option("header", "true")
            .option("delimiter", ';')
            .schema(_build_schema(source['schema']))
            .csv(source['path'])
        )

        # Publish immediately so frames loaded before a failing source remain available.
        globals()[df_name] = frame
        loaded[df_name] = frame

    return loaded

In [None]:
# Populate the df_* globals, then preview each loaded table.
load_data()

for loaded_frame in (df_DimCustomer, df_SalesOnline, df_vSales):
    loaded_frame.show()

**Data transformation & aggregations for customer segmentation**

In [None]:
# NOTE: round / sum / count / min / max below are the Spark column functions
# brought in by `from pyspark.sql.functions import *`, not the Python builtins.

# OrderDate is dropped from df_vSales before the join, so the OrderDate used
# in the aggregation below is the one from df_SalesOnline.
df_vSales = df_vSales.drop(col("OrderDate"))

# Online sales enriched with df_vSales columns, excluding unregistered purchases.
registered_sales = (
    df_SalesOnline
    .join(df_vSales, on="OrderID", how="left")
    .filter(col("Customer") != 'purchase without registration')
)

# One row per customer: total spend, item count, order count, first/last order date.
customer_totals = registered_sales.groupby("CustomerID").agg(
    round(sum(col("SalesAmount")), 2).alias("TotalSpends"),
    sum(col("PositionCount")).alias("PurchasedItems"),
    count(col("OrderID")).alias("OrdersCount"),
    min(col("OrderDate")).alias("FirstOrder"),
    max(col("OrderDate")).alias("LastOrder"),
)

# Per-order averages derived from the totals above.
df = (
    customer_totals
    .withColumn("AverageBasketSize", round(col("PurchasedItems") / col("OrdersCount"), 2))
    .withColumn("AverageBasketValue", round(col("TotalSpends") / col("OrdersCount"), 2))
)
df.show()

In [None]:
# Enrich the customer dimension with City, Age and three segment columns, then
# join it with the per-customer sales aggregates held in `df`.
df_DimCustomer = df_DimCustomer.drop("DeliveryAddress")

# Reusable predicates; they are resolved when the columns are added below.
is_male = col("Gender") == "M"
is_female = col("Gender") == "F"
is_married = col("MartialStatus") == "married"
not_married = col("MartialStatus") != "married"
has_kids = col("Kids") > 0
no_kids = col("Kids") == 0
age = col("Age")

# Age brackets per gender: <26, 26-34, 35-49, 50+.
# The last bracket uses >= 50: Age is rounded to a whole number, so a customer
# aged exactly 50 would otherwise match no branch and get a null segment.
# Rows matching no branch (e.g. other Gender values) stay null.
age_sex_segment = (
    when(is_male & (age < 26), "M18")
    .when(is_male & (age < 35), "M26")
    .when(is_male & (age < 50), "M35")
    .when(is_male & (age >= 50), "M50")
    .when(is_female & (age < 26), "F18")
    .when(is_female & (age < 35), "F26")
    .when(is_female & (age < 50), "F35")
    .when(is_female & (age >= 50), "F50")
)

# Branch order matters: the "with kids" cases are tested before the plain
# unmarried case, which therefore only catches customers not matched earlier.
family_segment = (
    when(is_male & is_married & has_kids, "head of the family")
    .when(is_male & is_married & no_kids, "husband")
    .when(is_male & not_married & has_kids, "father")
    .when(is_male & not_married, "single")
    .when(is_female & is_married & has_kids, "mother&wife")
    .when(is_female & is_married & no_kids, "wife")
    .when(is_female & not_married & has_kids, "single mother")
    .when(is_female & not_married, "single")
)

# Anything outside the two city lists is classified as "small town/village".
demographic_segment = (
    when(col("City").isin(["Warszawa", "Kraków", "Poznań", "Wrocław", "Łódź", "Gdańsk"]), "big city")
    .when(col("City").isin(["Katowice", "Szczecin", "Bydgoszcz", "Częstochowa", "Lublin", "Białystok"]), "city")
    .otherwise("small town/village")
)

df_DimCustomer = (
    df_DimCustomer
    # City is the text after the last space of the correspondence address.
    .withColumn("City", substring_index("CorrespondenceAddress", " ", -1))
    # NOTE(review): this rounds to the nearest year rather than flooring, so a
    # customer aged 25.6 is treated as 26 — confirm this is intended.
    .withColumn("Age", round(datediff(current_date(), 'BirthDate') / 365.25, 0))
    .withColumn("AgeSexSegment", age_sex_segment)
    .withColumn("FamilySegment", family_segment)
    .withColumn("DemographicSegment", demographic_segment)
)

# NOTE: `df` is overwritten with the joined result, so re-running this cell
# without first re-running the aggregation cell joins already-joined data.
df = df_DimCustomer.join(df, on="CustomerID", how="inner")

df.show()

**Save results to parquet file**

In [None]:
# Convert the Spark result to pandas, wrap it in an Arrow table and write it
# out as a single Parquet file with pyarrow.
output_path = "C:/Your/Result/Path/customer_segmentation.parquet"

pandas_df = df.toPandas()
table = pa.Table.from_pandas(pandas_df)
pq.write_table(table, output_path)