In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Spark Classification")
    .getOrCreate()
)

In [2]:
# Load the semicolon-delimited transactions file: the first row supplies the
# column names and the column types are inferred from the data.
df_transactions = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv("transactions.csv")
)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def categorize_transactions(
    transactions,
    amount_col="amount",
    high_threshold=0.25,
    low_threshold=0.75,
):
    """Label each transaction as "high", "average" or "low" by amount rank.

    Rows are ranked by ``amount_col`` in descending order using
    ``percent_rank`` (0.0 for the largest amount, growing towards 1.0 as
    amounts get smaller). A row whose percent-rank is at most
    ``high_threshold`` is labelled "high", one at most ``low_threshold`` is
    labelled "average", and every remaining row is labelled "low".

    Args:
        transactions: Spark DataFrame containing the column ``amount_col``.
        amount_col: Name of the column to rank by. Defaults to "amount".
        high_threshold: Largest percent-rank still labelled "high".
        low_threshold: Largest percent-rank still labelled "average".

    Returns:
        A new DataFrame with all input columns plus a string column "level".
        An input column that is already named "level" is replaced.

    Raises:
        ValueError: If the thresholds do not satisfy
            ``0 <= high_threshold <= low_threshold <= 1``.
    """
    if not 0.0 <= high_threshold <= low_threshold <= 1.0:
        raise ValueError(
            "thresholds must satisfy 0 <= high_threshold <= low_threshold <= 1, "
            f"got high_threshold={high_threshold!r}, low_threshold={low_threshold!r}"
        )

    # NOTE: a window without partitionBy makes Spark move every row into a
    # single partition; acceptable for moderate data, costly for very large inputs.
    ranking_window = Window.orderBy(F.desc(amount_col))

    # Keep the rank as an expression rather than a scratch column, so an input
    # column that happens to be called "percentile" is neither overwritten nor
    # dropped.
    percent_rank = F.percent_rank().over(ranking_window)

    # Branches are evaluated in order, so the second one only sees rows whose
    # rank is already above high_threshold.
    level = (
        F.when(percent_rank <= high_threshold, "high")
        .when(percent_rank <= low_threshold, "average")
        .otherwise("low")
    )

    return transactions.withColumn("level", level)



In [4]:
# Add the "level" label to every loaded transaction
categorized_transactions_df = categorize_transactions(transactions=df_transactions)

In [5]:
# Preview the labelled transactions (first 20 rows, long values truncated)
categorized_transactions_df.show(n=20, truncate=True)

+------+-------+------------+----------------+-------+-----+
|    id| amount|account_type|transaction_date|country|level|
+------+-------+------------+----------------+-------+-----+
|426326|9999.99|    Business|      2015-08-07|     NL| high|
|281332|9999.99|Professional|      2019-05-17|     GT| high|
|103983|9999.98|    Personal|      2020-03-22|     CR| high|
|259119|9999.98|    Personal|      2017-03-11|     UG| high|
|317820|9999.98|Professional|      2019-11-07|     EE| high|
|303197|9999.98|    Personal|      2016-11-11|     UA| high|
|342602|9999.97|    Personal|      2014-02-16|     PE| high|
| 58619|9999.97|    Personal|      2012-12-07|     SA| high|
|189443|9999.97|    Business|      2016-06-19|     CN| high|
|446171|9999.97|    Personal|      2017-04-21|     GB| high|
| 99581|9999.97|    Business|      2011-12-06|     KE| high|
|471114|9999.96|    Personal|      2012-01-25|     GR| high|
|199378|9999.96|Professional|      2017-07-25|     PT| high|
|360159|9999.96|    Pers