In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import datetime as dt

In [2]:
# import pandas as pd
# df = pd.read_excel("data/online_retail.xlsx")
# df.to_csv("data/online_retail.csv")

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by the whole notebook.
# NOTE(review): the master is local[2] while executor memory/cores, dynamic
# allocation and the external shuffle service are configured as well --
# presumably those settings have no effect in local mode; confirm.
session_builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("SparkFirst")
    .config("spark.executor.memory", "32g")
    .config("spark.executor.cores", 4)
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 4)
    .config("spark.shuffle.service.enabled", "true")
)
spark = session_builder.getOrCreate()

# Load the retail CSV: the first line is treated as the header and column
# types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", "true")
df = csv_reader.csv("data/online_retail.csv")

In [3]:
# Total number of rows in the loaded DataFrame.
row_total = df.count()
print("Количество строк в файле: {}".format(row_total))

# Number of distinct CustomerID values.
distinct_customers = df.select(countDistinct("CustomerID")).collect()[0][0]
print("Количество уникальных клиентов: {}".format(distinct_customers))

# Expose the DataFrame to Spark SQL under the view name df_t.
df.createOrReplaceTempView("df_t")

# Country with the largest number of rows; the result row is (cnt, Country),
# so index 1 is the country name.
top_country_row = spark.sql("select count (*) as cnt, Country from df_t group by Country order by cnt desc limit 1").collect()[0]
print("В какой стране совершается большинство покупок: {}".format(top_country_row[1]))

# Earliest and latest InvoiceDate in the data; only the date part is printed.
dates = spark.sql("select min(InvoiceDate), max(InvoiceDate) from df_t").collect()[0]
first_purchase = dates[0]
last_purchase = dates[1]
print("Даты самой ранней и самой последней покупки на платформе: {} и {}".format(first_purchase.date(), last_purchase.date()))

Количество строк в файле: 541909
Количество уникальных клиентов: 4372
В какой стране совершается большинство покупок: United Kingdom
Даты самой ранней и самой последней покупки на платформе: 2010-12-01 и 2011-12-09


In [4]:
# Assume the analysis is performed on 2012-01-01 (reference date for recency).
# Build the per-customer RFM columns from the df_t view:
#   recency   - smallest day difference between 2012-01-01 and the customer's InvoiceDate values
#   frequency - number of rows belonging to the customer
#   monetary  - average of Quantity * UnitPrice over the customer's rows
# Rows with a null CustomerID are excluded before grouping.
rfm_query = """ select CustomerID,
        min(datediff(date '2012-01-01', InvoiceDate)) as recency, 
        count(*) as frequency,
        avg(Quantity * UnitPrice) as monetary
        from df_t 
        where CustomerID is not null
        group by CustomerID"""
df = spark.sql(rfm_query)
df.show()

+----------+-------+---------+------------------+
|CustomerID|recency|frequency|          monetary|
+----------+-------+---------+------------------+
|   16916.0|     46|      143|4.0297902097902085|
|   17884.0|     26|      117| 6.132051282051281|
|   13094.0|     44|       30| 56.96200000000002|
|   16596.0|     38|       12|20.845833333333335|
|   17633.0|     54|       72|17.254722222222217|
|   18114.0|    313|       28| 7.860714285714286|
|   13973.0|    310|       11|24.063636363636363|
|   14473.0|     97|        7| 33.47714285714286|
|   13956.0|     28|      152| 6.752763157894736|
|   13533.0|    205|       76|3.5630263157894753|
|   13918.0|     72|       30|            40.428|
|   12493.0|    188|       23|18.121304347826086|
|   14285.0|     44|       27| 70.74111111111112|
|   15776.0|    156|       18|13.423333333333336|
|   14768.0|     40|        6|23.250000000000004|
|   17267.0|    150|       38| 8.358421052631577|
|   14024.0|    144|       16|          20.48125|


In [5]:
# Add the rating columns.
# Re-register the name df_t so it now points at the current df (the RFM
# aggregates); this replaces whatever df_t referred to before.
df.createOrReplaceTempView("df_t")

# Grade each RFM component as 'A' / 'B' / 'C' and build the combined rating:
#   r_rating: recency   < 30 -> 'A', < 60 -> 'B', otherwise 'C'
#   f_rating: frequency > 50 -> 'A', > 20 -> 'B', otherwise 'C'
#   m_rating: monetary  > 50 -> 'A', > 25 -> 'B', otherwise 'C'
#   rating  : r_rating + f_rating + m_rating, e.g. 'BAC'
#
# The three grades are computed in an inner query and concatenated in the
# outer one. Referencing r_rating/f_rating/m_rating from the same select list
# that defines them depends on lateral column alias resolution, which only
# newer Spark releases support; the subquery form produces the same columns
# in the same order without that dependency.
df = spark.sql(
    """ select CustomerID, recency, frequency, monetary,
        r_rating, f_rating, m_rating,
        concat(r_rating, f_rating, m_rating) as rating
        from (
          select CustomerID, recency, frequency, monetary,
          case
            when recency < 30 then 'A'
            when recency < 60 then 'B'
            else 'C'
          end as r_rating,
          case
            when frequency > 50 then 'A'
            when frequency > 20 then 'B'
            else 'C'
          end as f_rating,
          case
            when monetary > 50 then 'A'
            when monetary > 25 then 'B'
            else 'C'
          end as m_rating
          from df_t
        ) rated"""
)
df.show()

+----------+-------+---------+------------------+--------+--------+--------+------+
|CustomerID|recency|frequency|          monetary|r_rating|f_rating|m_rating|rating|
+----------+-------+---------+------------------+--------+--------+--------+------+
|   16916.0|     46|      143|4.0297902097902085|       B|       A|       C|   BAC|
|   17884.0|     26|      117| 6.132051282051281|       A|       A|       C|   AAC|
|   13094.0|     44|       30| 56.96200000000002|       B|       B|       A|   BBA|
|   16596.0|     38|       12|20.845833333333335|       B|       C|       C|   BCC|
|   17633.0|     54|       72|17.254722222222217|       B|       A|       C|   BAC|
|   18114.0|    313|       28| 7.860714285714286|       C|       B|       C|   CBC|
|   13973.0|    310|       11|24.063636363636363|       C|       C|       C|   CCC|
|   14473.0|     97|        7| 33.47714285714286|       C|       C|       B|   CCB|
|   13956.0|     28|      152| 6.752763157894736|       A|       A|       C|

In [6]:
# Publish the current df under the view name df_t so it can be filtered with SQL.
df.createOrReplaceTempView("df_t")

# Keep only the CustomerID of rows whose combined rating is exactly 'AAA'.
top_segment_query = """ select CustomerID
        from df_t
        where rating = 'AAA'"""
df = spark.sql(top_segment_query)
df.show()

+----------+
|CustomerID|
+----------+
|   16353.0|
|   18092.0|
|   17949.0|
|   15061.0|
|   12753.0|
|   13408.0|
|   13694.0|
|   15187.0|
|   14739.0|
|   16525.0|
|   17677.0|
|   16672.0|
|   16210.0|
|   18102.0|
|   13199.0|
|   13881.0|
|   17857.0|
|   17511.0|
|   13777.0|
|   15694.0|
+----------+
only showing top 20 rows



In [7]:
# Number of rows in df. When the cells are run in order, df at this point is
# the result of the rating = 'AAA' filter, so this is the size of that segment.
df.count()

29

In [8]:
# Reduce df to a single partition, then write it as CSV to the "result" path,
# using overwrite mode so output from an earlier run is replaced.
single_partition = df.coalesce(1)
single_partition.write.mode("overwrite").csv("result")