In [3]:
from functools import reduce 
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
# Build the SparkSession used by every cell below; getOrCreate() reuses an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('First-Payment')
    .getOrCreate()
)

In [5]:
# Load the raw order lines. The first CSV row is used as the header; no schema
# is supplied or inferred here, so every column is read as a string.
online_retail_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('./Data/order/online_retail.csv')
)

In [6]:
# Preview: only the first 10 rows are brought to the driver as a pandas frame.
(
    online_retail_df
    .limit(10)
    .toPandas()
)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom


In [54]:
# Reduce each order line to (CustomerId, RelationWeek):
#   * 'Date'         - the date part of InvoiceDate (text before the first
#                      space), parsed with the 'M/d/yyyy' pattern.
#   * 'RelationWeek' - 'Date' truncated to the start of its week.
# Rows without a CustomerID are dropped because they cannot be assigned to a
# customer.
tmp_online_retail_df = (
    online_retail_df
    .withColumn(
        'Date',
        to_date(split(col('InvoiceDate'), ' ').getItem(0), 'M/d/yyyy'),
    )
    .withColumn('RelationWeek', date_trunc('week', col('Date')))
    .filter(col('CustomerID').isNotNull())
    .select('CustomerId', 'RelationWeek')
)
tmp_online_retail_df.limit(10).toPandas()

Unnamed: 0,CustomerId,RelationWeek
0,17850,2010-11-29
1,17850,2010-11-29
2,17850,2010-11-29
3,17850,2010-11-29
4,17850,2010-11-29
5,17850,2010-11-29
6,17850,2010-11-29
7,17850,2010-11-29
8,17850,2010-11-29
9,13047,2010-11-29


In [55]:
# First active week per customer: the earliest RelationWeek seen for each
# CustomerId. `min` here is the Spark aggregate brought in by the star import
# from pyspark.sql.functions, not Python's builtin.
# The key is renamed to 'CustomerId1' so it does not clash with 'CustomerId'
# when this frame is joined back onto tmp_online_retail_df.
new_customers_weekly = (
    tmp_online_retail_df
    .groupBy('CustomerId')
    .agg(min('RelationWeek').alias('Week'))
    .withColumnRenamed('CustomerId', 'CustomerId1')
)
new_customers_weekly.limit(10).toPandas()

Unnamed: 0,CustomerId1,Week
0,16250,2010-11-29
1,15574,2010-11-29
2,15555,2010-11-29
3,15271,2010-12-06
4,17714,2011-01-17
5,17757,2010-11-29
6,17551,2010-12-13
7,13187,2011-01-03
8,16549,2011-01-03
9,12637,2011-01-17


In [56]:
# Cohort size: number of customers whose first active week is each 'Week'.
# 'Week' is renamed to 'FirstWeek' so the later join against a frame that also
# has a 'Week' column stays unambiguous.
sum_new_customer_weekly = (
    new_customers_weekly
    .groupBy('Week')
    .agg(count('CustomerId1').alias('NumberNewCustomerWeekly'))
    .withColumnRenamed('Week', 'FirstWeek')
)
sum_new_customer_weekly.sort('FirstWeek').limit(10).toPandas()

Unnamed: 0,FirstWeek,NumberNewCustomerWeekly
0,2010-11-29,323
1,2010-12-06,368
2,2010-12-13,214
3,2010-12-20,43
4,2011-01-03,94
5,2011-01-10,103
6,2011-01-17,86
7,2011-01-24,120
8,2011-01-31,114
9,2011-02-07,69


In [57]:
# Attach each order line to its customer's first week ('Week'), then count the
# lines per (activity week, cohort week) pair.
# NOTE(review): TotalTimesPayment counts rows of tmp_online_retail_df (one per
# source CSV row), not distinct customers or invoices - confirm this is the
# intended metric.
retained_customers = (
    tmp_online_retail_df
    .join(
        new_customers_weekly,
        on=col('CustomerId') == col('CustomerId1'),
        how='left',
    )
    .groupBy('RelationWeek', 'Week')
    .agg(count('CustomerId').alias('TotalTimesPayment'))
    .orderBy('Week', 'RelationWeek')
)
retained_customers.toPandas()

Unnamed: 0,RelationWeek,Week,TotalTimesPayment
0,2010-11-29,2010-11-29,7853
1,2010-12-06,2010-11-29,1906
2,2010-12-13,2010-11-29,1734
3,2010-12-20,2010-11-29,491
4,2011-01-03,2010-11-29,1709
...,...,...,...
1406,2011-11-28,2011-11-21,46
1407,2011-12-05,2011-11-21,179
1408,2011-11-28,2011-11-28,1614
1409,2011-12-05,2011-11-28,198


In [58]:
# Add the cohort size (NumberNewCustomerWeekly) to every
# (cohort week, activity week) row by matching 'Week' against 'FirstWeek'.
result = (
    retained_customers
    .join(
        sum_new_customer_weekly,
        on=col('Week') == col('FirstWeek'),
        how='left',
    )
    .select('Week', 'RelationWeek', 'NumberNewCustomerWeekly', 'TotalTimesPayment')
    .sort('Week', 'RelationWeek')
)
result.limit(20).toPandas()

Unnamed: 0,Week,RelationWeek,NumberNewCustomerWeekly,TotalTimesPayment
0,2010-11-29,2010-11-29,323,7853
1,2010-11-29,2010-12-06,323,1906
2,2010-11-29,2010-12-13,323,1734
3,2010-11-29,2010-12-20,323,491
4,2010-11-29,2011-01-03,323,1709
5,2010-11-29,2011-01-10,323,898
6,2010-11-29,2011-01-17,323,1103
7,2010-11-29,2011-01-24,323,912
8,2010-11-29,2011-01-31,323,1254
9,2010-11-29,2011-02-07,323,854


In [59]:
# One row per cohort week with the list of TotalTimesPayment values for that
# cohort, ordered chronologically by RelationWeek.
#
# collect_list makes no ordering guarantee after the groupby shuffle, so the
# orderBy applied to `result` upstream does not fix the order inside each list.
# Collecting (RelationWeek, TotalTimesPayment) structs and sorting the array
# (structs compare field by field, RelationWeek first) makes the order
# deterministic; the payment field is then extracted from the sorted structs.
# The alias keeps the column name this cell produced before the change.
cohort_analytis = (
    result
    .groupby('Week')
    .agg(
        sort_array(collect_list(struct('RelationWeek', 'TotalTimesPayment')))
        .getField('TotalTimesPayment')
        .alias('collect_list(TotalTimesPayment)')
    )
    .orderBy('Week')
)
cohort_analytis.limit(20).toPandas()

Unnamed: 0,Week,collect_list(TotalTimesPayment)
0,2010-11-29,"[7853, 1906, 1734, 491, 1709, 898, 1103, 912, ..."
1,2010-12-06,"[7941, 1205, 606, 840, 875, 832, 969, 665, 753..."
2,2010-12-13,"[4306, 102, 367, 331, 538, 522, 441, 356, 698,..."
3,2010-12-20,"[706, 61, 129, 148, 55, 28, 15, 51, 210, 19, 4..."
4,2011-01-03,"[2443, 114, 64, 120, 239, 29, 210, 47, 183, 15..."
5,2011-01-10,"[2466, 92, 232, 110, 182, 199, 304, 117, 148, ..."
6,2011-01-17,"[2021, 36, 6, 95, 205, 174, 281, 74, 250, 187,..."
7,2011-01-24,"[2999, 42, 64, 235, 94, 138, 153, 123, 68, 135..."
8,2011-01-31,"[2398, 120, 72, 60, 62, 90, 171, 254, 381, 167..."
9,2011-02-07,"[1557, 105, 101, 99, 23, 75, 183, 202, 8, 37, ..."
