In [1]:
# Read the raw click log and split every CSV line into its string fields
# (session id, timestamp, item id, category).
browse_data = "recsys_challenge/yoochoose-clicks.dat"
lines = (
    sc.textFile(browse_data)
    .map(lambda record: record.split(","))
)
lines.take(5)

[[u'1', u'2014-04-07T10:51:09.277Z', u'214536502', u'0'],
 [u'1', u'2014-04-07T10:54:09.868Z', u'214536500', u'0'],
 [u'1', u'2014-04-07T10:54:46.998Z', u'214536506', u'0'],
 [u'1', u'2014-04-07T10:57:00.306Z', u'214577561', u'0'],
 [u'2', u'2014-04-07T13:56:37.614Z', u'214662742', u'0']]

**yoochoose-clicks.dat**
Each record/line in the file has the following fields/format: Session ID, Timestamp, Item ID, Category
* Session ID – the id of the session. In one session there are one or many clicks. Could be represented as an integer number.
* Timestamp – the time when the click occurred. Format of YYYY-MM-DDThh:mm:ss.SSSZ
* Item ID – the unique identifier of the item that has been clicked. Could be represented as an integer number.
* Category – the context of the click. The value "S" indicates a special offer, "0" indicates a missing value, a number between 1 and 12 indicates a real category identifier, and any other number indicates a brand. E.g. if an item has been clicked in the context of a promotion or special offer, the value will be "S"; if the context was a brand (e.g. BOSCH), the value will be an 8-10 digit number; if the item has been clicked under a regular category (e.g. sport), the value will be a number between 1 and 12.
 

In [2]:
from datetime import date

# Calculate a date ID: an integer stand-in for a calendar date, so that dates
# can be compared and subtracted as plain integers.
def calcDateID(year, month, day):
    """Return the 1-based day number of a date, counting 2006-01-01 as day 1.

    Dates before 2006-01-01 yield zero or negative IDs. Invalid calendar
    values raise ValueError from datetime.date.
    """
    epoch_ordinal = date(2006, 1, 1).toordinal()
    wanted_ordinal = date(year, month, day).toordinal()
    # +1 because the epoch itself is day 1, not day 0.
    return wanted_ordinal - epoch_ordinal + 1


In [3]:
def _add_date_fields(row):
    """Return the click row extended with its 'YYYY-MM-DD' prefix and integer date ID."""
    stamp = row[1]
    day_id = calcDateID(int(stamp[:4]), int(stamp[5:7]), int(stamp[8:10]))
    return row + [stamp[:10], day_id]

# Enriched clicks: the four raw fields, then the date string (index 4)
# and the integer date ID (index 5).
browse = lines.map(_add_date_fields)
browse.take(3)

[[u'1', u'2014-04-07T10:51:09.277Z', u'214536502', u'0', u'2014-04-07', 3019],
 [u'1', u'2014-04-07T10:54:09.868Z', u'214536500', u'0', u'2014-04-07', 3019],
 [u'1', u'2014-04-07T10:54:46.998Z', u'214536506', u'0', u'2014-04-07', 3019]]

In [4]:
# NOTE(review): no visible cell caches `browse`, so this unpersist() looks like a
# leftover from an earlier version of the notebook; the call returns the RDD,
# which is what the notebook echoes. Confirm and consider removing.
browse.unpersist()

PythonRDD[4] at RDD at PythonRDD.scala:43

# Dates

In [5]:
# Click volume per day: (date ID, number of click rows).
dates = (
    browse
    .map(lambda click: (click[5], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [6]:
# The ten earliest date IDs together with their click counts.
dates.takeOrdered(10, key=lambda day_count: day_count[0])

[(3013, 220872),
 (3014, 211495),
 (3015, 223702),
 (3016, 165630),
 (3017, 180876),
 (3018, 312974),
 (3019, 239043),
 (3020, 179555),
 (3021, 202836),
 (3022, 184023)]

In [7]:
# Number of distinct date IDs present in the click log.
dates.count()

183

# Eval data

In [8]:
# Peek at two enriched click rows; index 5 holds the integer date ID that the
# test-window filter below compares against.
browse.take(2)

[[u'1', u'2014-04-07T10:51:09.277Z', u'214536502', u'0', u'2014-04-07', 3019],
 [u'1', u'2014-04-07T10:54:09.868Z', u'214536500', u'0', u'2014-04-07', 3019]]

In [9]:
# Train/test split, expressed in the integer date IDs produced by calcDateID.
train_end = 3100                        # last date ID of the training period
train_period = 90                       # training period length, in days
train_start = train_end - train_period  # not read by any cell visible in this notebook
test_start = train_end + 1              # test window opens the day after training ends
# The window filters below are inclusive on both ends, so the test window
# covers 15 date IDs (test_start .. test_start + 14).
test_end = test_start + 14


In [10]:
import operator

# Evaluation context: one (item_id, session_id) pair for every click whose
# date ID falls inside the inclusive test window [test_start, test_end].
rec_context = (
    browse
    .filter(lambda click: test_start <= click[5] <= test_end)
    .map(lambda click: (int(click[2]), int(click[0])))
    .cache()
)
rec_context.take(10)

[(214582170, 5563134),
 (214582170, 5563134),
 (214845583, 5563133),
 (214845585, 5563133),
 (214645179, 5563132),
 (214558772, 5563132),
 (214542579, 5563132),
 (214845491, 5563128),
 (214586610, 5563123),
 (214586610, 5563123)]

**yoochoose-buys.dat**
* Session ID - the id of the session. In one session there are one or many buying events. Could be represented as an integer number.
* Timestamp - the time when the buy occurred. Format of YYYY-MM-DDThh:mm:ss.SSSZ
* Item ID – the unique identifier of the item that has been bought. Could be represented as an integer number.
* Price – the price of the item. Could be represented as an integer number.
* Quantity – the quantity in this buying.  Could be represented as an integer number.

In [18]:
# Load the purchase log explicitly instead of re-using `lines`.
# `lines` is bound to the clicks file in the first cell; click rows have only
# six fields after enrichment, so on a fresh Restart & Run All the later
# row[6] look-ups on `orders` would fail with IndexError.
# NOTE(review): the path is assumed to sit next to the clicks file - confirm.
buys_data = "recsys_challenge/yoochoose-buys.dat"

# Enriched purchases: session id, timestamp, item id, price, quantity,
# then the 'YYYY-MM-DD' date string (index 5) and the integer date ID (index 6).
orders = (sc.textFile(buys_data)
          .map(lambda line: line.split(","))
          .map(lambda row: row + [row[1][:10], calcDateID(int(row[1][:4]), int(row[1][5:7]), int(row[1][8:10]))])
          .cache()
          )
orders.take(3)

[[u'420374',
  u'2014-04-06T18:44:58.314Z',
  u'214537888',
  u'12462',
  u'1',
  u'2014-04-06',
  3018],
 [u'420374',
  u'2014-04-06T18:44:58.325Z',
  u'214537850',
  u'10471',
  u'1',
  u'2014-04-06',
  3018],
 [u'281626',
  u'2014-04-06T09:40:13.032Z',
  u'214535653',
  u'1883',
  u'1',
  u'2014-04-06',
  3018]]

In [19]:
# TODO: this is not strictly correct - a purchase can be matched with a
# recommendation that was made after the order; timestamps are not compared yet.
# Purchases inside the inclusive test window, as
# ((session_id, item_id), number of buy rows).
purchased_pairs = (
    orders
    .filter(lambda order: test_start <= order[6] <= test_end)
    .map(lambda order: ((int(order[0]), int(order[2])), 1))
    .reduceByKey(lambda left, right: left + right)
)
purchased_pairs.take(2)

[((5926073, 214846397), 1), ((5912968, 214850870), 1)]

In [20]:
# Number of distinct (session_id, item_id) purchase pairs in the test window.
num_purch_all = purchased_pairs.count()

# Evaluate KNN

In [393]:
# Load the KNN model: each text line is parsed by prodList into
# (product_id, [(related_product_id, score), ...]), as the echoed sample shows.
# NOTE(review): prodList is not defined in any visible cell - it must be
# defined before this cell for a clean Restart & Run All.
knn_model = "knn_model.txt"
knn_product_relations = (
    sc.textFile(knn_model)
    .map(prodList)
    .repartition(128)
    .cache()
)
knn_product_relations.take(2)

[(214687792, [(214697585, 0.156174), (214559803, 0.125218)]),
 (214674620, [(214674671, 0.408248)])]

In [394]:
# Generate predictions: join the test-window clicks (keyed by item id) with the
# KNN relations (keyed by product id), then expand every match into rows.
# NOTE(review): get_recommendations is not defined in any visible cell; the
# echoed output suggests it yields (session_id, recommended_item_id, score).
knn_join = rec_context.join(knn_product_relations)
knn_recomendations = knn_join.flatMap(get_recommendations)
knn_recomendations.take(15)

[(5563279, 214538829, 0.105851),
 (5563784, 214538829, 0.105851),
 (5563843, 214538829, 0.105851),
 (5565927, 214538829, 0.105851),
 (5554507, 214538829, 0.105851),
 (5554819, 214538829, 0.105851),
 (5555113, 214538829, 0.105851),
 (5555582, 214538829, 0.105851),
 (5555582, 214538829, 0.105851),
 (5556728, 214538829, 0.105851),
 (5556829, 214538829, 0.105851),
 (5557831, 214538829, 0.105851),
 (5557854, 214538829, 0.105851),
 (5559118, 214538829, 0.105851),
 (5560712, 214538829, 0.105851)]

In [395]:
# Number of distinct sessions that received at least one KNN recommendation.
knn_num_sess = (
    knn_recomendations
    .map(lambda rec: rec[0])
    .distinct()
    .count()
)

In [396]:
# De-duplicate KNN recommendations: one ((session_id, item_id), occurrences)
# entry per recommended pair.
knn_unique_rec = (
    knn_recomendations
    .map(lambda rec: ((rec[0], rec[1]), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [397]:
# Purchased (session_id, item_id) pairs that KNN also recommended.
knn_num_purch = purchased_pairs.join(knn_unique_rec).count()

In [398]:
# Total number of KNN recommendation rows (repeated rows are counted each time).
knn_num_rec = knn_recomendations.count()

In [399]:
# Tab-separated report row:
# page visits, recommendations, all sessions, sessions with recommendations,
# all purchased pairs, purchased pairs that were recommended, conversion rate
# NOTE(review): num_page_visits and num_rec_sess are not defined in any visible cell.
print("%d\t%d\t%d\t%d\t%d\t%d\t%g" % (
    num_page_visits,
    knn_num_rec,
    num_rec_sess,
    knn_num_sess,
    num_purch_all,
    knn_num_purch,
    float(knn_num_purch)/knn_num_sess,
))

2010000	3780143	555341	425450	73889	20232	0.0475544


# Top Sellers

In [400]:
# Best-selling items over the last 31 date IDs of the training period
# (train_end - 30 .. train_end, inclusive), as (item_id string, buy rows).
top_sellers = (
    orders
    .filter(lambda order: train_end - 30 <= order[6] <= train_end)
    .map(lambda order: (order[2], 1))
    .reduceByKey(lambda left, right: left + right)
)
# Keep the max_rec items with the highest counts, best first.
# NOTE(review): max_rec is not assigned in any visible cell before this one.
ts = top_sellers.takeOrdered(max_rec, key=lambda entry: -entry[1])

In [401]:
# Show the selected top sellers as (item_id string, buy-row count), best first.
ts

[(u'214835167', 2241),
 (u'214835109', 2225),
 (u'214836932', 1927),
 (u'214845131', 1889),
 (u'214839973', 1884)]

In [402]:
# Integer item ids of the top sellers, in ranking order.
ts_products = [int(entry[0]) for entry in ts]

In [403]:
# Show the top-seller item ids after conversion to integers.
ts_products

[214835167, 214835109, 214836932, 214845131, 214839973]

In [404]:
def ts_rec(row, products=None):
    """Yield one (session_id, product_id) recommendation per top-seller product.

    row      -- an (item_id, session_id) pair; the clicked item is ignored
                because top-seller recommendations do not depend on it.
    products -- optional iterable of product ids to recommend. When omitted,
                the module-level ts_products list is used, which is the
                original behaviour and what rec_context.flatMap(ts_rec) relies on.
    """
    _clicked_item, sess = row
    chosen = ts_products if products is None else products
    for product_id in chosen:
        yield (sess, product_id)

In [405]:
# Recommend every top seller for every test-window click: ts_rec yields one
# (session_id, product_id) row per top-seller product per click.
ts_recomendations = rec_context.flatMap(ts_rec)
ts_recomendations.take(15)

[(5563134, 214835167),
 (5563134, 214835109),
 (5563134, 214836932),
 (5563134, 214845131),
 (5563134, 214839973),
 (5563134, 214835167),
 (5563134, 214835109),
 (5563134, 214836932),
 (5563134, 214845131),
 (5563134, 214839973),
 (5563133, 214835167),
 (5563133, 214835109),
 (5563133, 214836932),
 (5563133, 214845131),
 (5563133, 214839973)]

In [406]:
# Number of distinct sessions that received at least one top-seller recommendation.
ts_num_sess = (
    ts_recomendations
    .map(lambda rec: rec[0])
    .distinct()
    .count()
)

In [407]:
# De-duplicate top-seller recommendations: one ((session_id, item_id), occurrences)
# entry per recommended pair.
ts_unique_rec = (
    ts_recomendations
    .map(lambda rec: ((rec[0], rec[1]), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [408]:
# Purchased (session_id, item_id) pairs that were also a top-seller recommendation.
ts_num_purch = purchased_pairs.join(ts_unique_rec).count()

In [409]:
# Total number of top-seller recommendation rows (repeated rows are counted each time).
ts_num_rec = ts_recomendations.count()

In [410]:
# Tab-separated report row:
# page visits, recommendations, all sessions, sessions with recommendations,
# all purchased pairs, purchased pairs that were recommended, conversion rate
# NOTE(review): num_page_visits and num_rec_sess are not defined in any visible cell.
print("%d\t%d\t%d\t%d\t%d\t%d\t%g" % (
    num_page_visits,
    ts_num_rec,
    num_rec_sess,
    ts_num_sess,
    num_purch_all,
    ts_num_purch,
    float(ts_num_purch)/ts_num_sess,
))

2010000	10050000	555341	555341	73889	995	0.00179169


# Apriori

In [28]:
# Path of the tab-separated Apriori output that convert_apriori parses.
path_apriori = "apriori_out.txt"
def convert_apriori(row):
    """Parse one tab-separated line into a 7-tuple of five ints and two floats.

    Example result: (214712514, 9, 214712516, 10, 9, 1.0, 112344.20000000003)

    Fields beyond the seventh are ignored. A line with fewer than seven
    fields raises IndexError; a non-numeric field raises ValueError.
    """
    fields = row.split("\t")
    casts = (int, int, int, int, int, float, float)
    # Index explicitly (rather than zip) so that a short line still raises IndexError.
    return tuple(cast(fields[pos]) for pos, cast in enumerate(casts))
    
# Parse every line of the Apriori output into a 7-tuple (five ints, two floats).
apriori_data = sc.textFile(path_apriori).map(convert_apriori)
apriori_data.take(4)

[(214843393, 72, 214843432, 211, 1, 0.0138889, 73.9496),
 (214843393, 72, 214836512, 1386, 1, 0.0138889, 11.2578),
 (214843393, 72, 214843676, 158, 1, 0.0138889, 98.7555),
 (214843393, 72, 214691636, 83, 1, 0.0138889, 187.992)]

In [29]:
def sort_iterable(row):
    """Materialise and rank the related products of one grouped entry.

    row is (product, iterable of (related_product, score)). Returns
    (product, list of (related_product, score)) ordered by descending score;
    pairs with equal scores keep their original relative order.
    """
    product, related = row[0], row[1]
    ranked = [(item, score) for (item, score) in related]
    # Negated key gives descending order while the sort stays stable.
    ranked.sort(key=lambda pair: -pair[1])
    return (product, ranked)
    

In [30]:
# Build the Apriori relations: keep rules with field 5 > 0.1, field 4 >= 5 and
# field 6 >= 1, then group them by the first product and rank each group by
# field 5, highest first.
# NOTE(review): the meaning of fields 4-6 is not stated in this notebook -
# document what they are and why these thresholds were chosen.
apriori_product_relations = (
    apriori_data
    .filter(lambda rule: rule[5] > 0.1 and rule[4] >= 5 and rule[6] >= 1)
    .map(lambda rule: (rule[0], (rule[2], rule[5])))
    .groupByKey()
    .map(sort_iterable)
)
apriori_product_relations.take(15)

[(214602240, [(214821296, 0.357143)]),
 (214717440, [(214542225, 0.158416)]),
 (214716930, [(214716932, 0.13986)]),
 (214553604, [(214544357, 0.278481)]),
 (214652934, [(214821277, 0.137931)]),
 (214836744,
  [(214602525, 0.333333), (214602581, 0.333333), (214602562, 0.333333)]),
 (214843914,
  [(214691675, 0.147826),
   (214840867, 0.147826),
   (214842418, 0.13913),
   (214838495, 0.13913)]),
 (214837260, [(214639200, 0.115789)]),
 (214569474, [(214717412, 0.113208)]),
 (214821390, [(214718203, 0.107843)]),
 (214820274, [(214842555, 0.135135)]),
 (214827024, [(214820415, 0.252918), (214717331, 0.101167)]),
 (214685784, [(214842555, 0.109589)]),
 (214542354, [(214516749, 0.230769)]),
 (214542360, [(214835512, 0.18), (214748310, 0.18), (214705659, 0.18)])]

In [31]:
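# Number of recommendations (max_rec).
# NOTE(review): the assignment below is commented out, yet max_rec is read by the
# Top Sellers cell, so it has to be defined somewhere before that cell runs.
#max_rec = 10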

In [32]:
        
# Generate predictions: join the test-window clicks (keyed by item id) with the
# Apriori relations (keyed by product id), then expand every match into rows.
# NOTE(review): get_recommendations is not defined in any visible cell; the
# echoed output suggests it yields (session_id, recommended_item_id, score).
apriori_join = rec_context.join(apriori_product_relations)
apriori_recomendations = apriori_join.flatMap(get_recommendations)
apriori_recomendations.take(15)

[(5564963, 214838113, 0.125326),
 (5551178, 214838113, 0.125326),
 (5544651, 214838113, 0.125326),
 (5522032, 214838113, 0.125326),
 (5522672, 214838113, 0.125326),
 (5523423, 214838113, 0.125326),
 (5516504, 214838113, 0.125326),
 (5520117, 214838113, 0.125326),
 (5508304, 214838113, 0.125326),
 (5507426, 214838113, 0.125326),
 (5507668, 214838113, 0.125326),
 (5511614, 214838113, 0.125326),
 (5511933, 214838113, 0.125326),
 (5509284, 214838113, 0.125326),
 (5236563, 214838113, 0.125326)]

In [33]:
# Number of distinct sessions that received at least one Apriori recommendation.
apriori_num_sess = (
    apriori_recomendations
    .map(lambda rec: rec[0])
    .distinct()
    .count()
)

In [34]:
# De-duplicate Apriori recommendations: one ((session_id, item_id), occurrences)
# entry per recommended pair.
apriori_unique_rec = (
    apriori_recomendations
    .map(lambda rec: ((rec[0], rec[1]), 1))
    .reduceByKey(lambda left, right: left + right)
)

In [35]:
# Purchased (session_id, item_id) pairs that Apriori also recommended.
apriori_num_purch = purchased_pairs.join(apriori_unique_rec).count()

In [36]:
# Total number of Apriori recommendation rows (repeated rows are counted each time).
apriori_num_rec = apriori_recomendations.count()

In [37]:
# Tab-separated report row:
# page visits, recommendations, all sessions, sessions with recommendations,
# all purchased pairs, purchased pairs that were recommended, conversion rate
# NOTE(review): num_page_visits and num_rec_sess are not defined in any visible cell.
print("%d\t%d\t%d\t%d\t%d\t%d\t%g" % (
    num_page_visits,
    apriori_num_rec,
    num_rec_sess,
    apriori_num_sess,
    num_purch_all,
    apriori_num_purch,
    float(apriori_num_purch)/apriori_num_sess,
))

2010000	629808	555341	221577	73889	5190	0.023423
