# Import Library

In [1]:
import re
import numpy as np
import pandas as pd
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import matplotlib.pyplot as plt
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import plotly.io as pio
pio.renderers.default = 'colab'

from tqdm.auto import tqdm 
tqdm.pandas()

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install Library

In [2]:
!pip install Unidecode
!pip install rank_bm25
!pip install mlxtend
!pip install sparse_dot_topn

import unidecode
import mlxtend
import rank_bm25



# Read Data

In [3]:
# Load the raw JSON dumps for product metadata and cart events.
meta_df = pd.read_json("drive/MyDrive/Hepsiburada_Recommendation/data/meta.json")
event_df = pd.read_json("drive/MyDrive/Hepsiburada_Recommendation/data/events.json")

# The records live under the "events" / "meta" key of each file;
# expand them into one DataFrame column per record field.
event_df = pd.DataFrame.from_records(event_df["events"])
meta_df = pd.DataFrame.from_records(meta_df["meta"])

# Products without a brand are grouped under the placeholder label "Other".
meta_df["brand"] = meta_df["brand"].fillna("Other")


# Cast timestamps and prices to proper dtypes.
event_df = event_df.astype({'eventtime': 'datetime64[ns]',
                            'price': 'float'})

# Order events by session, and chronologically within each session.
event_df = event_df.sort_values(["sessionid", "eventtime"],
                     ascending=[True, True]).reset_index(drop=True)

# EDA

## Meta info of products in the events

In [4]:
meta_df.sample(3)

Unnamed: 0,productid,brand,category,subcategory,name
3479,HBV00000PVQCL,Greenway,Ev Yaşam ve Bahçe,Bahçe ve Mangal,Greenway Çiçekli Bitkiler Sıvı Bitki Besini 500 cc
2992,HBV00000NE1RS,Sek,Kahvaltılık ve Süt,Süt,Sek Günlük Süt 200 ml
4018,HBV00000SP828,Lipton,İçecekler,Çay,Lıpton Zencefıl Lımon Cay 20 x 2 gr


In [5]:
meta_df.shape

(10236, 5)

In [6]:
for col in meta_df:
    print(col, meta_df[col].nunique())

productid 10235
brand 790
category 20
subcategory 132
name 10123


In [7]:
for col in meta_df:
    print(col, meta_df[col].isna().sum())

productid 1
brand 0
category 1
subcategory 1
name 1


In [8]:
meta_df[meta_df["productid"].isna()]

Unnamed: 0,productid,brand,category,subcategory,name
5092,,Other,,,


In [9]:
meta_df = meta_df.dropna(subset=["productid"]).reset_index(drop=True)

## add2cart events for a period of time

In [10]:
event_df.sample(3)

Unnamed: 0,event,sessionid,eventtime,price,productid
80412,cart,34aede3f-4de2-4865-85df-2e346c1337ea,2020-06-08 06:28:49.372,29.99,OFISBIC829735
282147,cart,ba67051a-0bc4-413f-8931-fa630469320e,2020-06-02 05:45:46.216,4.5,PTRIT12179033
152453,cart,63f5f950-1b8d-40be-969d-28a6d5a8cd1a,2020-06-01 00:27:22.699,14.48,HBV00000NVZE8


In [11]:
event_df.shape

(387656, 5)

In [12]:
for col in event_df:
    print(col, event_df[col].nunique())

event 1
sessionid 54442
eventtime 387196
price 1217
productid 10235


## Average of ~7 Products per Session

In [13]:
# Mean number of cart events per session, computed from the data instead of
# hard-coded literals so the figure stays correct if the input changes.
len(event_df) / event_df["sessionid"].nunique()

7.1205319422504685

## Session Length Histogram

In [14]:
# Attach to every row the number of distinct event timestamps in its session.
# Note: this counts unique `eventtime` values, not unique products.
event_df["session_length"] = event_df.groupby(["sessionid"])["eventtime"].transform("nunique")
# Keep one row per session, then plot the distribution of session lengths.
event_df.drop_duplicates(subset=["sessionid"])["session_length"].iplot(kind="hist")

# Product Count

In [15]:
# Number of cart events per product, most frequent first.
event_counts = event_df["productid"].value_counts().sort_values(ascending=False)
product_count_df = pd.DataFrame(event_counts).reset_index()
product_count_df.columns = ['productid', 'counts']

# Share of all events per product, plus running totals to see how quickly
# the most popular products account for the bulk of the events.
total_events = product_count_df["counts"].sum()
product_count_df["ratio"] = product_count_df["counts"] / total_events
product_count_df["cum_sum"] = product_count_df["counts"].cumsum()
product_count_df["cum_ratio"] = product_count_df["cum_sum"] / total_events

In [16]:
product_count_df

Unnamed: 0,productid,counts,ratio,cum_sum,cum_ratio
0,HBV00000NVZGU,17082,0.044066,17082,0.044066
1,HBV00000NVZBI,5557,0.014335,22639,0.058401
2,HBV00000OE7X7,5070,0.013079,27709,0.071479
3,HBV00000NVZBY,3824,0.009865,31533,0.081344
4,HBV00000O2S62,3704,0.009555,35237,0.090899
...,...,...,...,...,...
10230,HBV00000PNGDM,1,0.000003,387646,0.999990
10231,HBV00000NFRXB,1,0.000003,387647,0.999992
10232,PTTEMPO340367,1,0.000003,387648,0.999995
10233,SGUNIDEPKADBUY,1,0.000003,387649,0.999997


In [17]:
product_count_df["cum_ratio"].iplot()

In [18]:
for col in event_df:
    print(col, event_df[col].isna().sum())

event 0
sessionid 0
eventtime 0
price 6
productid 6
session_length 0


In [19]:
event_df[event_df["price"].isna()]

Unnamed: 0,event,sessionid,eventtime,price,productid,session_length
45580,cart,1e50875d-c395-4e6d-8ea4-52e2bcb4975a,2020-06-04 06:08:54.849,,,7
57644,cart,25f19afd-6cb2-46dd-9ebd-0fd97cf10b63,2020-06-03 19:03:31.293,,,52
183785,cart,78ca9e26-f4ee-4fb3-a166-b124fdc32426,2020-06-03 21:17:37.145,,,3
227606,cart,967f0239-a7f0-4ca9-b31e-92ad4e0798d7,2020-06-01 21:41:43.474,,,3
311300,cart,cd86710a-f292-49e7-b48e-6b57d51755ae,2020-06-06 21:27:48.620,,,49
382546,cart,fc1ba51a-224b-4818-bff0-fa2dc1c93acd,2020-06-01 12:32:22.012,,,2


In [20]:
event_df = event_df.dropna(subset=["productid"]).reset_index(drop=True)

In [21]:
event_df.head(3)

Unnamed: 0,event,sessionid,eventtime,price,productid,session_length
0,cart,000280f4-62fc-4dcd-b51d-c66ac14d7d8c,2020-06-07 14:30:58.804,9.99,HBV00000NE1WT,1
1,cart,0002e53b-1f60-4309-8380-31ca03de51f8,2020-06-06 17:51:18.003,22.48,HBV00000NVZGQ,2
2,cart,0002e53b-1f60-4309-8380-31ca03de51f8,2020-06-06 17:52:42.480,5.5,HBV00000NE1LU,2


## Clean Text

In [22]:
# 100 gr --> gr (used to collect statistics on size-unit identifiers)
def get_identifier_value(x):
    """Return the unit of the first size match, e.g. ``["100 gr"]`` -> ``"gr"``.

    Args:
        x: Sequence of matched size strings (possibly empty).

    Returns:
        The last whitespace-separated token of ``x[0]`` with every digit
        removed, or ``""`` when ``x`` is empty.
    """
    if len(x) == 0:
        return ""
    last_token = x[0].split()[-1]
    return ''.join(filter(lambda ch: not ch.isdigit(), last_token))



# Extract every "<number><optional whitespace><letters>" token from the product
# name, e.g. "30 ml", "1 kg". The unit part only matches lowercase ASCII letters.
size_regex = fr'\b([0-9]+\s*[a-z]+)\b'
meta_df["size_identifiers"] = meta_df["name"].apply(lambda x: re.findall(size_regex, x))
# Unit of the first match per product ("" when nothing matched).
meta_df["identifier_value"] = meta_df["size_identifiers"].apply(lambda x: get_identifier_value(x))
# Bare expression in the middle of the cell: its value is computed but not
# displayed (only the last expression of a cell is rendered).
meta_df["identifier_value"].value_counts().index
# Collapse the list of matches to the first match only ("" when there is none).
meta_df["size_identifiers"] = meta_df["size_identifiers"].apply(lambda x: x[0] if len(x)>0 else "")

In [23]:
from string import digits, punctuation

# Translation tables used by the text-cleaning step:
# delete all digits, delete all ASCII punctuation, and map lowercase Turkish
# letters to their ASCII look-alikes.
remove_digits = str.maketrans('', '', digits)
remove_punc = str.maketrans('', '', punctuation)
remove_turkish_chars = str.maketrans("çğıöşü", "cgiosu")

# Hand-picked list of tokens that are treated as genuine size units.
size_identifiers = ['gr', 'ml', 'g', 'kg', 'lt', 'cm', 'cc', 'li', 'm', 'mm',
                    'adet', 'gram', 'l', 'in', 'mt',  'litre', 'lu', 'kutu', 'w']
                    
# Longest first, so that in the alternation below "gram" is tried before "gr"/"g".
size_identifiers = sorted(size_identifiers, key=len, reverse=True)

# Build "<number><optional whitespace><unit>" from the curated units.
# NOTE(review): this rebinds both `size_identifiers` (now a regex fragment) and
# `size_regex`; the new `size_regex` is not referenced again in the visible part
# of the notebook, and because it has two capture groups `re.findall` would
# return tuples rather than strings.
size_identifiers = "(" + "|".join(size_identifiers) + ")"
size_regex = fr'\b([0-9]+\s*{size_identifiers})\b'

In [24]:
def cleanize_text(row):
    """Build a normalised, ASCII-only product title from ``row["name"]``.

    Args:
        row: Mapping/Series with a ``"name"`` string and a
            ``"size_identifiers"`` string (the size expression to strip, or "").

    Returns:
        The cleaned, lowercased title as a single space-separated string.
    """
    # Strip the size expression (e.g. "1 kg", "200gr"), then every digit
    # ("10-4" -> "-"), then every punctuation character.
    stripped = (
        row["name"]
        .replace(row["size_identifiers"], "")
        .translate(remove_digits)
        .translate(remove_punc)
    )

    # Drop one-character tokens; this also collapses runs of whitespace.
    kept_words = filter(lambda word: len(word) > 1, stripped.split())
    joined = " ".join(kept_words)

    # Map lowercase Turkish letters to ASCII, lowercase the result, and finally
    # transliterate whatever non-ASCII characters remain (nescafé --> nescafe).
    return unidecode.unidecode(joined.translate(remove_turkish_chars).lower())


meta_df["clean_name"] = meta_df.apply(lambda x: cleanize_text(x), axis=1)

In [25]:
meta_df.head()

Unnamed: 0,productid,brand,category,subcategory,name,size_identifiers,identifier_value,clean_name
0,HBV00000AX6LR,Palette,Kişisel Bakım,Saç Bakımı,Palette Kalıcı Doğal Renkler 10-4 PAPATYA,,,palette kalici dogal renkler papatya
1,HBV00000BSAQG,Best,Pet Shop,Kedi,Best Pet Jöle İçinde Parça Etli Somonlu Konserve Yetişkin Kedi Maması 415 gr,415 gr,gr,best pet jole icinde parca etli somonlu konserve yetiskin kedi mamasi
2,HBV00000JUHBA,Tarım Kredi,Temel Gıda,"Bakliyat, Pirinç, Makarna",Türkiye Tarım Kredi Koop.Yeşil Mercimek 1 kg,1 kg,kg,turkiye tarim kredi koopyesil mercimek
3,HBV00000NE0QI,Namet,"Et, Balık, Şarküteri",Şarküteri,Namet Fıstıklı Macar Salam 100 gr,100 gr,gr,namet fistikli macar salam
4,HBV00000NE0UQ,Muratbey,Kahvaltılık ve Süt,Peynir,Muratbey Burgu Peyniri 250 gr,250 gr,gr,muratbey burgu peyniri


In [26]:
event_df[event_df["sessionid"] == "b8d43d4b-9dce-47fe-b276-e0cfddcb56d3"]

Unnamed: 0,event,sessionid,eventtime,price,productid,session_length
279891,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:09:07.620,7.95,HBV00000NE25R,22
279892,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:04.348,2.74,HBV00000OE7I2,22
279893,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:07.070,2.74,HBV00000OE7I2,22
279894,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:18.779,4.0,HBV00000OE7X7,22
279895,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:19.752,4.0,HBV00000OE7X7,22
279896,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:22.659,1.34,HBV00000NVZBI,22
279897,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:23.803,1.34,HBV00000NVZBI,22
279898,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:29.180,1.34,HBV00000NVZBI,22
279899,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:36.902,2.72,HBV00000OE7UF,22
279900,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:37.765,2.72,HBV00000OE7UF,22


## Merge DF

In [27]:
df = pd.merge(meta_df, event_df, how="inner", on="productid")
df = df.sort_values(["sessionid", "eventtime"],
                     ascending=[True, True]).reset_index(drop=True)

In [28]:
df[df["sessionid"] == "b8d43d4b-9dce-47fe-b276-e0cfddcb56d3"]

Unnamed: 0,productid,brand,category,subcategory,name,size_identifiers,identifier_value,clean_name,event,sessionid,eventtime,price,session_length
279891,HBV00000NE25R,Carrefour,Kahvaltılık ve Süt,Yumurta,Carrefour Yumurta 15'li,,,carrefour yumurta li,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:09:07.620,7.95,22
279892,HBV00000OE7I2,Other,Meyve ve Sebze,Sebze,Domates Kokteyl 500 gr,500 gr,gr,domates kokteyl,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:04.348,2.74,22
279893,HBV00000OE7I2,Other,Meyve ve Sebze,Sebze,Domates Kokteyl 500 gr,500 gr,gr,domates kokteyl,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:07.070,2.74,22
279894,HBV00000OE7X7,Other,Meyve ve Sebze,Sebze,Domates Pembe 500 gr,500 gr,gr,domates pembe,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:18.779,4.0,22
279895,HBV00000OE7X7,Other,Meyve ve Sebze,Sebze,Domates Pembe 500 gr,500 gr,gr,domates pembe,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:10:19.752,4.0,22
279896,HBV00000NVZBI,Other,Meyve ve Sebze,Sebze,Patates 1 kg,1 kg,kg,patates,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:22.659,1.34,22
279897,HBV00000NVZBI,Other,Meyve ve Sebze,Sebze,Patates 1 kg,1 kg,kg,patates,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:23.803,1.34,22
279898,HBV00000NVZBI,Other,Meyve ve Sebze,Sebze,Patates 1 kg,1 kg,kg,patates,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:29.180,1.34,22
279899,HBV00000OE7UF,Other,Meyve ve Sebze,Sebze,Havuç 500 gr,500 gr,gr,havuc,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:36.902,2.72,22
279900,HBV00000OE7UF,Other,Meyve ve Sebze,Sebze,Havuç 500 gr,500 gr,gr,havuc,cart,b8d43d4b-9dce-47fe-b276-e0cfddcb56d3,2020-06-01 20:11:37.765,2.72,22


# Association Analysis (Birliktelik Analizi)

In [29]:
from mlxtend.preprocessing import TransactionEncoder


def get_relations(df, col):
    """One-hot encode, per session, which values of ``col`` occur in it.

    Args:
        df: Event-level DataFrame containing ``"sessionid"`` and ``col``.
        col: Name of the column whose values form each session's basket.

    Returns:
        DataFrame with one row per session (in groupby order) and one column
        per distinct value of ``col``, as produced by ``TransactionEncoder``.
    """
    # Collect the values of `col` for each session into a list (one basket per session).
    baskets = df.groupby(["sessionid"])[col].apply(list)
    transactions = baskets.values

    encoder = TransactionEncoder()
    encoded = encoder.fit(transactions).transform(transactions)
    return pd.DataFrame(encoded, columns=encoder.columns_)


subcategory_df = get_relations(df, 'subcategory')
category_df = get_relations(df, 'category')
brand_df = get_relations(df, 'brand')

In [30]:
from mlxtend.frequent_patterns import apriori

category_apriori_df = apriori(category_df, min_support=0.02, use_colnames=True)
subcategory_apriori_df = apriori(subcategory_df, min_support=0.02, use_colnames=True)
brand_apriori_df = apriori(brand_df, min_support=0.02, use_colnames=True)

category_apriori_df.head(10)

Unnamed: 0,support,itemsets
0,0.131571,(Atıştırmalık)
1,0.046251,(Bebek)
2,0.069175,(Dondurma)
3,0.366592,"(Et, Balık, Şarküteri)"
4,0.130836,(Ev Bakım ve Temizlik)
5,0.047225,(Ev Yaşam ve Bahçe)
6,0.093163,(Fırın)
7,0.280721,(Kahvaltılık ve Süt)
8,0.064289,(Kişisel Bakım)
9,0.291466,(Meyve ve Sebze)


In [31]:
brand_apriori_df.sample(10)

Unnamed: 0,support,itemsets
40,0.046967,"(İçim, Other)"
5,0.021546,(Eker)
15,0.035763,(Sek)
25,0.051872,"(Other, Banvit)"
11,0.031814,(Nestle)
33,0.031593,"(Other, Eti)"
20,0.031024,(Torku)
37,0.026248,"(Superfresh, Other)"
4,0.037728,(Dr. Oetker)
27,0.116638,"(Other, Carrefour)"


In [32]:
from mlxtend.frequent_patterns import association_rules

association_rules(brand_apriori_df, metric="confidence", min_threshold=0.05)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Banvit),(Carrefour),0.075236,0.215055,0.026285,0.349365,1.624542,0.010105,1.20643
1,(Carrefour),(Banvit),0.215055,0.075236,0.026285,0.122224,1.624542,0.010105,1.053531
2,(Other),(Banvit),0.531997,0.075236,0.051872,0.097504,1.295971,0.011846,1.024673
3,(Banvit),(Other),0.075236,0.531997,0.051872,0.689453,1.295971,0.011846,1.507027
4,(Eti),(Carrefour),0.054296,0.215055,0.023824,0.438769,2.040267,0.012147,1.398613
5,(Carrefour),(Eti),0.215055,0.054296,0.023824,0.110779,2.040267,0.012147,1.063519
6,(Other),(Carrefour),0.531997,0.215055,0.116638,0.219245,1.019487,0.002229,1.005367
7,(Carrefour),(Other),0.215055,0.531997,0.116638,0.542364,1.019487,0.002229,1.022653
8,(Pınar),(Carrefour),0.084218,0.215055,0.036479,0.433152,2.014147,0.018368,1.384754
9,(Carrefour),(Pınar),0.215055,0.084218,0.036479,0.169628,2.014147,0.018368,1.102857


# Recommendation

## Simple Baseline

## Item2Vec-Based Recommendation

In [33]:
dummy_df = pd.DataFrame(df.groupby(["sessionid"])['productid'].apply(lambda x: list(x)))
session_product_sequences = dummy_df["productid"].tolist()

In [34]:
import multiprocessing
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()
# Item2Vec: every session is a "sentence" and every product id a "word".
# Products that occur fewer than `min_count` times get no vector.
# NOTE(review): `size=` and `init_sims` below are the gensim 3.x spelling
# (gensim 4 uses `vector_size=`) — confirm against the installed version.
w2v_model = Word2Vec(min_count=7,
                     window=3,
                     size=100,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request zero workers
                     # (cores - 1 would be 0 on a single-core machine).
                     workers=max(1, cores - 1))

w2v_model.build_vocab(session_product_sequences, progress_per=10000)
w2v_model.train(session_product_sequences,
                total_examples=w2v_model.corpus_count,
                epochs=30,
                report_delay=1)
# Replace the raw vectors with their L2-normalised versions.
w2v_model.init_sims(replace=True)

In [35]:
product_dict = dict(zip(df["productid"], df["name"]))
product_vectors = w2v_model.wv.vectors

In [36]:
def print_similar_products(query_productid):
    """Print a product's name followed by the names of its nearest neighbours.

    Looks up the neighbours of ``query_productid`` in ``w2v_model`` and
    resolves ids to names through ``product_dict``. The query name falls back
    to ``""`` when the id is missing from ``product_dict``.

    Args:
        query_productid: Product id to query the embedding model with.
    """
    neighbours = w2v_model.wv.most_similar(positive=[query_productid])
    print(product_dict.get(query_productid, ""))

    neighbour_names = []
    for neighbour_id, _similarity in neighbours:
        neighbour_names.append(product_dict[neighbour_id])
    print(neighbour_names)

In [37]:
query_productid = "HBV00000OE882"
print_similar_products(query_productid)

Portakal Sıkmalık 500 gr
['Kivi 250 gr', 'Kiraz Salihli 500 gr', 'Limon Lamas 500 gr', 'Greyfurt Kanlı 500 gr', 'Çilek 250 gr', 'Golden Elma 500 gr', 'Şeftali Paket 500 gr', 'Kayısı Paket 500 gr', 'Granny Smith Elma 500 gr', 'Valensiya Portakal 500 gr']


In [38]:
query_productid = "HBV00000OE7UF"
print_similar_products(query_productid)

Havuç 500 gr
['Soğan Taze Demet', 'Maydanoz', 'Kıvırcık Salata Adet', 'Dereotu', 'Göbek Salata Adet', 'Nane', 'Roka', 'Karnabahar 500 gr', 'Kırmızı Lahana 750 gr', 'Semizotu']


In [39]:
query_productid = "HBV00000NVZGU"
print_similar_products(query_productid)

Dana Biftek 250 gr
['Dana Tas Kebabı 500 gr', 'Dana Antrikot 250 gr', 'Dana Sote 500 gr', 'Dana Kuşbaşı 500 gr', 'Dana Kıyma (%14-%20 Yağ) 250 gr', 'Kasap Köfte 250 gr', 'Dana Döş Sarma 500 gr', 'İnegöl Köfte 500 gr', 'Lezita Taze Poşetli Bütün Piliç 1,5 kg', 'Dana Bonfile 250 gr']


In [40]:
product_vectors.shape

(5661, 100)

# OOV Items for Item2Vec: BM25 Keyword-Based Retrieval

In [41]:
from rank_bm25 import BM25Okapi

corpus = meta_df["name"]

tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [42]:
query_id = 0
query = meta_df["name"][query_id]
print(query)
tokenized_query = query.split()
doc_scores = bm25.get_scores(tokenized_query)

Palette Kalıcı Doğal Renkler 10-4 PAPATYA


In [43]:
n=11
idx = (-doc_scores).argsort()[:n]

for i in idx:
    if i != query_id:
        print(corpus[i], doc_scores[i], meta_df.loc[i, "productid"])

Palette Kalıcı Doğal Renkler 1-0 SİYAH 22.319308303917282 HBV00000AX6LV
Palette Kalıcı Doğal Renkler 4-0 KAHVE 22.319308303917282 HBV00000AX6LH
Palette Kalıcı Doğal Renkler 1-1 GECE MAVİSİ 20.815797271222806 HBV00000AX6LF
Palette Kalıcı Doğal Renkler 5-89 GECE KIZILI 20.815797271222806 HBV00000AX6LD
Palette Kalıcı Doğal Renkler 9-4 SAHRA SARISI 20.815797271222806 HBV00000AX6LT
Palette Kalıcı Doğal Renkler 6-70 BRONZ KAHVE 20.815797271222806 HBV00000AX6L9
Palette Kalıcı Doğal Renkler 6-0 KOYU KUMRAL 20.815797271222806 HBV00000AX6LL
Palette Kalıcı Doğal Renkler 10-0 AÇIK SARI 20.815797271222806 HBV00000AX6L5
Palette Kalıcı Doğal Renkler 4-60 AÇIK ÇİKOLATA KAHVE 19.502066078898437 HBV00000AX6KZ
Palette Saç Boyası Kalıcı Doğal Renkler 4-0 Amber Kahve 18.34431558435793 HBV00000PVB3G


In [44]:
import numpy as np
from scipy.sparse import rand
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sparse_dot_topn import awesome_cossim_topn

class SessionMatcher:
    """Find, for every session, the sessions whose product baskets overlap most.

    A sparse session x product matrix is built from ``df`` and multiplied with
    its own transpose through ``awesome_cossim_topn``, keeping the ``topk``
    best-scoring sessions per row.

    Note: the matrix is passed to ``awesome_cossim_topn`` without normalising
    its rows, so ``match_score`` is the raw dot product of two sessions'
    rating vectors rather than a cosine similarity in [0, 1].

    Args:
        col: Base name for the output columns (``col`` and ``col + "_similar"``).
        df: DataFrame with ``"sessionid"``, ``"productid"`` and ``"rating"`` columns.
        topk: Maximum number of matches kept per session.
        lower_bound: Score threshold forwarded to ``awesome_cossim_topn``.
    """

    def __init__(self, col, df, topk=10, lower_bound=0.01):
        self.col = col
        self.df = df
        self.topk = topk
        self.lower_bound = lower_bound

    def get_matches_df(self, sparse_matrix, names):
        """Flatten a sparse score matrix into a long (row, column, score) table.

        Args:
            sparse_matrix: SciPy sparse matrix of pairwise scores.
            names: Unused; kept so existing callers keep working.

        Returns:
            DataFrame with columns ``self.col`` (row index),
            ``self.col + "_similar"`` (column index) and ``match_score``.
            Entries whose stored value is zero are omitted.
        """
        # Take rows, columns and values from the same COO view so the three
        # arrays are aligned by construction. Pairing `nonzero()` (which drops
        # explicitly stored zeros) with a positional walk over `.data` (which
        # keeps them) would shift the scores whenever a zero is stored.
        coo = sparse_matrix.tocoo()
        keep = coo.data != 0

        # Index columns stay object-typed, as they were before vectorisation.
        res_df = pd.DataFrame({self.col: coo.row[keep].astype(object),
                               self.col + "_similar": coo.col[keep].astype(object),
                               'match_score': coo.data[keep].astype(float)})

        return res_df

    def match(self, n_threads=8):
        """Compute the top-k most overlapping sessions for every session.

        Args:
            n_threads: Number of worker threads handed to ``awesome_cossim_topn``.

        Returns:
            Tuple ``(sessionDict, matches_df)``: ``sessionDict`` maps the integer
            row code of each session to its session id, and ``matches_df`` is the
            output of :meth:`get_matches_df`, expressed in those integer codes.
        """
        # Unique sessions (sorted) and unique products (first-seen order).
        sessions = list(np.sort(self.df["sessionid"].unique()))
        products = list(self.df["productid"].unique())

        sessions_cat_type = CategoricalDtype(categories=sessions, ordered=False)
        products_cat_type = CategoricalDtype(categories=products, ordered=False)

        # One matrix entry per event; the CSR constructor sums the ratings of
        # repeated (session, product) pairs.
        ratings = list(self.df["rating"].astype("double"))

        # Integer row / column positions of every event.
        rows = self.df["sessionid"].astype(sessions_cat_type).cat.codes
        cols = self.df["productid"].astype(products_cat_type).cat.codes

        apply_sparse = csr_matrix((ratings, (rows, cols)), shape=(len(sessions), len(products)))

        # Row code -> session id, used by callers to decode the result.
        sessionDict = dict(zip(rows, self.df["sessionid"]))

        # Session x session scores, keeping only the top-k per row above the
        # threshold. `n_threads` is honoured here; it used to be ignored in
        # favour of a hard-coded 8.
        sparse_matrix = awesome_cossim_topn(apply_sparse,
                                            apply_sparse.T,
                                            self.topk,
                                            self.lower_bound,
                                            use_threads=True,
                                            n_jobs=n_threads)

        return sessionDict, self.get_matches_df(sparse_matrix, self.df["sessionid"])

In [45]:
# Keep only rows whose session_length is greater than 1. The explicit copy makes
# the filtered frame independent, so adding a column below does not raise
# SettingWithCopyWarning.
df = df[df["session_length"] > 1].copy()
# Implicit feedback: every cart event gets a rating of 1.
df["rating"] = 1

In [46]:
%%time
# Up to 10 best-matching sessions for every session, with no score threshold.
jsm = SessionMatcher(col="sessionid", df=df, topk=10, lower_bound=0.00)
sessionDict, res_df = jsm.match(n_threads=2)

# Drop self-matches. The explicit copy makes the filtered frame independent, so
# the column assignments below do not raise SettingWithCopyWarning.
res_df = res_df[res_df["sessionid"] != res_df["sessionid_similar"]].copy()
# Translate the integer session codes back to the original session ids.
res_df["sessionid"] = res_df["sessionid"].map(sessionDict)
res_df["sessionid_similar"] = res_df["sessionid_similar"].map(sessionDict)

CPU times: user 4.65 s, sys: 27.4 ms, total: 4.68 s
Wall time: 3.79 s


In [73]:
res_df

Unnamed: 0,sessionid,sessionid_similar,match_score
0,0002e53b-1f60-4309-8380-31ca03de51f8,cbc7c09a-c30a-4573-979e-c7332ccfd578,11.0
1,0002e53b-1f60-4309-8380-31ca03de51f8,e9d17fb6-9ad5-41d0-ad8d-6711a011a464,10.0
2,0002e53b-1f60-4309-8380-31ca03de51f8,0ffeb461-228c-4032-aaa0-71263d03c8ca,10.0
3,0002e53b-1f60-4309-8380-31ca03de51f8,405b56c3-72b4-4644-9de0-a38ee514bd13,9.0
4,0002e53b-1f60-4309-8380-31ca03de51f8,10034d89-6f91-4e1c-8377-1f5bd77ad3bf,9.0
...,...,...,...
381427,ffffcd3c-da03-4667-9c75-9fcafb609c9e,a9fe9038-02f8-4a1e-990a-9efc6fb8c98b,18.0
381428,ffffcd3c-da03-4667-9c75-9fcafb609c9e,512e7e30-2118-4d8a-9975-6a9dbde1a64e,18.0
381429,ffffcd3c-da03-4667-9c75-9fcafb609c9e,b1d5296f-9107-42e0-b3bf-d420749003ce,17.0
381431,ffffcd3c-da03-4667-9c75-9fcafb609c9e,a1319b58-e16d-4c2d-8f8f-99dd0f737c17,16.0
