In [1]:
import numpy as np
import itertools
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
from scipy import sparse
%matplotlib inline

sns.set_theme(style="ticks")

In [2]:
df_all = pd.read_csv('../data/Booking/booking_train_set.csv', 
                 dtype={"user_id": str, "city_id": str, 'affiliate_id': str,
                       'utrip_id': str},parse_dates=['checkin', 'checkout'])

df_all.head()

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1


In [3]:
df = pd.read_csv('../output/booking/dataset/train_0.1_10.csv')

df.head()

Unnamed: 0,utrip_id,last_step,user_id,count_unique_city,trip_size,start_trip,end_trip,user_city_count,user_city_unique,user_trip_count,...,start_trip_is_weekend,start_trip_quarter_sin,start_trip_quarter_cos,start_trip_month_sin,start_trip_month_cos,start_trip_day_sin,start_trip_day_cos,start_trip_week_sin,start_trip_week_cos,user_features
0,1000027_1,4,1000027,4,3,2016-08-13,2016-08-18,-0.05013,-0.560293,-0.314496,...,1,-1.0,-1.83697e-16,-0.8660254,-0.5,-0.680773,-0.732494,-0.866025,0.5,"[0.16040363907814026, -0.16208097338676453, -0..."
1,1000083_1,4,1000083,4,3,2016-06-13,2016-06-16,-0.05013,-0.560293,-0.314496,...,0,1.224647e-16,-1.0,1.224647e-16,-1.0,0.296713,-0.954967,0.0,1.0,"[0.11315741389989853, -0.053704582154750824, -..."
2,100008_1,5,100008,5,4,2016-07-18,2016-07-25,0.203099,-0.325337,-0.314496,...,0,-1.0,-1.83697e-16,-0.5,-0.866025,-0.296713,-0.954967,0.0,1.0,"[0.041144490242004395, -0.17495191097259521, -..."
3,1000136_1,5,1000136,5,4,2016-10-08,2016-10-15,0.203099,-0.325337,-0.314496,...,1,-2.449294e-16,1.0,-0.8660254,0.5,-0.989932,0.14154,-0.866025,0.5,"[0.0011888369917869568, -0.1449287086725235, -..."
4,1000145_1,5,1000145,5,4,2016-07-16,2016-07-28,0.203099,-0.325337,-0.314496,...,1,-1.0,-1.83697e-16,-0.5,-0.866025,-0.263665,-0.964614,-0.866025,0.5,"[0.041662052273750305, -0.2262377291917801, -0..."


In [4]:
df['city_id_list'] = df['city_id_list'].apply(eval)
df['city_id_list']

0              [0, 0, 0, 0, 0, 0, 8183, 15626, 60902, M]
1             [0, 0, 0, 0, 0, 0, 55990, 14705, 35160, M]
2           [0, 0, 0, 0, 0, 11306, 12096, 6761, 6779, M]
3         [0, 0, 0, 0, 0, 62541, 42482, 20345, 33540, M]
4         [0, 0, 0, 0, 0, 47499, 27112, 17764, 56651, M]
                               ...                      
183995          [0, 0, 0, 0, 0, 0, 4476, 1034, 64876, M]
183996        [0, 0, 0, 0, 0, 0, 17775, 66634, 17775, M]
183997          [0, 0, 0, 0, 0, 0, 8335, 21328, 8335, M]
183998        [0, 0, 0, 0, 0, 0, 51291, 66969, 67169, M]
183999          [0, 0, 0, 0, 0, 0, 17944, 47075, 228, M]
Name: city_id_list, Length: 184000, dtype: object

## Coocorrence

In [5]:
def create_co_occurences_matrix(allowed_words, documents):
    word_to_id       = dict(zip(allowed_words, range(len(allowed_words))))
    documents_as_ids = [np.sort([word_to_id[w] for w in doc if w in word_to_id]).astype('uint32') for doc in documents]
    row_ind, col_ind = zip(*itertools.chain(*[[(i, w) for w in doc] for i, doc in enumerate(documents_as_ids)]))
    
    data        = np.ones(len(row_ind), dtype='uint32')  # use unsigned int for better memory utilization
    max_word_id = max(itertools.chain(*documents_as_ids)) + 1
    
    docs_words_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(len(documents_as_ids), max_word_id))  # efficient arithmetic operations with CSR * CSR
    words_cooc_matrix = docs_words_matrix.T * docs_words_matrix  # multiplying docs_words_matrix with its transpose matrix would generate the co-occurences matrix
    words_cooc_matrix.setdiag(0)

    return words_cooc_matrix, word_to_id 

In [6]:
item_idx = np.unique(df_all['city_id'])
lists    = list(df['city_id_list'])

In [7]:
cooc_matrix, cooc_to_idx = create_co_occurences_matrix(item_idx, lists)
cooc_to_id =  {v: k for k, v in cooc_to_idx.items()}

  self._set_arrayXarray(i, j, x)


In [8]:
uid    = cooc_to_idx['29770'] #52818
scores = np.argsort(np.array(cooc_matrix[uid].todense())[0])[::-1]
scores, np.max(scores)

(array([29690,  7751,  4223, ..., 26374, 26373,     0]), 39900)

In [9]:
[(cooc_to_id[_id], cooc_matrix[uid,_id]) for _id in scores[:10]]

[('55196', 643),
 ('21929', 569),
 ('16521', 422),
 ('52818', 338),
 ('46258', 332),
 ('48343', 326),
 ('44869', 275),
 ('36063', 239),
 ('64269', 233),
 ('37601', 214)]

In [10]:
cooc_matrix.shape

(39901, 39901)

In [11]:
def get_neighbors(uid,cooc_matrix, cooc_to_idx,cooc_to_id, top=10):
    if uid in cooc_to_idx:
        uid    = cooc_to_idx[uid] 
        scores = np.argsort(np.array(cooc_matrix[uid].todense())[0])[::-1]

        return [cooc_to_id[_id] for _id in scores[:top]],\
                    [cooc_matrix[uid,_id] for _id in scores[:top]]
    else:
        return [], []

In [12]:
uid = '29770'
items, scores = get_neighbors(uid, cooc_matrix, cooc_to_idx, cooc_to_id, top=10)
list(zip(items, scores))

[('55196', 643),
 ('21929', 569),
 ('16521', 422),
 ('52818', 338),
 ('46258', 332),
 ('48343', 326),
 ('44869', 275),
 ('36063', 239),
 ('64269', 233),
 ('37601', 214)]

# Item Neiborhods  - KDTree

In [5]:
from sklearn.model_selection import train_test_split

df_trip = df_all[['utrip_id']].drop_duplicates()
df_train, df_test = train_test_split(df_trip, test_size=0.1, random_state=42)
df_train, df_test = df_all[df_all['utrip_id'].isin(df_train['utrip_id'])].sort_values('checkin'), \
                    df_all[df_all['utrip_id'].isin(df_test['utrip_id'])].sort_values('checkin')
print(df_train.shape, df_test.shape)

df_train['visit'] = 1
df_train.head()

(1049635, 10) (117200, 10)


Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,visit
7504,7504,2595109,2015-12-31,2016-01-01,27404,mobile,359,The Devilfire Empire,Cobra Island,2595109_1,1
986596,986596,2000964,2015-12-31,2016-01-01,63341,mobile,8151,The Devilfire Empire,Cobra Island,2000964_1,1
1104472,1104472,2379328,2016-01-01,2016-01-02,65663,mobile,3449,Tcherkistan,Oceania,2379328_1,1
788050,788050,1379517,2016-01-01,2016-01-03,47499,mobile,7360,Elbonia,Kangan,1379517_1,1
765551,765551,2147992,2016-01-01,2016-01-04,51259,mobile,9452,Gondal,Oceania,2147992_1,1


In [23]:
df_train = df_train.sample(1000)

In [24]:
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import KDTree
from scipy.spatial import KDTree as KDT

def create_sparse_matrix(df: pd.DataFrame, col_row, col_col):

    item_c    = CategoricalDtype(sorted(df[col_row].unique()), ordered=True)
    session_c = CategoricalDtype(sorted(df[col_col].unique()), ordered=True)

    row = df[col_row].astype(item_c).cat.codes
    col = df[col_col].astype(session_c).cat.codes

    sparse_matrix = csr_matrix((df["visit"], (row, col)), \
                            shape=(item_c.categories.size, session_c.categories.size))
    return sparse_matrix


In [25]:
item_idx = np.unique(df_train['city_id'])
lists    = list(df['city_id_list'])
sparse_matrix = create_sparse_matrix(df_train,'city_id', 'utrip_id')
sparse_matrix

<689x998 sparse matrix of type '<class 'numpy.int64'>'
	with 1000 stored elements in Compressed Sparse Row format>

In [26]:
np.asarray(sparse_matrix.todense())

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [39]:
X    = sparse_matrix.todense()
tree = KDTree(X, leaf_size=10)

dist, ind = tree.query(X[:2], k=3)   
dist, ind

(array([[0.        , 1.41421356, 1.41421356],
        [0.        , 1.73205081, 1.73205081]]), array([[  0,  15, 169],
        [  1,  15, 169]]))

In [45]:
matrix_item_idx  = dict(zip(item_idx, list(range(len(item_idx)))))
matrix_item_id   = {v: k for k, v in matrix_item_idx.items()}
sparse_matrix    = sparse_matrix
#np.fill_diagonal(cos_matrix, 0)
#cos_matrix       = sparse.csr_matrix(cos_matrix)

In [50]:
uid = '29770'
uid    = matrix_item_idx[uid] 
dist, ind = tree.query(np.array(sparse_matrix[uid].todense()), k=3)   
dist, ind 

(array([[0., 2., 2.]]), array([[232, 214, 213]]))

In [61]:
def get_neighbors_tree(uid, tree, sparse_matrix, cooc_to_idx, cooc_to_id, top=10):
    if uid in cooc_to_idx:
        uid      = cooc_to_idx[uid] 
        
        dist, ind = tree.query(np.array(sparse_matrix[uid].todense()), k=top+1)   

        return [cooc_to_id[_id] for _id in ind[0][1:]],\
                    dist[0][1:]
    else:
        return [], []

In [62]:
uid = '29770'
items, scores = get_neighbors_tree(uid, tree, sparse_matrix, matrix_item_idx, matrix_item_id, top=10)
list(zip(items, scores))

[('2808', 2.0),
 ('28496', 2.0),
 ('28479', 2.0),
 ('2813', 2.0),
 ('28115', 2.0),
 ('28154', 2.0),
 ('28455', 2.0),
 ('27634', 2.0),
 ('27695', 2.0),
 ('28968', 2.0)]

## Session Similarity

In [13]:
from sklearn.model_selection import train_test_split

df_trip = df_all[['utrip_id']].drop_duplicates()
df_train, df_test = train_test_split(df_trip, test_size=0.1, random_state=42)
df_train, df_test = df_all[df_all['utrip_id'].isin(df_train['utrip_id'])].sort_values('checkin'), \
                    df_all[df_all['utrip_id'].isin(df_test['utrip_id'])].sort_values('checkin')
print(df_train.shape, df_test.shape)

df_train['visit'] = 1
df_train.head()

(1049635, 10) (117200, 10)


Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id,visit
7504,7504,2595109,2015-12-31,2016-01-01,27404,mobile,359,The Devilfire Empire,Cobra Island,2595109_1,1
986596,986596,2000964,2015-12-31,2016-01-01,63341,mobile,8151,The Devilfire Empire,Cobra Island,2000964_1,1
1104472,1104472,2379328,2016-01-01,2016-01-02,65663,mobile,3449,Tcherkistan,Oceania,2379328_1,1
788050,788050,1379517,2016-01-01,2016-01-03,47499,mobile,7360,Elbonia,Kangan,1379517_1,1
765551,765551,2147992,2016-01-01,2016-01-04,51259,mobile,9452,Gondal,Oceania,2147992_1,1


In [14]:
# sample_days = 500
# test_days = 30
# # Split Data
# max_timestamp        = df_all.checkout.max()
# init_train_timestamp = max_timestamp - timedelta(days = sample_days)
# init_test_timestamp  = max_timestamp - timedelta(days = test_days)

# # TODO Garantir que o usuário fique com a sessão no train ou test
# df_train = df_all[(df_all.checkout >= init_train_timestamp) & (df_all.checkout < init_test_timestamp)]

# df_train['visit'] = 1
# df_train.head()

In [15]:
from pandas.api.types import CategoricalDtype

def create_sparse_matrix(df: pd.DataFrame, col_row, col_col):

    item_c    = CategoricalDtype(sorted(df[col_row].unique()), ordered=True)
    session_c = CategoricalDtype(sorted(df[col_col].unique()), ordered=True)

    row = df[col_row].astype(item_c).cat.codes
    col = df[col_col].astype(session_c).cat.codes

    sparse_matrix = csr_matrix((df["visit"], (row, col)), \
                            shape=(item_c.categories.size, session_c.categories.size))
    return sparse_matrix


In [16]:
item_idx = np.unique(df_train['city_id'])
lists    = list(df['city_id_list'])
sparse_matrix = create_sparse_matrix(df_train,'city_id', 'utrip_id')
sparse_matrix

<38542x195917 sparse matrix of type '<class 'numpy.int64'>'
	with 930661 stored elements in Compressed Sparse Row format>

In [17]:
matrix_item_idx  = dict(zip(item_idx, list(range(len(item_idx)))))
matrix_item_id   = {v: k for k, v in matrix_item_idx.items()}
sparse_matrix    = sparse_matrix
cos_matrix       = cosine_similarity(sparse_matrix)
np.fill_diagonal(cos_matrix, 0)
cos_matrix       = sparse.csr_matrix(cos_matrix)

In [18]:
cos_matrix

<38542x38542 sparse matrix of type '<class 'numpy.float64'>'
	with 1310568 stored elements in Compressed Sparse Row format>

In [19]:
cos_matrix.shape

(38542, 38542)

In [20]:
# def get_neighbors(uid,cooc_matrix, cooc_to_idx,cooc_to_id, top=10):
#     uid    = cooc_to_idx[uid] 
#     scores = np.argsort(np.array(cooc_matrix[uid].todense())[0])[::-1]
    
#     return [(cooc_to_id[_id], cooc_matrix[uid,_id]) for _id in scores[:top]]

uid = '29770'
items, scores = get_neighbors(uid, cos_matrix, matrix_item_idx, matrix_item_id, top=10)
list(zip(items, scores))

[('55196', 0.24882198273062126),
 ('48343', 0.20464527077161979),
 ('46258', 0.20346990117670205),
 ('37601', 0.17198514232928655),
 ('52818', 0.1684835559978779),
 ('21929', 0.16547570960762317),
 ('2201', 0.15737139444466414),
 ('53831', 0.14395272344687032),
 ('19333', 0.1376106689462748),
 ('53363', 0.1282264589232733)]

## Session With Country

In [21]:
item_idx = np.unique(df_train['city_id'])
lists    = list(df['city_id_list'])
sparse_matrix_c = create_sparse_matrix(df_train,'city_id', 'hotel_country')
sparse_matrix_c

<38542x194 sparse matrix of type '<class 'numpy.int64'>'
	with 38542 stored elements in Compressed Sparse Row format>

In [22]:
from scipy.sparse import hstack

sparse_matrix = hstack((sparse_matrix, sparse_matrix_c))
sparse_matrix

<38542x196111 sparse matrix of type '<class 'numpy.int64'>'
	with 969203 stored elements in COOrdinate format>

In [23]:
matrix_item_idx  = dict(zip(item_idx, list(range(len(item_idx)))))
matrix_item_id   = {v: k for k, v in matrix_item_idx.items()}
sparse_matrix    = sparse_matrix
cos_matrix_c       = cosine_similarity(sparse_matrix)
np.fill_diagonal(cos_matrix_c, 0)
cos_matrix_c       = sparse.csr_matrix(cos_matrix_c)

In [24]:
cos_matrix_c

<38542x38542 sparse matrix of type '<class 'numpy.float64'>'
	with 95869942 stored elements in Compressed Sparse Row format>

In [25]:
# def get_neighbors(uid,cooc_matrix, cooc_to_idx,cooc_to_id, top=10):
#     uid    = cooc_to_idx[uid] 
#     scores = np.argsort(np.array(cooc_matrix[uid].todense())[0])[::-1]
    
#     return [(cooc_to_id[_id], cooc_matrix[uid,_id]) for _id in scores[:top]]

uid = '29770'
items, scores = get_neighbors(uid, cos_matrix_c, matrix_item_idx, matrix_item_id, top=10)
list(zip(items, scores))

[('52818', 0.999653732787077),
 ('44869', 0.9995221146702377),
 ('46258', 0.9994775585730016),
 ('19333', 0.9993984561367825),
 ('53859', 0.99935079526557),
 ('4021', 0.9992434997448933),
 ('53363', 0.9992352798493708),
 ('37601', 0.9992160895788205),
 ('18083', 0.9991620780586361),
 ('67353', 0.9991615441041853)]

## Save

In [26]:
df_all.head()

Unnamed: 0.1,Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,0,1006220,2016-04-09,2016-04-11,31114,desktop,384,Gondal,Gondal,1006220_1
1,1,1006220,2016-04-11,2016-04-12,39641,desktop,384,Gondal,Gondal,1006220_1
2,2,1006220,2016-04-12,2016-04-16,20232,desktop,384,Gondal,Glubbdubdrib,1006220_1
3,3,1006220,2016-04-16,2016-04-17,24144,desktop,384,Gondal,Gondal,1006220_1
4,4,1010293,2016-07-09,2016-07-10,5325,mobile,359,The Devilfire Empire,Cobra Island,1010293_1


In [27]:
%%time 

top_k     = 50
cities_id = list(np.unique(df_all['city_id']))
dict_cooc_map   = {c: cities_id for c in cities_id}
dict_kdtree_map   = {c: cities_id for c in cities_id}
dict_sim_map    = {c: cities_id for c in cities_id}
dict_sim_map_c  = {c: cities_id for c in cities_id}

uid = '29770'

for uid in cities_id:
    items, scores = get_neighbors(uid, cooc_matrix, cooc_to_idx, cooc_to_id, top=top_k)
    if len(items) > 0:
        dict_cooc_map[uid] = items
    
    items, scores = get_neighbors_tree(uid, tree, sparse_matrix, matrix_item_idx, matrix_item_id, top=top_k)
    if len(items) > 0:
        dict_kdtree_map[uid] = items
    
    items, scores = get_neighbors(uid, cos_matrix, matrix_item_idx, matrix_item_id, top=top_k)
    if len(items) > 0:
        dict_sim_map[uid]  = items
        
    items, scores = get_neighbors(uid, cos_matrix_c, matrix_item_idx, matrix_item_id, top=top_k)
    if len(items) > 0:
        dict_sim_map_c[uid]  = items        

CPU times: user 28min 50s, sys: 1.27 s, total: 28min 51s
Wall time: 28min 56s


In [28]:
import pickle 

# SAVE
with open("../output/booking/dataset/neighbors_dict_cooc_map.pkl", "wb") as pkl_handle:
    pickle.dump(dict_cooc_map, pkl_handle)

with open("../output/booking/dataset/neighbors_dict_kdtree_map.pkl", "wb") as pkl_handle:
    pickle.dump(dict_kdtree_map, pkl_handle)

with open("../output/booking/dataset/neighbors_dict_sim_map.pkl", "wb") as pkl_handle:
    pickle.dump(dict_sim_map, pkl_handle)
    
with open("../output/booking/dataset/neighbors_dict_sim_map_c.pkl", "wb") as pkl_handle:
    pickle.dump(dict_sim_map_c, pkl_handle)    
# # LOAD
# with open("../output/booking/dataset/data.pkl", "rb") as pkl_handle:
# 	output = pickle.load(pkl_handle)

In [29]:
1

1