In [248]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from tqdm import tqdm
import pickle as pkl

In [48]:
test_df = pd.read_csv('/Users/vahid/data/recommender/test.csv.gz')
test_df.head()

Unnamed: 0,user_id,notif_id
0,8118012,525640
1,8077471,528428
2,3593257,528037
3,7250906,526292
4,7885672,526710


In [59]:
train_df = pd.read_csv('/Users/vahid/data/recommender/train_interactions.csv')
train_df.head()

Unnamed: 0,user_id,notif_id,interaction,interaction_dow,interaction_hour,interaction_min,delivery_dow,delivery_hour,delivery_min
0,654408,468552,0,1,1,57,1,1,55
1,9272634,517721,0,1,0,15,1,0,1
2,380089,519842,0,1,0,21,1,0,20
3,2586969,410941,0,1,2,12,1,1,57
4,491160,463087,0,1,1,37,1,1,27


In [194]:
train_notifs_df = train_df.groupby('notif_id').count()
train_notifs_df = train_notifs_df[['user_id']]

In [205]:
train_notifs_df.columns = [['count']]
train_notifs_df.head()

Unnamed: 0_level_0,count
notif_id,Unnamed: 1_level_1
406173,25169
406289,39408
406320,88494
408582,25227
410149,11301


In [3]:
notifs_text_df = pd.read_csv('/Users/vahid/data/recommender/notifs_corrected.csv')
notifs_text_df.head()

Unnamed: 0,notif_id,day_of_week,hour,minute,category,text
0,568156,6,17,30,7,135 37 8 39 105 1503 1504 25 1 161 35 213
1,567822,6,15,46,7,72 104 1 139 93 95 137 66 537 332 139 93 495 1...
2,567810,6,15,45,7,72 104 163 86 72 311 1712 1
3,567886,6,15,54,5,198 221 1426 538 1713 54 3 27 1714 716 38 145 ...
4,568058,6,16,13,7,72 104 1 139 93 95 137 66 537 332 139 93 495 1...


In [6]:
notifs_icons_df = pd.read_csv('/Users/vahid/data/recommender/icons.csv')
notifs_icons_df.set_index('notif_id', inplace=True)
notifs_icons_df.head()

Unnamed: 0_level_0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F191,F192,F193,F194,F195,F196,F197,F198,F199,F200
notif_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
447519,-4.966424,11.489478,4.418155,-8.264528,0.48345,-8.72612,-9.982015,-2.117726,-1.253858,3.31589,...,-0.165652,0.23625,-0.536746,0.074008,-0.198941,-0.087212,0.135536,-0.169727,0.010327,-0.124046
441851,1.695928,-9.561707,-1.148592,-7.54565,-8.476706,5.449286,3.271107,3.806645,1.922784,0.621298,...,-0.284256,-0.293422,-0.112613,-0.665172,-0.145205,-0.445586,-0.055238,0.116369,-0.637553,-0.056821
483576,-3.164493,4.96009,-3.474825,-2.931745,1.532727,-2.396772,10.59493,1.161292,0.133073,-1.898646,...,0.108387,-0.069454,0.136111,0.115855,0.003308,0.032245,0.04525,-0.091309,-0.008784,0.30049
564340,17.057026,-1.445091,-15.867423,2.147889,-7.040503,1.373868,-4.36583,0.633132,-3.303665,2.526653,...,-0.168556,0.110151,-0.164698,0.242161,0.055488,0.050694,0.190996,-0.144006,0.439409,0.108422
549636,-10.604857,4.211807,-3.017962,-3.874277,-4.608191,-5.410021,-5.191367,1.042329,-4.308573,-5.806875,...,0.41335,-0.187556,-0.132629,0.991428,1.060254,-0.262702,-0.447537,0.092006,0.370397,0.114488


In [90]:
word2id = {}
id2word = {}
nans = 0
nan_set = set()
min_length = 1000
for index,row in notifs_text_df.iterrows():
    if type(row.text) == float and math.isnan(row.text):
        nans += 1
        nan_set.add(row.notif_id)
        continue
    if len(row.text.split(' ')) < min_length:
        min_length = len(row.text.split(' '))
    for word in row.text.split(' '):
        word = int(word)
        if not word in word2id:
            _id = len(word2id)
            word2id[word] = _id
            id2word[_id] = word
print('there are', len(word2id), ' words')
print(nans, 'notifs have no text', nan_set)
print(notifs_text_df.shape[0] - nans, 'notifs have texts')
notifs_with_text_count = notifs_text_df.shape[0] - nans
print('min length of txts', min_length)

there are 1999  words
5 notifs have no text {465926, 442919, 455698, 465949, 416446}
6342 notifs have texts
min length of txts 1


In [91]:
notifs_text_vecs = np.zeros([notifs_with_text_count, len(word2id)])
idx = 0
notif2id = {}
id2notif = {}
for index,row in notifs_text_df.iterrows():
    if type(row.text) == float and math.isnan(row.text):
        nans += 1
        nan_set.add(row.notif_id)
        continue
    
    for word in row.text.split(' '):
        word = int(word)
        word_id = word2id[word]
        notifs_text_vecs[idx, word_id] = 1
    notif2id[row.notif_id] = idx
    id2notif[idx] = row.notif_id
    idx += 1
print('notifs_text_vecs.shape', notifs_text_vecs.shape)

notifs_text_vecs.shape (6342, 1999)


In [103]:
###smiliars_bsedon_text###
text_similarities = cosine_similarity(notifs_text_vecs)
np.fill_diagonal(text_similarities,-1)
smiliars_bsedon_text = {}
TEXT_SIMILARITY_THRESHOLD = 0.9
c = 0
for notif_id, idx in notif2id.items():
    similars = set()
    for i in range(text_similarities.shape[0]):
        if text_similarities[idx, i] > TEXT_SIMILARITY_THRESHOLD:
            similars.add((id2notif[i], np.round(text_similarities[idx, i], decimals=4)))
    smiliars_bsedon_text[notif_id] = similars
    if c < 5:
        print(notif_id, smiliars_bsedon_text[notif_id])
        c += 1

568156 {(541757, 0.9167), (493099, 0.9167), (493084, 0.9167), (493130, 0.9167), (474326, 0.9167)}
567822 {(567436, 1.0), (565641, 1.0), (563816, 1.0), (560501, 1.0), (569448, 1.0), (564738, 1.0), (568043, 1.0), (560868, 1.0), (569503, 1.0), (568128, 1.0), (564557, 1.0), (564583, 1.0), (564711, 1.0), (568853, 1.0), (560277, 1.0), (560323, 1.0), (562709, 1.0), (564432, 1.0), (564688, 1.0), (559374, 1.0), (560818, 1.0), (567218, 1.0), (564894, 1.0), (568058, 1.0), (560276, 1.0), (565521, 1.0), (565616, 1.0), (560279, 1.0), (560325, 1.0), (561139, 1.0), (564460, 1.0), (564742, 1.0), (563964, 1.0), (562648, 1.0)}
567810 set()
567886 {(567944, 1.0), (567909, 1.0), (567988, 1.0), (568010, 1.0)}
568058 {(567436, 1.0), (565641, 1.0), (563816, 1.0), (560501, 1.0), (569448, 1.0), (564738, 1.0), (568043, 1.0), (560868, 1.0), (569503, 1.0), (568128, 1.0), (564557, 1.0), (564583, 1.0), (564711, 1.0), (568853, 1.0), (560277, 1.0), (560323, 1.0), (562709, 1.0), (564432, 1.0), (564688, 1.0), (567822, 1

In [102]:
test_notifs = test_df.groupby('notif_id', as_index=False).count().notif_id.values
c = 0
for test_notif in test_notifs:
    if test_notif in notif2id:
        c += 1
print(c == 189, ';if true, means all notifs in test have text')

True ;if true, means all notifs in test have text


In [107]:
text_similars_counts = []
for test_notif in test_notifs:
    text_similars_counts.append(len(smiliars_bsedon_text[test_notif]))
test_notfis_df = pd.DataFrame(data={'notif_id': test_notifs, 'text_similars_counts': text_similars_counts})
print(test_notfis_df[test_notfis_df.text_similars_counts > 0].shape[0], 'notifs with more than 1 similar notifs based on text')
test_notfis_df.head(5)

151 notifs with more than 1 similar notifs based on text


Unnamed: 0,notif_id,text_similars_counts
0,525584,18
1,525640,80
2,525662,80
3,525679,14
4,525683,80


In [125]:
notif2id = {}
id2notif = {}
notifs_icons_vecs = np.zeros([notifs_icons_df.shape[0], 200])
for index,row in notifs_icons_df.iterrows():
    notif_id = index
    _id = len(notif2id)
    notif2id[notif_id] = _id
    id2notif[_id] = notif_id
    notifs_icons_vecs[notif2id[notif_id]] = row.values
notifs_icons_vecs.shape

(6279, 200)

In [130]:
###smiliars_bsedon_icons###
icons_similarities = euclidean_distances(notifs_icons_vecs)
icons_similarities = (icons_similarities - icons_similarities.min()) / (icons_similarities.max() - icons_similarities.min())

np.fill_diagonal(icons_similarities,1)
smiliars_bsedon_icons = {}
ICON_SIMILARITY_THRESHOLD = 0.1
c = 0
for notif_id, idx in notif2id.items():
    similars = set()
    for i in range(icons_similarities.shape[0]):
        if icons_similarities[idx, i] < ICON_SIMILARITY_THRESHOLD:
            similars.add((id2notif[i], np.round(icons_similarities[idx, i], decimals=4)))
    smiliars_bsedon_icons[notif_id] = similars
    if c < 2:
        print(notif_id, smiliars_bsedon_icons[notif_id])
        c += 1

447519 {(463099, 0.0673), (460505, 0.0517), (499006, 0.0673), (461205, 0.0673), (496574, 0.0673), (500332, 0.0673), (461512, 0.0673), (499759, 0.0673), (417446, 0.0647), (412975, 0.0673), (433220, 0.0535), (497373, 0.0673), (448610, 0.0673), (433266, 0.0535), (467452, 0.0673), (499734, 0.0673), (461467, 0.0673), (456446, 0.0647), (486028, 0.0673), (499683, 0.0673), (462092, 0.0673), (485564, 0.0512), (433240, 0.0535), (497318, 0.0673), (423293, 0.0673), (496683, 0.0673), (567575, 0.0673), (499694, 0.0673), (412997, 0.0673), (520988, 0.0673), (520983, 0.0673), (412982, 0.0673), (417389, 0.0647), (412992, 0.0673), (496551, 0.0673), (467505, 0.0673), (417297, 0.0647), (520943, 0.0673), (500785, 0.0673), (412998, 0.0673), (521015, 0.0673), (499685, 0.0673), (425285, 0.0647), (467475, 0.0673), (502300, 0.0561), (502298, 0.0517), (496440, 0.0647), (412952, 0.0673), (496531, 0.0673), (500525, 0.0673), (497779, 0.0647), (502153, 0.0673), (496608, 0.0673), (496711, 0.0673), (496501, 0.0673), (4

In [140]:
test_notfis_df['icon_similars_counts'] = test_notfis_df.notif_id.apply(lambda x: len(smiliars_bsedon_icons[x]) if x in smiliars_bsedon_icons else 0)
print(test_notfis_df[test_notfis_df.icon_similars_counts > 0].shape[0], 'notifs with more than 1 similar notifs based on icons')
print(test_notfis_df[(test_notfis_df.icon_similars_counts > 0) & (test_notfis_df.text_similars_counts > 0)].shape[0], 'notifs with more than 1 similar notifs based on text and icons')
test_notfis_df.head(5)

171 notifs with more than 1 similar notifs based on icons
143 notifs with more than 1 similar notifs based on text and icons


Unnamed: 0,notif_id,text_similars_counts,icon_similars_counts
0,525584,18,19
1,525640,80,38
2,525662,80,38
3,525679,14,13
4,525683,80,38


In [217]:
def special_intersection(set_a, set_b, global_impact = True):
    intersection = set()
    for a in set_a:
        for b in set_b:
#             print(a[0])
#             print(train_notifs_df[train_notifs_df.notif_id == a[0]])
            if a[0] == b[0]:
                if global_impact == False:
                    intersection.add((a[0], a[1], b[1]))
                elif b[0] in train_notifs_df.index and train_notifs_df.loc[b[0]].values[0] > 0:
                    intersection.add((a[0], a[1], b[1]))
                break
    return intersection

In [218]:
###similars_basedon_icons_and_texts###
intersections_counts = []
similars_basedon_icons_and_texts = {}
for index, row in test_notfis_df.iterrows():
    intersection = set()
    notif_id = row.notif_id
    if notif_id in smiliars_bsedon_icons and notif_id in smiliars_bsedon_text:
        intersection = special_intersection(smiliars_bsedon_icons[notif_id], smiliars_bsedon_text[notif_id])
    similars_basedon_icons_and_texts[notif_id] = intersection
    intersections_counts.append(len(intersection))
test_notfis_df['sim_icon_text_count'] = intersections_counts
print(test_notfis_df[test_notfis_df.sim_icon_text_count > 0].shape[0], 'that i really like between 189 notifs! (they have similars in train)')

94 that i really like between 189 notifs! (they have similars in train)


In [219]:
test_notfis_df.head(5)

Unnamed: 0,notif_id,text_similars_counts,icon_similars_counts,sim_icon_text_count
0,525584,18,19,13
1,525640,80,38,3
2,525662,80,38,3
3,525679,14,13,12
4,525683,80,38,3


In [250]:
special_test_notifs = test_notfis_df.copy()
special_test_notifs = special_test_notifs[special_test_notifs.sim_icon_text_count > 0]
special_test_notifs.shape

(94, 4)

In [251]:
mutual_users_union = []
mutual_users_min = []
for index, row in tqdm(special_test_notifs.iterrows()):
    test_notif_id = row.notif_id
    test_notifs_for_this_test_notif = set(test_df[test_df.notif_id == row.notif_id].user_id.unique())
    union = set()
    for similar in similars_basedon_icons_and_texts[test_notif_id]:
        train_users_for_this_similar_notif = set(train_df[train_df.notif_id == similar[0]].user_id.unique())
#         intersection = train_users_for_this_similar_notif.intersection(test_notifs_for_this_test_notif)
        union = train_users_for_this_similar_notif.union(union)
    mutual_users_union.append(len(union.intersection(test_notifs_for_this_test_notif)) / len(test_notifs_for_this_test_notif))
special_test_notifs['mutual_users_union'] = mutual_users_union

94it [06:59,  4.46s/it]


In [254]:
special_test_notifs.head(100)

Unnamed: 0,notif_id,text_similars_counts,icon_similars_counts,sim_icon_text_count,mutual_users_union
0,525584,18,19,13,0.769388
1,525640,80,38,3,0.007146
2,525662,80,38,3,0.004404
3,525679,14,13,12,0.026390
4,525683,80,38,3,0.001133
5,525693,35,34,8,0.008533
6,525698,35,34,8,0.010228
7,525702,35,34,8,0.010191
9,525713,35,34,8,0.009485
10,525750,35,34,8,0.007946


In [252]:
#things2save = [
#similars_basedon_icons_and_texts, dict
#special_test_notifs, dict
#similars_basedon_icons, csv
#similars_basedon_texts, smiliars_bsedon_text dict
#]

In [253]:
count = 0
for index, row in tqdm(special_test_notifs.iterrows()):
    test_notif_id = row.notif_id
    count += len(set(test_df[test_df.notif_id == row.notif_id].user_id.unique()))
print(count / test_df.shape[0])

94it [00:03, 27.65it/s]

0.3393069866718493



