In [200]:
import numpy as np
import pandas as pd
import keras
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [133]:
train_df = pd.read_csv('/Users/vahid/data/recommender/train_interactions.csv')

In [134]:
train_df.drop(columns=['interaction_min', 'delivery_min','interaction_dow','interaction_hour','delivery_dow','delivery_hour'], inplace=True)

In [157]:
test_df = pd.read_csv('/Users/vahid/data/recommender/test.csv.gz')

In [185]:
users_df = pd.read_csv('/Users/vahid/data/recommender/users.csv')
users_df.drop(columns=['C1','C2','C3','C4','C5','C6'],inplace=True)
users_df.head()

Unnamed: 0,user_id,N1,N2,N3
0,2,1,,
1,3,10,1.0,
2,4,2,17.0,
3,5,3,7.0,
4,6,28,296.0,25.0


In [190]:
users_df.fillna(users_df.mean(),inplace=True)
users_df.head()

Unnamed: 0,user_id,N1,N2,N3
0,2,1,87.974051,51.098252
1,3,10,1.0,51.098252
2,4,2,17.0,51.098252
3,5,3,7.0,51.098252
4,6,28,296.0,25.0


In [202]:
user2features = {}
for index, row in tqdm(users_df.iterrows()):
    user2features[row.user_id] = np.array([row.N1, row.N2, row.N3])

4009914it [05:51, 11413.62it/s]


In [177]:
test_df.head()

Unnamed: 0,user_id,notif_id
0,8118012,525640
1,8077471,528428
2,3593257,528037
3,7250906,526292
4,7885672,526710


In [135]:
notif_df = pd.read_csv('/Users/vahid/data/recommender/notifs_corrected.csv')
notif_df.head()

Unnamed: 0,notif_id,day_of_week,hour,minute,category,text
0,568156,6,17,30,7,135 37 8 39 105 1503 1504 25 1 161 35 213
1,567822,6,15,46,7,72 104 1 139 93 95 137 66 537 332 139 93 495 1...
2,567810,6,15,45,7,72 104 163 86 72 311 1712 1
3,567886,6,15,54,5,198 221 1426 538 1713 54 3 27 1714 716 38 145 ...
4,568058,6,16,13,7,72 104 1 139 93 95 137 66 537 332 139 93 495 1...


In [243]:
notif2features = {}
notifs_vecs = []
notif_ids = []
for index, row in tqdm(notif_df.iterrows()):
    if type(row.text) == str:
        words = [int(word) for word in row.text.split(' ')]
        vec = np.zeros(2000)
        vec[words] = 1
        notif2features[row.notif_id] = vec
        notifs_vecs.append(vec)
        notif_ids.append(row.notif_id)
notifs_vecs = np.array(notifs_vecs)

6347it [00:00, 6929.10it/s]


In [221]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
notifs_small_vecs = pca.fit_transform(notifs_vecs)
for idx,notif_id in enumerate(notif_ids):
    notif2features[notif_id] = notifs_small_vecs[idx]

In [137]:
notif2dow = {}
notif2hour = {}
for index, row in notif_df.iterrows():
    notif2hour[row.notif_id] = row.hour

In [261]:
notif_df_filtered = notif_df[(notif_df.category == 3)]
notif_df_filtered.shape

(915, 6)

In [262]:
notifs_selected = notif_df_filtered.notif_id.unique()
print(notifs_selected)
notifs_selected.shape

[568117 469414 467293 468366 471292 567450 567526 473531 473519 473522
 475404 474607 474609 475454 475469 474902 475099 475023 476214 475351
 475624 475642 473533 475742 475850 475865 475860 475862 476276 474094
 475875 476657 476780 476814 476827 567440 475650 478257 475825 477363
 476705 479078 479129 479279 479305 479374 479538 479389 478146 482797
 483424 486536 485019 484619 486337 486365 484357 485445 485958 485966
 485996 485333 482795 486555 482796 483013 483175 483174 483058 483235
 483236 483334 483387 483343 483346 483368 486417 483730 483997 414617
 483988 483984 484009 484083 484067 484072 484144 484163 486236 484397
 486303 489381 414779 484478 415928 415987 415963 452928 487181 416266
 416260 416287 416293 416297 416318 416277 416301 416333 416336 416298
 416354 484881 416369 416377 416390 416441 416428 416459 416458 416472
 416487 452717 416533 416532 413732 416422 416290 416267 416962 417199
 417242 417245 417257 417248 417258 417289 416430 452700 488840 416467
 41735

(915,)

In [263]:
test_df[(test_df.notif_id.isin(notifs_selected))].shape

(938179, 2)

In [264]:
train_df.head()

Unnamed: 0,user_id,notif_id,interaction
0,654408,468552,0
1,9272634,517721,0
2,380089,519842,0
3,2586969,410941,0
4,491160,463087,0


In [265]:
train_df_filtered = train_df[train_df.notif_id.isin(notifs_selected)].copy()
train_df_filtered.shape

(13148959, 3)

In [266]:
train_df_filtered.head()

Unnamed: 0,user_id,notif_id,interaction
8,1644236,517398,0
14,6726471,414617,0
18,10272948,463254,0
19,5245236,521158,1
37,429251,463914,0


In [267]:
USERS_FEATURES = 3
NOTIFS_FEATURES = 2000

In [274]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, data, batch_size=32, shuffle=True):
        'Initialization'
        self.batch_size = batch_size
        self.data = data
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return self.data.shape[0] // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        Y = self.data[indexes,2]
        _X = self.data[indexes]
        X = np.zeros([_X.shape[0], USERS_FEATURES + NOTIFS_FEATURES])
        X[:,0:3] = [user2features[user_id] for user_id in _X[:,0]]
        X[:,3:2003] = [notif2features[notif_id] for notif_id in _X[:,1]]
        weight = np.array([0.05 if y == 0 else 0.95 for y in Y])
        return X, Y, weight

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.data.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indexes)


In [276]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
# Datasets
# partition = # IDs
# labels = # Labels

# Generators
training_generator = DataGenerator(train_df_filtered.values, shuffle=True, batch_size=32)
validation_generator = DataGenerator(train_df_filtered.values, shuffle=True, batch_size=32)

input_dim = USERS_FEATURES + NOTIFS_FEATURES
model = Sequential([
    Dense(256, input_shape=(input_dim,)),
    Activation('relu'),
    Dense(64, input_shape=(input_dim,)),
    Activation('relu'),
    Dense(32, input_shape=(input_dim,)),
    Activation('relu'),
    Dense(16, input_shape=(input_dim,)),
    Activation('relu'),
    Dense(1),
    Activation('softmax'),
])

# For a mean squared error regression problem
model.compile(optimizer='adam', loss='binary_crossentropy')


# Design model
# model = Sequential()
# [...] # Architecture
# model.compile()

# Train model on dataset
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=7,
#                     class_weight={0: 0.05, 1: 0.95},
                    epochs=100)


Epoch 1/100
 44150/410904 [==>...........................] - ETA: 1:39:38 - loss: 0.7579

Process ForkPoolWorker-3296:
Process ForkPoolWorker-3297:
Process ForkPoolWorker-3295:
Process ForkPoolWorker-3299:
Process ForkPoolWorker-3298:
Process ForkPoolWorker-3300:
Process ForkPoolWorker-3301:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    sel

KeyboardInterrupt: 