In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd "/content/drive/My Drive/Hackathon/Recommendation system/SR-GNN/pytorch_code/"

/content/drive/My Drive/Hackathon/Recommendation system/SR-GNN/pytorch_code


In [None]:
!pip install DotMap



In [None]:
# Preprocessing
import pandas as pd
train_df = pd.read_csv('/content/drive/My Drive/Hackathon/Recommendation system/train_mddNHeX/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/Hackathon/Recommendation system/test_HLxMpl7/test.csv')

In [None]:
challenge_id_mapping = {}
counter = 1
for challenge in set(train_df['challenge'].tolist() + test_df['challenge'].tolist()):
  challenge_id_mapping[challenge] = counter
  counter += 1

In [None]:
train_df['challenge'] = train_df['challenge'].map(challenge_id_mapping)
test_df['challenge'] = test_df['challenge'].map(challenge_id_mapping)

In [None]:
N_NODE = len(challenge_id_mapping) + 1

In [None]:
train_df.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,4849
1,4576_2,4576,2,5501
2,4576_3,4576,3,1212
3,4576_4,4576,4,1873
4,4576_5,4576,5,2835


In [None]:
def prepare_dataset(challenge):
    '''Convert a session into multiple sessions iteratively'''
    list_sess = list(challenge)
    sess_length = len(list_sess)
    if sess_length < 2:
        return None
    if sess_length == 2:
        return [list_sess]
    else:
        return_list = []
        for i in range(sess_length - 1):
            return_list.append(list_sess[:sess_length - i])
        return return_list

def prepare_df(dataframe):

  sessions_in_lists = dataframe.groupby(['user_id']).challenge.apply(prepare_dataset).tolist()

  session_input, session_target = [], []
  for i in sessions_in_lists:
      for j in i:
          session_input.append(j[:-1])
          session_target.append(j[-1])

  final_sessions = [session_input, session_target]
  return final_sessions

In [None]:
import argparse
import pickle
import time
from utils import build_graph, Data, split_validation
import imp
from model import *
from dotmap import DotMap

In [None]:
opt = {
    'dataset': 'Next_Challenge',
    'batchSize': 100,
    'hiddenSize': 100,
    'epoch': 7,
    'lr': 0.001,
    'lr_dc' : 0.1,
    'lr_dc_step' :3,
    'l2' : 1e-5,
    'step' : 1,
    'patience' : 10,
    'non-hybrid': True,
    'validation': False,
    'valid_portion': 0.1,
    'K':3
}
opt = DotMap(opt)

In [None]:
train_data = prepare_df(train_df)
if opt.validation:
    train_data, valid_data = split_validation(train_data, opt.valid_portion)
    test_data = valid_data
else:
    test_data = prepare_df(test_df)
# all_train_seq = pickle.load(open('../datasets/' + opt.dataset + '/all_train_seq.txt', 'rb'))
# g = build_graph(all_train_seq)
train_data = Data(train_data, shuffle=True)
test_data = Data(test_data, shuffle=False)
# del all_train_seq, g
if opt.dataset == 'diginetica':
    n_node = 43098
elif opt.dataset == 'yoochoose1_64' or opt.dataset == 'yoochoose1_4':
    n_node = 37484
else:
    n_node = N_NODE

In [None]:
model = trans_to_cuda(SessionGraph(opt, n_node))

In [None]:
def forward(model, i, data):
    alias_inputs, A, items, mask, targets = data.get_slice(i)
    alias_inputs = trans_to_cuda(torch.Tensor(alias_inputs).long())
    items = trans_to_cuda(torch.Tensor(items).long())
    A = trans_to_cuda(torch.Tensor(A).float())
    mask = trans_to_cuda(torch.Tensor(mask).long())
    hidden = model(items, A)
    get = lambda i: hidden[i][alias_inputs[i]]
    seq_hidden = torch.stack([get(i) for i in torch.arange(len(alias_inputs)).long()])
    return targets, model.compute_scores(seq_hidden, mask)

def train(model, train_data):
    
    print('start training: ', datetime.datetime.now())
    model.train()
    total_loss = 0.0
    slices = train_data.generate_batch(model.batch_size)
    for i, j in zip(slices, np.arange(len(slices))):
        model.optimizer.zero_grad()
        targets, scores = forward(model, i, train_data)
        targets = trans_to_cuda(torch.Tensor(targets).long())
        loss = model.loss_function(scores, (targets - 1))
        loss.backward()
        model.optimizer.step()
        total_loss += loss
        if j % int(len(slices) / 5 + 1) == 0:
            print('[%d/%d] Loss: %.4f' % (j, len(slices), loss.item()))
    print('\tLoss:\t%.3f' % total_loss)
    model.scheduler.step()
    

def test(model, test_data):
    print('start predicting: ', datetime.datetime.now())
    model.eval()
    hit, mrr = [], []
    slices = test_data.generate_batch(model.batch_size)
    for i in slices:
        targets, scores = forward(model, i, test_data)
        sub_scores = scores.topk(opt.K)[1]
        sub_scores = trans_to_cpu(sub_scores).detach().numpy()
        for score, target, mask in zip(sub_scores, targets, test_data.mask):
            hit.append(np.isin(target - 1, score))
            if len(np.where(score == target - 1)[0]) == 0:
                mrr.append(0)
            else:
                mrr.append(1 / (np.where(score == target - 1)[0][0] + 1))
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    return hit, mrr

def generate_graph(input_session):
  sess_len = len(input_session)
  
  if sess_len < 13:
    inputs = input_session + ([0] * (13 - sess_len))
    mask = np.array(([1] * sess_len) + ([0] * (13 - sess_len)))
  
  items, A, alias_inputs = [], [], []
  node = np.unique(inputs)
  items.append(node.tolist()) #append unique node followed by zeros upto 10.
  u_A = np.zeros((sess_len+1, sess_len+1))
  for i in np.arange(len(inputs) - 1):
      if inputs[i + 1] == 0:
          break
      u = np.where(node == inputs[i])[0][0]
      v = np.where(node == inputs[i + 1])[0][0]
      u_A[u][v] = 1
  u_sum_in = np.sum(u_A, 0)
  u_sum_in[np.where(u_sum_in == 0)] = 1
  u_A_in = np.divide(u_A, u_sum_in)
  u_sum_out = np.sum(u_A, 1)
  u_sum_out[np.where(u_sum_out == 0)] = 1
  u_A_out = np.divide(u_A.transpose(), u_sum_out)
  u_A = np.concatenate([u_A_in, u_A_out]).transpose()
  A.append(u_A)
  alias_inputs.append([np.where(node == i)[0][0] for i in inputs])    
  return alias_inputs, A, items, np.array([mask]), np.array(sess_len)

def predict_next_items(input_session):
  model.eval()
  alias_inputs, A, items, mask, targets = generate_graph(input_session)
  alias_inputs = trans_to_cuda(torch.Tensor(alias_inputs).long())
  items = trans_to_cuda(torch.Tensor(items).long())
  A = trans_to_cuda(torch.Tensor(A).float())
  mask = trans_to_cuda(torch.Tensor(mask).long())
  hidden = model(items, A)
  get = lambda i: hidden[i][alias_inputs[i]]
  seq_hidden = torch.stack([get(i) for i in torch.arange(len(alias_inputs)).long()])
  scores = model.compute_scores(seq_hidden, mask)
  sub_scores = scores.topk(opt.K)[1]
  sub_scores = trans_to_cpu(sub_scores).detach().numpy()
  return list(sub_scores[0]+ 1)[:3]

In [15]:
start = time.time()
best_result = [0, 0]
best_epoch = [0, 0]
bad_counter = 0
for epoch in range(opt.epoch):
    print('-------------------------------------------------------')
    print('epoch: ', epoch)
    train(model, train_data)
    hit, mrr = test(model, test_data)
    flag = 0
    if hit >= best_result[0]:
        best_result[0] = hit
        best_epoch[0] = epoch
        flag = 1
    if mrr >= best_result[1]:
        best_result[1] = mrr
        best_epoch[1] = epoch
        flag = 1
    print('Best Result:')
    print('\tRecall@3:\t%.4f\tMMR@3:\t%.4f\tEpoch:\t%d,\t%d'% (best_result[0], best_result[1], best_epoch[0], best_epoch[1]))
    bad_counter += 1 - flag
    if bad_counter >= opt.patience:
        break
    input = test_df[0:10000].groupby(['user_id']).challenge.apply(list).apply(lambda x : x[:9])
    target = test_df[0:10000].groupby(['user_id']).challenge.apply(list).apply(lambda x : x[9])
    print((input.apply(predict_next_items).apply(lambda x: x[1]) == target).value_counts())
    torch.save(model.state_dict(), "/content/drive/My Drive/Hackathon/Recommendation system/model_{0:03d}.pwf".format(epoch)) 
print('-------------------------------------------------------')
end = time.time()
print("Run time: %f s" % (end - start))


-------------------------------------------------------
epoch:  0
start training:  2020-07-15 02:41:14.985510
[0/8344] Loss: 8.6111
[1669/8344] Loss: 4.9291
[3338/8344] Loss: 4.8114
[5007/8344] Loss: 4.2238
[6676/8344] Loss: 4.1932
	Loss:	39033.789
start predicting:  2020-07-15 02:49:19.337238
Best Result:
	Recall@3:	29.5888	MMR@3:	21.1234	Epoch:	0,	0
False    857
True     143
Name: challenge, dtype: int64
-------------------------------------------------------
epoch:  1
start training:  2020-07-15 02:50:58.625120
[0/8344] Loss: 3.9714
[1669/8344] Loss: 4.1769
[3338/8344] Loss: 4.2373
[5007/8344] Loss: 3.9126
[6676/8344] Loss: 4.2017
	Loss:	34088.219
start predicting:  2020-07-15 02:59:01.932122
Best Result:
	Recall@3:	32.6322	MMR@3:	23.5865	Epoch:	1,	1
False    846
True     154
Name: challenge, dtype: int64
-------------------------------------------------------
epoch:  2
start training:  2020-07-15 03:00:41.776899
[0/8344] Loss: 3.7069
[1669/8344] Loss: 3.4428
[3338/8344] Loss: 3.882

In [16]:
'''
print(predict_next_items([1789, 4313, 1229, 2203, 4925, 2783, 5153]))
test(model, test_data)
'''

'\nprint(predict_next_items([1789, 4313, 1229, 2203, 4925, 2783, 5153]))\ntest(model, test_data)\n'

In [25]:
input = test_df[0:10000].groupby(['user_id']).challenge.apply(list).apply(lambda x : x[:9])
target = test_df[0:10000].groupby(['user_id']).challenge.apply(list).apply(lambda x : x[9])
print((input.apply(predict_next_items).apply(lambda x: x[1]) == target).value_counts())

False    843
True     157
Name: challenge, dtype: int64


In [26]:
input[0:100].apply(predict_next_items)

user_id
4577     [5188, 540, 3389]
4578    [3389, 5188, 5164]
4579    [5235, 2984, 3007]
4583    [5188, 4475, 5164]
4584      [2874, 1873, 82]
               ...        
4805    [3283, 3638, 5235]
4807     [3765, 5400, 414]
4808    [3638, 3283, 4305]
4814     [2129, 1401, 704]
4817     [5193, 4147, 548]
Name: challenge, Length: 100, dtype: object

In [27]:
counter = 0
def predict_3_challenges(input_data):
  global counter
  counter += 1
  if counter%1000 == 0:
    print(counter/(test_df.shape[0]/10))
  return predict_next_items(input_data)
  
final_dict = test_df.groupby(['user_id']).challenge.apply(list).apply(predict_3_challenges).to_dict()

0.02516862981979261
0.05033725963958522
0.07550588945937783
0.10067451927917044
0.12584314909896305
0.15101177891875567
0.17618040873854826
0.20134903855834088
0.2265176683781335
0.2516862981979261
0.2768549280177187
0.30202355783751134
0.32719218765730396
0.3523608174770965
0.37752944729688914
0.40269807711668176
0.4278667069364744
0.453035336756267
0.4782039665760596
0.5033725963958522
0.5285412262156448
0.5537098560354374
0.57887848585523
0.6040471156750227
0.6292157454948153
0.6543843753146079
0.6795530051344005
0.704721634954193
0.7298902647739857
0.7550588945937783
0.7802275244135709
0.8053961542333635
0.8305647840531561
0.8557334138729488
0.8809020436927414
0.906070673512534
0.9312393033323266
0.9564079331521192
0.9815765629719118


In [28]:
inverse_challenge_id_mapping = {value : key for (key, value) in challenge_id_mapping.items()}

In [29]:
ss = pd.read_csv("/content/drive/My Drive/Hackathon/Recommendation system/" + "sample_submission_J0OjXLi_DDt3uQN.csv")

In [30]:
def map_prediction(input_data):
  seq_no = int(input_data[-2:]) - 11
  return inverse_challenge_id_mapping[final_dict[int(input_data[:-3])][seq_no]]
ss['challenge'] = ss['user_sequence'].apply(map_prediction)

In [31]:
ss.head()

Unnamed: 0,user_sequence,challenge
0,4577_11,CI23691
1,4577_12,CI24527
2,4577_13,CI23648
3,4578_11,CI23691
4,4578_12,CI23848


In [32]:
ss.to_csv("/content/drive/My Drive/Hackathon/Recommendation system/" + "sample_submission_srgnn.csv", index=False)