In [1]:
import numpy as np
import pandas as pd
np.random.seed(123)

In [2]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

In [3]:
transactions = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', parse_dates=['t_dat'])
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1


In [4]:
rand_userIds = np.random.choice(transactions['customer_id'].unique(), 
                                size=int(len(transactions['article_id'].unique())*0.05), 
                                replace=False)

transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)]

print('There are {} rows of data from {} users'.format(len(transactions), len(rand_userIds)))

There are 121374 rows of data from 5227 users


In [5]:
# transactions.to_csv('compromized_transactions_train.csv', index=False)

In [6]:
# Restructuring ids for CNF

new_customer_ids = {}
ls = transactions['customer_id'].unique()
for i in range(len(ls)):
    new_customer_ids[ls[i]] = i
    
transactions['customer_id'].replace(new_customer_ids, inplace=True)

new_article_ids = {}
ls = transactions['article_id'].unique()
for i in range(len(ls)):
    new_article_ids[ls[i]] = i
    
transactions['article_id'].replace(new_article_ids, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [7]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
460,2018-09-20,0,0,0.016932,2
461,2018-09-20,0,1,0.025407,2
462,2018-09-20,0,2,0.044051,2
463,2018-09-20,0,3,0.023712,2
464,2018-09-20,0,4,0.044051,2
...,...,...,...,...,...
31785837,2020-09-22,725,35536,0.013237,2
31785838,2020-09-22,725,28800,0.024780,2
31786668,2020-09-22,3435,21583,0.016932,2
31786989,2020-09-22,2748,34866,0.042356,2


In [8]:
transactions['rank_latest'] = transactions.groupby(['customer_id'])['t_dat'] \
                                .rank(method='first', ascending=False)
transactions.loc[:, 'buy'] = 1
train_transactions = transactions[transactions['rank_latest'] != 1]
test_transactions = transactions[transactions['rank_latest'] == 1]

# drop columns that we no longer need
train_transactions = train_transactions[['customer_id', 'article_id', 'buy']]
test_transactions = test_transactions[['customer_id', 'article_id', 'buy']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [9]:
# train_transactions.to_csv('train.csv', index=False)
# test_transactions.to_csv('test.csv', index=False)

In [10]:
# Get a list of all article IDs
all_articleIds = transactions['article_id'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_transactions['customer_id'], train_transactions['article_id']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_articleIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_articleIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

  0%|          | 0/99696 [00:00<?, ?it/s]

In [11]:
class ArticleTrainDataset(Dataset):
    """Article PyTorch Dataset for Training
    
    Args:
        buy (pd.DataFrame): Dataframe containing the movie ratings
        all_articleIds (list): List containing all article_ids
    
    """

    def __init__(self, buy, all_articleIds):
        self.users, self.items, self.labels = self.get_dataset(buy, all_articleIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, buy, all_articleIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(buy['customer_id'], buy['article_id']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_articleIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_articleIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [12]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            buy (pd.DataFrame): Dataframe containing the movie ratings for training
            all_articleIds (list): List containing all articleIds (train + test)
    """
    
    def __init__(self, num_users, num_items, buy, all_articleIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.buy = buy
        self.all_articleIds = all_articleIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(ArticleTrainDataset(self.buy, self.all_articleIds),
                          batch_size=512, num_workers=4)

In [13]:
num_users = transactions['customer_id'].max()+1;
num_items = transactions['article_id'].max()+1;

all_articleIds = transactions['article_id'].unique()

model = NCF(num_users, num_items, train_transactions, all_articleIds)

In [14]:
trainer = pl.Trainer(max_epochs=5, gpus=1, reload_dataloaders_every_n_epochs=1, progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"


In [15]:
trainer.fit(model)

  cpuset_checked))


Training: 0it [00:00, ?it/s]

In [16]:
# User-item pairs for testing
user_item_predict = {}
test_user_item_set = set(zip(test_transactions['customer_id'], test_transactions['article_id']))

# Dict of all items that are interacted with by each user
user_interacted_items = transactions.groupby('customer_id')['article_id'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_articleIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top12_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:12].tolist()]
    user_item_predict[u] = top12_items
    
    if i in top12_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 12 is {:.2f}".format(np.average(hits)))

  0%|          | 0/5227 [00:00<?, ?it/s]

The Hit Ratio @ 12 is 0.29


In [17]:
given_customer_ids = dict([(value, key) for key, value in new_customer_ids.items()])
given_article_ids = dict([(value, key) for key, value in new_article_ids.items()])

In [18]:
df = pd.DataFrame(user_item_predict.items(), columns=['customer_id', 'article_id'])
df

Unnamed: 0,customer_id,article_id
0,1802,"[14677, 5563, 32490, 4712, 31243, 16842, 14334..."
1,4931,"[861, 8025, 3460, 14137, 1291, 17351, 25459, 2..."
2,1357,"[2984, 13022, 23897, 15253, 13669, 3256, 17733..."
3,2826,"[12775, 20129, 1332, 11456, 11899, 11037, 2644..."
4,278,"[8283, 16383, 15395, 28216, 18200, 147, 20043,..."
...,...,...
5222,2738,"[5156, 27428, 21448, 23058, 5405, 20989, 14167..."
5223,753,"[11620, 13659, 13323, 3821, 16097, 21290, 2943..."
5224,2570,"[20620, 34183, 13075, 27458, 9774, 1544, 30203..."
5225,419,"[893, 13986, 17430, 24098, 14937, 2100, 7362, ..."


In [19]:
# Reseting to given ids

df['customer_id'].replace(given_customer_ids, inplace=True)

In [20]:
for index, row in df.iterrows():
    temp = []
    for i in row['article_id']:
        temp.append(str(given_article_ids[i]))
    row['article_id'] = ', '.join(temp)
df.rename(columns={'article_id':'prediction'}, inplace=True)
df

Unnamed: 0,customer_id,prediction
0,d54230d3035096ee158754c66cfa29a774ae767a6949ca...,"745475011, 473507001, 803892001, 687539002, 49..."
1,7cfbbe93d61ebdcef5b6d6acc2df3a411039c1db7e9de7...,"507909001, 689109003, 575347014, 559616013, 58..."
2,bb3acd03512477ab39aa427d33cf3686ced545481f39d1...,"606714001, 718278002, 835801001, 702800007, 70..."
3,1ac6790c01fa84926d877871a6c4126c2a7e3a78e79824...,"697060013, 759482001, 568575001, 697564006, 72..."
4,b75892af28e5ef3ae6dbfe19ed209ebaf215af9aa733f2...,"695632002, 568838009, 753724003, 855778001, 73..."
...,...,...
5222,af41dbefca624443eb50bbe01d01c5ef6a1c0d64f2b3f6...,"253448003, 778064028, 777070003, 803118001, 73..."
5223,06fbace3e26026d80bd1c4371b7e0e1b2af29b64ae05c5...,"714790003, 739144004, 554477026, 447694008, 61..."
5224,b84e46247a19ed20fa7b9777d828a1201a14dbb7df263c...,"399201042, 896152003, 612800009, 841260013, 70..."
5225,399ebd473763dc427dc29ad1bea9287949628ef8b57dda...,"579541001, 733419005, 768921001, 573085047, 75..."


In [21]:
# transactions.to_csv('5%-data.csv', index=False)