In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

np.random.seed(123)

In [2]:
# Load the MovieLens 20M ratings table. `timestamp` is parsed to datetime so
# each user's ratings can be ordered chronologically further down.
# NOTE(review): the path is a Kaggle input mount — adjust it when running elsewhere.
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv',
                     parse_dates=['timestamp'])

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
ratings.shape

(20000263, 4)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 610.4 MB


In [6]:
# Keep a random 30% of the users. Sampling is done per user (not per row), so
# every retained user keeps their complete rating history.
# NOTE: this cell overwrites `ratings`; re-running it without reloading the CSV
# would sample 30% of the already reduced user set.
unique_user_ids = ratings['userId'].unique()  # computed once instead of twice
rand_userIds = np.random.choice(unique_user_ids,
                                size=int(len(unique_user_ids) * 0.3),
                                replace=False)

# .copy() detaches the subset from the full frame, so the `rank_latest` column
# added later is written to an independent DataFrame instead of a slice
# (which can trigger pandas' SettingWithCopyWarning).
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings),len(rand_userIds)))

There are 6027314 rows of data from 41547 users


In [7]:
ratings.shape

(6027314, 4)

In [8]:
# Eyeball 10 random rows of the subsampled frame.
# NOTE(review): no random_state is passed, so this presumably draws from the
# global NumPy RNG seeded at the top — skipping or re-running this cell would
# then shift the random negatives sampled later. Confirm if exact
# reproducibility matters.
ratings.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
3840312,26182,3704,4.0,2007-01-31 21:56:52
7608731,52439,3365,4.0,2004-03-21 08:02:56
19363634,134060,1027,3.0,2003-07-15 22:43:45
17181947,118860,2629,1.0,2007-11-29 21:27:08
9344779,64638,4723,2.0,2001-09-10 20:11:41
10356404,71637,3882,0.5,2004-02-20 01:48:53
962965,6460,11,4.0,1996-10-22 12:53:32
12014375,82949,54001,4.5,2010-08-06 16:39:25
10532053,72855,7158,2.5,2011-07-25 01:07:58
18710288,129551,1228,4.5,2007-06-28 00:17:42


In [9]:
# Recency rank within each user: 1 = most recent rating, 2 = the one before, ...
# method='first' resolves identical timestamps by row order, so ranks within a
# user are unique.
ratings['rank_latest'] = (
    ratings.groupby('userId')['timestamp']
           .rank(ascending=False, method='first')
)

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
236,3,1,4.0,1999-12-11 13:36:47,81.0
237,3,24,3.0,1999-12-14 12:54:08,10.0
238,3,32,4.0,1999-12-11 13:14:07,140.0
239,3,50,5.0,1999-12-11 13:13:38,143.0
240,3,160,3.0,1999-12-14 12:54:08,11.0


In [11]:
# Preview each user's 10 most recent ratings — the rows that the next cell
# moves out of the training split (into validation and test).
ratings[ratings['rank_latest'] <= 10]

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
237,3,24,3.0,1999-12-14 12:54:08,10.0
241,3,173,2.0,1999-12-14 12:54:59,1.0
325,3,1373,4.0,1999-12-14 12:54:32,6.0
337,3,1762,4.0,1999-12-14 12:54:32,7.0
342,3,1882,4.0,1999-12-14 12:54:32,8.0
...,...,...,...,...,...
19999801,138491,3186,5.0,2009-03-04 01:38:28,7.0
19999804,138491,6874,4.0,2009-07-09 23:48:57,2.0
19999805,138491,8961,2.5,2009-07-09 23:49:07,1.0
19999806,138491,33794,2.5,2009-07-09 23:48:54,3.0


In [12]:
# Chronological leave-last-out split based on the per-user recency rank
# (1 = most recent rating):
#   test  -> ranks 1-2   (the two most recent ratings of each user)
#   val   -> ranks 3-10
#   train -> rank > 10   (everything older)
# Only the columns that are still needed are kept. Row and column selection
# happen in a single .loc call followed by .copy(), so each split is an
# independent frame and later in-place assignments (e.g. overwriting `rating`)
# do not operate on a slice of `ratings`.
train_ratings = ratings.loc[ratings['rank_latest'] > 10,
                            ['userId', 'movieId', 'rating']].copy()
val_ratings = ratings.loc[(ratings['rank_latest'] <= 10) & (ratings['rank_latest'] > 2),
                          ['userId', 'movieId', 'rating']].copy()
test_ratings = ratings.loc[ratings['rank_latest'] <= 2,
                           ['userId', 'movieId', 'rating']].copy()

In [13]:
train_ratings.shape, val_ratings.shape, test_ratings.shape

((5611844, 3), (332376, 3), (83094, 3))

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
236,3,1,4.0,1999-12-11 13:36:47,81.0
237,3,24,3.0,1999-12-14 12:54:08,10.0
238,3,32,4.0,1999-12-11 13:14:07,140.0
239,3,50,5.0,1999-12-11 13:13:38,143.0
240,3,160,3.0,1999-12-14 12:54:08,11.0


In [15]:
# Turn explicit ratings into implicit feedback: every rated (user, movie) pair
# in the training split is marked as a positive interaction (1), regardless of
# the score that was given.
# Note: only the training split is overwritten; the dataset class further down
# is built from the userId/movieId columns and does not read `rating`.
train_ratings.loc[:, 'rating'] = 1

train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
8430429,58177,4874,1.0
14421990,99620,1281,1.0
18779975,130076,551,1.0
13768013,95138,1533,1.0
1126150,7686,1264,1.0


In [16]:
# # Get a list of all movie IDs
# all_movieIds = ratings['movieId'].unique()

# # Placeholders that will hold the training data
# users, items, labels = [], [], []

# # This is the set of (user, item) pairs that users have interacted with
# user_item_set = set(zip(train_ratings['userId'],train_ratings['movieId']))

# # 4:1 ratio of negative to positive samples
# num_negatives = 4

# for (u, i) in tqdm(user_item_set):
#     users.append(u)
#     items.append(i)
#     labels.append(1) # items that the user has interacted with are positive
#     for _ in range(num_negatives):
#         # randomly select an item
#         negative_item = np.random.choice(all_movieIds)
#         # check that the user has not interacted with this item
#         while (u, negative_item) in user_item_set:
#             negative_item = np.random.choice(all_movieIds)
#         users.append(u)
#         items.append(negative_item)
#         labels.append(0) # items not interacted with are negative

In [17]:
class MovieLens20MDataset(Dataset):
    """Implicit-feedback dataset of (user, item, label) triples.

    Every observed (user, movie) pair becomes a positive sample (label 1) and is
    followed by ``num_negatives`` randomly drawn movies the user has *not*
    interacted with in the supplied data (label 0).

    Args:
        user_ids: Iterable of user ids, aligned element-wise with ``movie_ids``
            (e.g. a pandas Series or a list).
        movie_ids: Iterable of movie ids the corresponding user interacted with.
        num_negatives: Number of negative samples generated per positive pair.
        all_movie_ids: Optional catalogue of movie ids to draw negatives from.
            When omitted, negatives are drawn from the unique values of
            ``movie_ids`` (the original behaviour), i.e. only from movies that
            occur in the data passed in.

    Raises:
        ValueError: If negatives are requested for a user who has already
            interacted with every candidate movie (sampling could never finish).
    """

    def __init__(self, user_ids, movie_ids, num_negatives=4, all_movie_ids=None):
        self.users, self.items, self.labels = self.get_dataset(
            user_ids, movie_ids, num_negatives, all_movie_ids)

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` tensors stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, user_ids, movie_ids, num_negatives, all_movie_ids=None):
        """Build the user, item and label tensors with negative sampling.

        Returns:
            Tuple ``(users, items, labels)`` of equally long 1-D tensors. Each
            positive is immediately followed by its sampled negatives.
        """
        from collections import Counter

        # Set of (user, item) pairs that were actually observed
        user_item_set = set(zip(user_ids, movie_ids))

        # Candidate pool for negative sampling
        if all_movie_ids is not None:
            all_movieIds = np.unique(np.asarray(all_movie_ids))
        elif hasattr(movie_ids, 'unique'):
            # pandas path, unchanged: keeps first-appearance order
            all_movieIds = np.asarray(movie_ids.unique())
        else:
            all_movieIds = np.unique(np.asarray(movie_ids))
        num_candidates = len(all_movieIds)

        # How many candidates each user has already interacted with. A user who
        # has seen all of them would make the rejection loop below spin forever.
        seen_counts = Counter()
        if num_negatives > 0:
            candidate_set = set(all_movieIds.tolist())
            seen_counts.update(u for (u, i) in user_item_set if i in candidate_set)

        users, items, labels = [], [], []
        for (u, i) in tqdm(user_item_set):
            users.append(u)
            items.append(i)
            labels.append(1) # items that the user has interacted with are positive
            if num_negatives > 0 and seen_counts[u] >= num_candidates:
                raise ValueError(
                    'Cannot sample negatives for user {}: the user has interacted '
                    'with all {} candidate movies'.format(u, num_candidates))
            for _ in range(num_negatives):
                # randomly select an item
                negative_item = np.random.choice(all_movieIds)
                # re-draw until the user has not interacted with this item
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0) # items not interacted with are negative
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)
        

# Prepare data for training

In [18]:
# Build the training samples: every observed (user, movie) pair plus randomly
# sampled negatives (the class default is 4 negatives per positive).
train_dataset = MovieLens20MDataset(train_ratings['userId'], train_ratings['movieId'])

  0%|          | 0/5611844 [00:00<?, ?it/s]

In [19]:
# Build the validation samples the same way.
# NOTE(review): negatives are drawn from, and checked against, the validation
# split only — a sampled "negative" may be a movie the user rated in the
# training split. Confirm this is acceptable for evaluation.
val_dataset = MovieLens20MDataset(val_ratings['userId'], val_ratings['movieId'])

  0%|          | 0/332376 [00:00<?, ?it/s]

In [20]:
# Build the test samples the same way.
# NOTE(review): negatives are drawn from, and checked against, the test split
# only — a sampled "negative" may be a movie the user rated in the training or
# validation split. Confirm this is acceptable for evaluation.
test_dataset = MovieLens20MDataset(test_ratings['userId'], test_ratings['movieId'])

  0%|          | 0/83094 [00:00<?, ?it/s]

In [21]:
# Mini-batch iterator over the training samples: batches of 4, reshuffled on
# every pass, loaded in the main process (num_workers=0).
train = DataLoader(
    dataset=train_dataset,
    num_workers=0,
    shuffle=True,
    batch_size=4,
)

In [22]:
# Mini-batch iterator over the validation samples.
# shuffle=False: evaluation does not benefit from shuffling, and a fixed order
# keeps validation runs comparable.
val = DataLoader(val_dataset, 
                        batch_size=4,
                        shuffle=False, 
                        num_workers=0)

In [23]:
# Mini-batch iterator over the test samples.
# shuffle=False: evaluation does not benefit from shuffling, and a fixed order
# keeps test runs comparable.
test = DataLoader(test_dataset, 
                        batch_size=4,
                        shuffle=False, 
                        num_workers=0)

In [24]:
# Sanity check: print the first 11 samples (indices 0-10). Each positive
# (label 1) should be followed by its sampled negatives (label 0).
for i, sample in zip(range(11), train_dataset):
    print(i, sample)

0 (tensor(5070), tensor(1270), tensor(1))
1 (tensor(5070), tensor(84133), tensor(0))
2 (tensor(5070), tensor(116285), tensor(0))
3 (tensor(5070), tensor(6931), tensor(0))
4 (tensor(5070), tensor(83222), tensor(0))
5 (tensor(12009), tensor(3476), tensor(1))
6 (tensor(12009), tensor(27050), tensor(0))
7 (tensor(12009), tensor(86841), tensor(0))
8 (tensor(12009), tensor(6749), tensor(0))
9 (tensor(12009), tensor(67429), tensor(0))
10 (tensor(109462), tensor(3265), tensor(1))


In [25]:
# Persist the three datasets so the slow negative sampling does not have to be
# repeated. torch.save serialises the whole Dataset object, so the
# MovieLens20MDataset class must be defined again before the files are loaded.
torch.save(train_dataset, './train.pt')
torch.save(val_dataset, './val.pt')
torch.save(test_dataset, './test.pt')

In [26]:
# Reload the saved training dataset to verify the save/load round trip.
# NOTE(review): torch.load unpickles arbitrary objects — only load files you
# created yourself. PyTorch releases that default to weights_only=True will
# presumably need weights_only=False to restore a custom Dataset object;
# confirm against the installed version.
train_dataset = torch.load('./train.pt')

In [27]:
# Print the first 11 samples (indices 0-10) of the reloaded dataset; they
# should match the samples printed before saving.
for i, sample in zip(range(11), train_dataset):
    print(i, sample)

0 (tensor(5070), tensor(1270), tensor(1))
1 (tensor(5070), tensor(84133), tensor(0))
2 (tensor(5070), tensor(116285), tensor(0))
3 (tensor(5070), tensor(6931), tensor(0))
4 (tensor(5070), tensor(83222), tensor(0))
5 (tensor(12009), tensor(3476), tensor(1))
6 (tensor(12009), tensor(27050), tensor(0))
7 (tensor(12009), tensor(86841), tensor(0))
8 (tensor(12009), tensor(6749), tensor(0))
9 (tensor(12009), tensor(67429), tensor(0))
10 (tensor(109462), tensor(3265), tensor(1))
