In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [5]:
class DataSet:
    """In-memory rating table that is served in fixed-size mini-batches.

    ``ratings`` is indexed as a 2-D NumPy-style array; columns 0, 1 and 2 are
    read as user id, item id and rating respectively (see ``get_batch``).
    """

    def __init__(self, ratings, batch_size=128):
        """Store the rating table and derive the number of full batches.

        Args:
            ratings: 2-D array exposing ``.shape`` and ``[rows, col]`` slicing.
            batch_size: number of rows returned by each ``get_batch`` call.
        """
        self.ratings = ratings
        self.batch_size = batch_size
        # Floor division: only complete batches are counted, so up to
        # ``batch_size - 1`` trailing rows are never reached by a loop over
        # ``range(num_batch)``.
        self.num_batch = self.ratings.shape[0] // self.batch_size

    def get_rated_data(self):
        """Return the (user id, item id, rating) columns of the rating table.

        ``get_user_rated_items`` relies on this accessor; it was previously
        called without being defined, which raised ``AttributeError``.
        """
        return self.ratings[:, :3]

    def get_user_rated_items(self):
        """Map every user id to the set of item ids that user has rated.

        Ids are converted to ``int``, consistent with the integer cast applied
        to the id columns in ``get_batch``.

        Returns:
            dict: ``{user_id: {item_id, ...}}``.
        """
        user_rated_items = dict()
        for user_id, item_id, _rate in self.get_rated_data():
            user_rated_items.setdefault(int(user_id), set()).add(int(item_id))
        return user_rated_items

    def shuffle(self):
        """Shuffle the rows of ``self.ratings`` in place.

        Uses NumPy's global random state, so the order is only reproducible if
        the caller seeds ``np.random`` beforehand. The array object passed to
        the constructor is the one being reordered.
        """
        np.random.shuffle(self.ratings)

    def get_batch(self, i):
        """Return the ``i``-th batch as three 1-D arrays.

        Args:
            i: zero-based batch index.

        Returns:
            tuple: ``(user_ids, item_ids, rates)`` with dtypes ``int32``,
            ``int32`` and ``float32``. ``np.array`` copies, so the results do
            not alias ``self.ratings``.
        """
        start = i * self.batch_size
        # Slice the rows once and pick the three columns from that view.
        rows = self.ratings[start:start + self.batch_size]
        return (np.array(rows[:, 0], dtype=np.int32),
                np.array(rows[:, 1], dtype=np.int32),
                np.array(rows[:, 2], dtype=np.float32))

In [6]:
# base_folder = 'F:\\Projects\\train\\episerver\\data\\rs\\'
# NOTE: machine-specific absolute path; point it at the folder holding ratings.csv.
base_folder = 'E:\\Projects\\Train\\episerver\\data\\rs\\'

# Keep the loaded matrix under its own name: the batch loop below unpacks into
# `ratings`, which previously overwrote the source data loaded from the CSV.
# The read/select/convert steps are chained so no intermediate DataFrame is kept.
ratings_matrix = pd.read_csv(base_folder + 'ratings.csv')[['userId', 'movieId', 'rating']].values
dataset = DataSet(ratings_matrix, batch_size=512)
dataset.shuffle()
# Walk every full batch once; tqdm reports the iteration throughput.
# After the loop, `ratings` holds the rating vector of the last batch.
for i in tqdm(range(dataset.num_batch)):
    user_ids, item_ids, ratings = dataset.get_batch(i)

100%|████████████████████████████████████████████████████████████████████████| 50828/50828 [00:00<00:00, 165747.50it/s]


In [5]:
# Reshuffle the rows, then unpack every full batch again. Each iteration
# rebinds the notebook-level names `user_ids`, `item_ids` and `ratings`.
dataset.shuffle()
for i in tqdm(range(dataset.num_batch)):
    user_ids, item_ids, ratings = dataset.get_batch(i)

100%|█████████████████████████████████████████████████████████████████████████| 50828/50828 [00:00<00:00, 87021.94it/s]


In [8]:
%%time
# Measure the wall time of a single dataset.shuffle() call.
dataset.shuffle()

Wall time: 38.1 s


In [7]:
# Unpack every full batch without reshuffling first; tqdm reports throughput.
# Each iteration rebinds `user_ids`, `item_ids` and `ratings`.
for i in tqdm(range(dataset.num_batch)):
    user_ids, item_ids, ratings = dataset.get_batch(i)

100%|████████████████████████████████████████████████████████████████████████| 50828/50828 [00:00<00:00, 168762.13it/s]


In [9]:
# Display `ratings`. At this point the name holds what the most recently run
# batch loop unpacked into it (a 1-D array of rates for one batch), not the
# table read from ratings.csv.
ratings

array([1. , 1. , 4. , 2.5, 2. , 3. , 3. , 4. , 4. , 3. , 5. , 3. , 0.5,
       3.5, 4.5, 4.5, 5. , 4. , 3.5, 3. , 4.5, 3. , 4. , 5. , 5. , 4.5,
       2.5, 3. , 1.5, 3.5, 4. , 3. , 1. , 3.5, 3.5, 4. , 1.5, 3. , 4. ,
       4. , 4. , 5. , 3. , 3. , 5. , 3.5, 4. , 3. , 4.5, 5. , 4. , 4. ,
       5. , 5. , 4. , 5. , 3.5, 4. , 4. , 4. , 3. , 3. , 4.5, 4. , 4. ,
       2.5, 4.5, 4.5, 4. , 4. , 3.5, 2.5, 5. , 3. , 5. , 3. , 5. , 5. ,
       1.5, 3.5, 3. , 4. , 4. , 4. , 5. , 3. , 3. , 4. , 3. , 0.5, 2.5,
       3.5, 2. , 4. , 5. , 5. , 2.5, 3. , 3. , 2.5, 3. , 4. , 4. , 4. ,
       3.5, 3.5, 3. , 5. , 3. , 4. , 3.5, 2. , 4. , 3. , 2. , 4. , 5. ,
       3. , 4. , 3. , 5. , 5. , 4. , 3. , 2.5, 3. , 3.5, 3.5, 5. , 3. ,
       3. , 3.5, 4.5, 3. , 4.5, 3.5, 4. , 3.5, 3.5, 4. , 5. , 4.5, 3.5,
       3.5, 5. , 3. , 5. , 1. , 4.5, 1.5, 5. , 4.5, 4. , 2.5, 3. , 3. ,
       3. , 5. , 4. , 4. , 4.5, 4. , 4.5, 3. , 3. , 3.5, 2. , 4.5, 3. ,
       4. , 2.5, 5. , 4. , 4. , 4. , 0.5, 3.5, 5. , 3. , 5. , 2.