## libraries --- all necessary libraries are being imported

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import TensorDataset, DataLoader, random_split
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## dataset --- loading the ratings.csv dataset

In [2]:
ratings = pd.read_csv('../csv/ratings.csv')

In [3]:
n_users = ratings.user_id.nunique()
n_users

53424

In [4]:
ratings.book_id.nunique()

10000

In [5]:
ratings.rating.value_counts()

4    2139018
5    1983093
3    1370916
2     359257
1     124195
Name: rating, dtype: int64

## dataset --- [book, user] IDs are converted to indices

In [6]:
ratings.user_id -= 1
ratings.book_id -= 1

In [7]:
max(ratings.user_id)

53423

In [8]:
max(ratings.book_id)

9999

## dataset --- threashold is being defined
    >= 3 ---> positive interaction
    < 3  ---> negative interaction

In [9]:
ratings.rating[ratings.rating < 3] = 0
ratings.rating[ratings.rating >= 3] = 1

In [10]:
ratings.rating

0          1
1          1
2          1
3          1
4          1
          ..
5976474    1
5976475    1
5976476    1
5976477    1
5976478    1
Name: rating, Length: 5976479, dtype: int64

## dataset --- books & its features are being loaded

    trained autoencoder is being used for dimensionality reduction

In [12]:
books = np.load('/content/drive/MyDrive/Project/numpy/item_sparse.npy')
books = torch.from_numpy(books).float()

In [13]:
books.size()

torch.Size([10000, 34264])

### Model --- Autoencoder

In [14]:
class Autoencoder(nn.Module):
    def __init__(self, input_dimension, hidden_dimension):
        super(Autoencoder, self).__init__()

        self.encoder = nn.Sequential(
            # nn.Linear(input_dimension, input_dimension),
            # nn.ReLU(),
            nn.Linear(input_dimension, hidden_dimension),
            nn.ReLU()
        )

        self.decoder = nn.Sequential(
            nn.Linear(hidden_dimension, input_dimension),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [15]:
# hidden_size // input_size ---> autoencoder model
input_size = books.size()[-1]
hidden_size = 128

In [16]:
autoencoder = Autoencoder(input_size, hidden_size)
autoencoder.load_state_dict(torch.load('/content/drive/MyDrive/Project/model/autoencoder.pth'))
autoencoder.eval()

books = autoencoder.encoder(books.float())
books = books

## ratings --- accessing book_ids

In [17]:
ratings[: 5].values

array([[   0,  257,    1],
       [   1, 4080,    1],
       [   1,  259,    1],
       [   1, 9295,    1],
       [   1, 2317,    1]])

## books --- extracting those particular books

In [18]:
books[ratings[: 5].values[:, 1]]

tensor([[   0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,  225.6808,   69.5684,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,  219.3959,  122.7329,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,   77.9636,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,    0.0000,    0.0000,    0.0000,    0.0000,    0.0000,
            0.0000,   73.

## dataset --- class is being defined for loading, getting items

In [11]:
X = torch.tensor(ratings[['user_id', 'book_id']].values, dtype=torch.float)
y = torch.tensor(ratings[['rating']].values, dtype=torch.float)

In [12]:
torch.manual_seed(7)
dataset = TensorDataset(X, y)

In [14]:
train_ratio = 0.8
test_ratio = 1 - train_ratio

train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size

train_size, test_size

(4781183, 1195296)

In [15]:
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [16]:
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

In [13]:
class Recommender(nn.Module):
    def __init__(self, n_users, books_n_features, hidden_size):
        super(Recommender, self).__init__()
        self.user_embed = nn.Embedding(n_users, hidden_size)
        self.book_embed = nn.Embedding(10000, hidden_size)
        
        self.dense = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid()
        )
        
    def forward(self, user_ids, book_ids):
        users = self.user_embed(user_ids)
        books = self.book_embed(book_ids)

        target = torch.cat([users, books], dim=1)
        target = self.dense(target)
        
        return target

In [14]:

hidden_size = 16

In [26]:
recommender = Recommender(n_users=n_users, books_n_features=128, hidden_size=hidden_size).cuda()
recommender

Recommender(
  (user_embed): Embedding(53424, 16)
  (book_embed): Embedding(10000, 16)
  (dense): Sequential(
    (0): Linear(in_features=32, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [27]:
optimizer = optim.AdamW(recommender.parameters(), lr=0.0025, weight_decay=0.0001)
scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.95 ** epoch)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0025
    lr: 0.0025
    maximize: False
    weight_decay: 0.0001
)

In [28]:
criterion = nn.MSELoss()
criterion

MSELoss()

In [29]:
epochs = 1
device = 'cuda'

In [30]:
# books = books.detach()
# books = books.to(device)

In [31]:
for epoch in range(epochs):
    recommender.train()
    train_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for x_batch, y_batch in progress_bar:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = recommender(x_batch[:, 0].long(), x_batch[:, 1].long())
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * len(x_batch)
        progress_bar.set_postfix({"Train Loss": np.sqrt(train_loss / len(train_loader))})
    
    train_loss /= len(train_loader)
    
    scheduler.step()
    recommender.eval()
    with torch.no_grad():
        test_loss = 0

        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output = recommender(x_batch[:, 0].long(), x_batch[:, 1].long())
            loss = criterion(output, y_batch)
            test_loss += loss.item() * len(x_batch)

        test_loss /= len(test_loader)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {np.sqrt(train_loss):.4f}, Test Loss: {np.sqrt(test_loss):.4f}")


Epoch 1/100: 100%|██████████| 9339/9339 [01:30<00:00, 102.75it/s, Train Loss=5.99]


Epoch 1/100, Train Loss: 5.9937, Test Loss: 5.8296


Epoch 2/100: 100%|██████████| 9339/9339 [01:24<00:00, 110.68it/s, Train Loss=5.76]


Epoch 2/100, Train Loss: 5.7565, Test Loss: 5.8071


Epoch 3/100: 100%|██████████| 9339/9339 [01:23<00:00, 111.66it/s, Train Loss=5.72]


Epoch 3/100, Train Loss: 5.7224, Test Loss: 5.7999


Epoch 4/100: 100%|██████████| 9339/9339 [01:31<00:00, 102.17it/s, Train Loss=5.7]


Epoch 4/100, Train Loss: 5.6994, Test Loss: 5.8004


Epoch 5/100: 100%|██████████| 9339/9339 [01:28<00:00, 105.83it/s, Train Loss=5.68]


Epoch 5/100, Train Loss: 5.6774, Test Loss: 5.8046


Epoch 6/100: 100%|██████████| 9339/9339 [01:24<00:00, 110.56it/s, Train Loss=5.65]


Epoch 6/100, Train Loss: 5.6526, Test Loss: 5.8039


Epoch 7/100: 100%|██████████| 9339/9339 [01:23<00:00, 111.79it/s, Train Loss=5.62]


Epoch 7/100, Train Loss: 5.6231, Test Loss: 5.8118


Epoch 8/100: 100%|██████████| 9339/9339 [01:28<00:00, 104.95it/s, Train Loss=5.59]


Epoch 8/100, Train Loss: 5.5883, Test Loss: 5.8255


Epoch 9/100:  17%|█▋        | 1625/9339 [00:14<01:10, 109.99it/s, Train Loss=2.3]


KeyboardInterrupt: ignored

In [32]:
#torch.save(recommender.state_dict(), '/content/drive/MyDrive/Project/model/recommender.pth')

In [15]:
recommender = Recommender(n_users, 128, hidden_size)

In [17]:
recommender.load_state_dict(torch.load('../model/recommender.pth', map_location='cpu'))

<All keys matched successfully>

In [59]:
recommender.eval()
accuracyScore = 0
for i in range(0, 10000):
  user_i = ratings[ratings['user_id']==i].values
  output = recommender(torch.tensor(user_i[:,0]).long().to(device), torch.tensor(user_i[:,1]).long().to(device))
  output[output < 0.8] = 0 
  output[output >= 0.8] = 1
  output = output.detach().cpu()
  accuracyScore += accuracy_score(output, user_i[:,2])

print(accuracyScore/10000)

0.8645405856218878


In [75]:
book_embeddings = torch.tensor(range(10000)).long()
book_embeddings = recommender.book_embed(book_embeddings)
book_embeddings.size()


torch.Size([10000, 16])

In [76]:
book_embeddings_df = pd.DataFrame(book_embeddings.detach().cpu())
book_embeddings_df['book_id'] = list(range(10000))

In [77]:
ratings_merged = pd.merge(book_embeddings_df, ratings, on = 'book_id', how = 'outer')

In [78]:
X2 = ratings_merged.drop('rating', axis = 1).values
y2 = ratings_merged['rating'].values

In [79]:
books_df = pd.read_csv('../csv/books.csv', usecols = ['book_id', 'title'])

In [80]:
books_df.iloc[4]

book_id                   5
title      The Great Gatsby
Name: 4, dtype: object

In [83]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size = 0.25, random_state = 21)

In [86]:
rfc = RandomForestClassifier(verbose=11, n_jobs=-1)
rfc.fit(X_train_2, y_train_2)
pred = rfc.predict(X_test_2)
accuracyScore = accuracy_score(pred, y_test_2)
accuracyScore

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100

building tree 4 of 100
building tree 5 of 100


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.2min


building tree 6 of 100


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.2min


building tree 7 of 100
building tree 8 of 100


[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.2min


building tree 9 of 100
building tree 10 of 100


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  2.2min


building tree 11 of 100


[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.2min


building tree 12 of 100


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.3min


building tree 13 of 100


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.1min


building tree 14 of 100


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.1min


building tree 15 of 100


[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  3.2min


building tree 16 of 100


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  3.2min


building tree 17 of 100


[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  4.1min


building tree 18 of 100
building tree 19 of 100


[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  4.2min


building tree 20 of 100


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.2min


building tree 21 of 100


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  5.1min


building tree 22 of 100


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  5.1min


building tree 23 of 100


[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:  5.1min


building tree 24 of 100


[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:  5.2min


building tree 25 of 100


[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  6.2min


building tree 26 of 100


[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed:  6.2min


building tree 27 of 100


[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:  6.3min


building tree 28 of 100


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.3min


building tree 29 of 100


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.6min


building tree 30 of 100


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  7.7min


building tree 31 of 100


[Parallel(n_jobs=-1)]: Done  27 tasks      | elapsed:  7.7min


building tree 32 of 100


[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  7.7min


building tree 33 of 100


[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  9.4min


building tree 34 of 100


[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:  9.4min


building tree 35 of 100


[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:  9.4min


building tree 36 of 100


[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:  9.5min


building tree 37 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.8min


building tree 38 of 100


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.8min


building tree 39 of 100


[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed: 10.9min


building tree 40 of 100


[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed: 10.9min


building tree 41 of 100


[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 12.0min


building tree 42 of 100


[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed: 12.0min


building tree 43 of 100


[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed: 12.0min


building tree 44 of 100


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed: 12.1min


building tree 45 of 100


[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed: 13.6min


building tree 46 of 100


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.6min


building tree 47 of 100


[Parallel(n_jobs=-1)]: Done  43 tasks      | elapsed: 13.6min


building tree 48 of 100


[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed: 13.7min


building tree 49 of 100
building tree 50 of 100


[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 15.3min


building tree 51 of 100


[Parallel(n_jobs=-1)]: Done  47 tasks      | elapsed: 15.3min


building tree 52 of 100


[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 15.4min


building tree 53 of 100


[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed: 17.0min


building tree 54 of 100


[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed: 17.0min


building tree 55 of 100


[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed: 17.0min


building tree 56 of 100


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed: 17.0min


building tree 57 of 100


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 18.6min


building tree 58 of 100


[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed: 18.6min


building tree 59 of 100


[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed: 18.7min


building tree 60 of 100


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 18.7min


building tree 61 of 100


[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed: 20.1min


building tree 62 of 100


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed: 20.1min


building tree 63 of 100


[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed: 20.2min


building tree 64 of 100


[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed: 20.2min


building tree 65 of 100


[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 21.7min


building tree 66 of 100


[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed: 21.8min


building tree 67 of 100


[Parallel(n_jobs=-1)]: Done  63 tasks      | elapsed: 21.8min


building tree 68 of 100


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 21.9min


building tree 69 of 100


[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed: 23.1min


building tree 70 of 100


[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed: 23.1min


building tree 71 of 100
building tree 72 of 100


[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed: 23.2min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 23.2min


building tree 73 of 100


[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 24.4min


building tree 74 of 100


[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed: 24.4min


building tree 75 of 100


[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed: 24.5min


building tree 76 of 100


[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed: 24.5min


building tree 77 of 100


[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed: 25.7min


building tree 78 of 100


[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 25.8min


building tree 79 of 100


[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed: 25.8min


building tree 80 of 100


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed: 25.9min


building tree 81 of 100


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 27.0min


building tree 82 of 100


[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed: 27.1min


building tree 83 of 100


[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed: 27.2min


building tree 84 of 100


[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed: 27.2min


building tree 85 of 100


[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 28.3min


building tree 86 of 100


[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 28.4min


building tree 87 of 100


[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed: 28.4min


building tree 88 of 100


[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed: 28.5min


building tree 89 of 100


[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed: 29.6min


building tree 90 of 100


[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed: 29.7min


building tree 91 of 100


[Parallel(n_jobs=-1)]: Done  87 tasks      | elapsed: 29.8min


building tree 92 of 100


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed: 29.8min


building tree 93 of 100


[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 30.9min


building tree 94 of 100


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 31.3min


building tree 95 of 100


[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed: 31.3min


building tree 96 of 100


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed: 31.3min


building tree 97 of 100


[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed: 32.4min


building tree 98 of 100
building tree 99 of 100
building tree 100 of 100


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 33.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:   1

0.8578782159398174

In [89]:
# this marks accuracy of the random forrest model
accuracyScore

0.8578782159398174

In [123]:
def RECOMMEND_N_TOP(userID: int, n=5, books_df=books_df, book_embeddings_df=book_embeddings_df):
    '''
    ratings_merged: dataframe which merged embeddings from the neural network
    we will perform an emsemble of two models, our neural net, and a random forrest for improved accuracy
    '''
    
    user_i = ratings[ratings['user_id'] == userID].values
    uninteracted_books = np.zeros(10000)
    uninteracted_books[user_i[:, 1]] = 1

    uninteracted_books_idx = np.argwhere(uninteracted_books == 0).reshape(-1)
    uninteracted_user_idx = np.array(
        [user_i[:, 0][0]for i in range(len(uninteracted_books_idx))])


    # X_test_user_i = book_embeddings_df[book_embeddings_df['book_id'].isin(uninteracted_books_idx + 1)]
    # X_test_user_i['user_id'] = [userID] * len(X_test_user_i)
    
    # target = rfc.predict(X_test_user_i.values)
    # target_sorted_idx = np.argsort(target.ravel())[::-1]
    
    # print(target, target_sorted_idx)


    # # print(uninteracted_books_idx.shape, uninteracted_user_idx.shape)

    recommender.eval()
    with torch.no_grad():
        output = recommender(torch.tensor(uninteracted_user_idx, dtype=torch.long), torch.tensor(
            uninteracted_books_idx, dtype=torch.long))

    output = output.cpu()
    # print()
    sorted_idx = np.argsort(output.reshape(-1))
    sorted_idx = sorted_idx.numpy()[::-1]

    uninteracted_books_idx = uninteracted_books_idx[sorted_idx][:n]

    return pd.DataFrame({'TOP_N_RECOMMENDATIONS': books_df['title'].iloc[uninteracted_books_idx].to_numpy()})

In [128]:
RECOMMEND_N_TOP(0, 10)

Unnamed: 0,TOP_N_RECOMMENDATIONS
0,"The Complete Maus (Maus, #1-2)"
1,The Hate U Give
2,There's Treasure Everywhere: A Calvin and Hobb...
3,The Indispensable Calvin and Hobbes
4,All Things Wise and Wonderful
5,"Words of Radiance (The Stormlight Archive, #2)"
6,Just Mercy: A Story of Justice and Redemption
7,"Unnatural Death (Lord Peter Wimsey, #3)"
8,"Fair Game (Alpha & Omega, #3)"
9,"The Obelisk Gate (The Broken Earth, #2)"
