In [1]:
# Notebook setup: load the IPython extensions used below and configure inline figures.
%load_ext blackcellmagic
%load_ext autoreload
# autoreload mode 2: re-import modules before each cell runs, so edits to
# local source files are picked up without restarting the kernel.
%autoreload 2
# Ask the inline matplotlib backend for 'retina' (high-DPI) figures.
%config InlineBackend.figure_format = 'retina'

In [2]:
import os
import sys

# Put ../bpr (resolved to an absolute path) on sys.path so that modules living
# there can be imported from this notebook.
source_dir = os.path.realpath("../bpr")
sys.path.append(source_dir)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from typing import Tuple

## Load data

In [3]:
# Base directory for datasets; handed to load_ml100k() when the ratings are loaded.
DATA_DIR = "../data"


def load_ml100k(data_dir) -> csr_matrix:
    """Load the MovieLens-100k ratings as a sparse user x item matrix.

    Parameters
    ----------
    data_dir : str or path-like
        Directory that contains the ``ml-100k`` folder. Ratings are read from
        ``<data_dir>/ml-100k/u.data`` (tab-separated, no header row, columns
        ``user_id, item_id, rating, dt``).

    Returns
    -------
    csr_matrix
        Integer matrix of shape ``(n_users, n_items)`` where entry ``[u, i]``
        holds the rating user ``u`` gave item ``i``. Raw user and item ids are
        re-indexed to contiguous zero-based codes in ascending id order, so
        row/column ``k`` is the k-th smallest raw id.
    """
    # Build the path from the argument; the previous version silently ignored
    # ``data_dir`` and always read from the module-level DATA_DIR.
    path = os.path.join(data_dir, "ml-100k", "u.data")
    df = pd.read_csv(path, names=["user_id", "item_id", "rating", "dt"], sep="\t")

    # factorize(sort=True) assigns codes 0..n-1 in sorted order of the raw ids,
    # i.e. the same dense re-indexing a label encoder produces.
    user_idx, _ = pd.factorize(df["user_id"], sort=True)
    item_idx, _ = pd.factorize(df["item_id"], sort=True)

    # ``np.int`` was removed from NumPy (1.24); the builtin ``int`` is the
    # drop-in replacement.
    ratings = csr_matrix((df["rating"].to_numpy(), (user_idx, item_idx)), dtype=int)
    return ratings

In [4]:
# Sparse user x item rating matrix built from ml-100k/u.data.
ratings = load_ml100k(DATA_DIR)

In [5]:
# ui_matrix = sparse.csr_matrix(train_matrix)
# print(ui_matrix.shape, ui_matrix.indptr.shape)

## Train-test split

In [6]:
def _binary_csr(rows, cols, shape) -> csr_matrix:
    """Build an integer CSR matrix of ``shape`` with a 1 at every (row, col) pair."""
    matrix = csr_matrix(
        (np.ones(rows.shape[0], dtype=int), (rows, cols)), shape=shape, dtype=int
    )
    # The constructor sums repeated coordinates; clamp back to a pure 0/1 indicator.
    matrix.data[:] = 1
    return matrix


def train_test_split(ratings: csr_matrix, train_ratio=0.8) -> Tuple[csr_matrix, csr_matrix]:
    """Split each user's interactions into binary train and test matrices.

    The split is positional, not random: for every row (user), the first
    ``int(train_ratio * n)`` of its ``n`` stored column indices go to the
    train matrix and the remaining ones to the test matrix. Rating values are
    discarded; both outputs contain 1 wherever the user has a stored entry.

    Parameters
    ----------
    ratings : csr_matrix
        User x item matrix in CSR format.
    train_ratio : float, default 0.8
        Fraction of each user's stored entries assigned to the train matrix.
        Must lie in ``[0, 1]``.

    Returns
    -------
    (csr_matrix, csr_matrix)
        ``(train, test)``, both of the same shape as ``ratings`` with integer
        0/1 entries.

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    indptr = ratings.indptr
    n_stored = int(indptr[-1])
    cols = ratings.indices[:n_stored]

    # Number of stored entries per user and how many of them go to train
    # (truncation matches int(train_ratio * n) applied row by row).
    row_sizes = np.diff(indptr)
    n_train = (train_ratio * row_sizes).astype(np.int64)

    # For every stored entry: its row, and its position within that row.
    rows = np.repeat(np.arange(ratings.shape[0]), row_sizes)
    rank_in_row = np.arange(n_stored) - np.repeat(indptr[:-1], row_sizes)
    in_train = rank_in_row < np.repeat(n_train, row_sizes)

    # Building both halves directly in CSR avoids element-wise DOK assignment,
    # and ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    train = _binary_csr(rows[in_train], cols[in_train], ratings.shape)
    test = _binary_csr(rows[~in_train], cols[~in_train], ratings.shape)
    return train, test

In [7]:
# Per-user split of the rating matrix using the default train_ratio of 0.8.
train, test = train_test_split(ratings)

## Pairwise similarity

In [10]:
# Pairwise cosine similarity between the rows (users) of `ratings`;
# dense_output=False asks scikit-learn to keep the result sparse.
# NOTE(review): this uses the full `ratings` matrix rather than the `train`
# split produced above — confirm that is intended.
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(ratings, dense_output=False)
print(similarities.shape)

(943, 943)


In [21]:
from dataclasses import dataclass, field
from typing import Any

@dataclass
class Base:
    """Scratch dataclass used to observe how ``__post_init__`` behaves.

    ``x`` and ``y`` are regular init arguments with defaults; ``w`` is excluded
    from ``__init__`` and is only ever assigned by ``__post_init__``.
    """

    x: Any = field(default=15.0)
    y: int = field(default=0)
    # Not an __init__ parameter: populated after construction in __post_init__.
    w: str = field(init=False)

    def __post_init__(self):
        """Fill in ``w`` and bump ``y`` by 5 once the generated ``__init__`` has run."""
        self.w = 'win'
        self.y += 5

@dataclass
class C(Base):
    """Subclass of ``Base`` that adds ``z`` and re-declares ``x`` as an int defaulting to 15.

    Its ``__post_init__`` replaces the one on ``Base`` without calling it, so
    ``y`` keeps whatever value ``__init__`` received (no +5) and ``w`` becomes
    ``'won'``.
    """

    z: int = field(default=10)
    x: int = field(default=15)

    def __post_init__(self):
        """Assign ``w`` only; the parent hook is deliberately not invoked here."""
        self.w = 'won'

In [22]:
# Compare the two __post_init__ hooks: Base adds 5 to y and sets w, while C
# only sets w (its hook does not call the one on Base).
print(Base())
print(C())

Base(x=15.0, y=5, w='win')
C(x=15, y=0, w='won', z=10)


<hr>

In [None]:
# # Loop through user profiles.
# max_idx = 0
# for cur in range(0, ratings.indptr.shape[0] - 1):
#     row_start, row_end = ratings.indptr[cur], ratings.indptr[cur + 1]
#     indices = ratings.indices[row_start: row_end]
#     max_idx = max(max_idx, np.max(indices))
# #     print(max_idx)
    
    
# # pos_items = indices[indptr[user]:indptr[user + 1]]
# # pos_item = np.random.choice(pos_items)
# # neg_item = np.random.choice(n_items)