In [1]:
import os

# Step up to the parent directory so that the relative "data/..." paths used
# below resolve. The move is guarded: an unconditional chdir("../") climbs one
# level higher every time this cell is re-run, so once a "data" directory is
# visible from the current working directory we stay where we are.
if not os.path.isdir("data"):
    os.chdir("../")
print(os.getcwd())

C:\Users\Milosz\Desktop\python\thesis-recsys


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import functools
import operator
import gzip
import json
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the raw tables: three CSV files (read in the order games,
# recommendations, users) followed by the line-delimited JSON metadata.
games, rec, users = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/games.csv',
        'data/recommendations.csv',
        'data/users.csv',
    )
)
meta = pd.read_json("data/games_metadata.json", lines=True)

In [None]:
# Number of rows in `rec` per distinct date value, ordered by date.
dt = rec.date.value_counts(sort=False).sort_index()

# Roll the per-date counts up into "YYYY-MM" buckets.
# Series.iteritems() was removed in pandas 2.0, so the totals are computed with
# a groupby on the month prefix instead of a Python loop. astype(str) keeps
# this working whether the dates are still raw strings or have already been
# parsed to datetimes by a later cell.
month_keys = dt.index.astype(str).str[:7]
dt_months = dt.groupby(month_keys).sum().to_dict()

plt.plot(list(dt_months.keys()), list(dt_months.values()))

In [None]:
# Parse the `date` column into datetimes and cast the `is_recommended` flag to
# int. Both columns are overwritten in place on `rec`.
rec['date'] = pd.to_datetime(rec['date'])
rec['is_recommended'] = rec['is_recommended'].astype(int) 

In [None]:
# Order the interactions by date (ascending, the sort_values default).
rec = rec.sort_values(by='date')

In [None]:
# Temporal split: rows dated on or before `split_bound` form the training set,
# strictly later rows form the test set.
split_bound = "2022-12-01"
rec_train = rec[rec['date'] <= split_bound]
rec_test = rec[rec['date'] > split_bound]

In [None]:
# Distinct user ids and app ids seen in each split.
u_train, a_train = rec_train.user_id.unique(), rec_train.app_id.unique()
u_test, a_test = rec_test.user_id.unique(), rec_test.app_id.unique()

In [None]:
# Users (`u`) and apps (`a`) that occur in BOTH the train and the test split.
u = np.intersect1d(u_train, u_test)
a = np.intersect1d(a_train, a_test)

In [None]:
# How many users and apps are shared between the two splits.
u.size, a.size

In [None]:
# Keep only interactions whose user AND app both appear in the two splits.
# .copy() turns each filtered result into an independent frame: a later cell
# assigns remapped ids into rec_train, and writing into a slice of another
# frame triggers pandas' SettingWithCopyWarning.
rec_train = rec_train[rec_train.user_id.isin(u) & rec_train.app_id.isin(a)].copy()
rec_test = rec_test[rec_test.user_id.isin(u) & rec_test.app_id.isin(a)].copy()

In [None]:
# Spot-check a handful of shared users rather than every one of them: `u`
# holds all users present in both splits, and rendering two frames per user
# for the full array would flood the notebook output.
for ui in u[:5]:
    print(ui)
    display(rec_train[rec_train.user_id == ui])
    display(rec_test[rec_test.user_id == ui])

In [None]:
def featurize_games(games):
    """Build a numeric feature table from the games frame.

    The platform flags (``win``, ``mac``, ``linux``, ``steam_deck``) are cast
    to int and the ``rating`` column is one-hot encoded; the input frame is
    not modified.

    Args:
        games: DataFrame containing ``app_id``, the four platform flags,
            ``price_original``, ``price_final``, ``discount``,
            ``user_reviews``, ``positive_ratio`` and ``rating``.

    Returns:
        A ``(features, cols)`` tuple: ``features`` is the frame restricted to
        the selected columns, ``cols`` is the list of those column names
        (fixed columns first, then one column per rating category).
    """
    games_features = games.copy(deep=True)

    platform_flags = ['win', 'mac', 'linux', 'steam_deck']
    for c in platform_flags:
        games_features[c] = games_features[c].astype(int)

    # Ask for integer indicators explicitly: pandas 2.x defaults to bool
    # dummies, which would leave the one-hot columns with a different dtype
    # than the platform flags cast to int just above.
    rating = pd.get_dummies(games_features['rating'], dtype=int)
    games_features = pd.concat([games_features, rating], axis=1)

    cols = [
        'app_id', 'win', 'mac', 'linux', 'steam_deck',
        'price_original', 'price_final', 'discount',
        'user_reviews', 'positive_ratio',
    ] + list(rating.columns)

    return games_features[cols], cols

In [None]:
# Inspect the metadata frame read from data/games_metadata.json.
meta

In [None]:
# Build the game feature table and keep the list of its column names.
games_features, games_cols = featurize_games(games)

In [None]:
# Inspect the feature table returned by featurize_games.
games_features

In [None]:
# Inspect the training interactions after restricting to shared users/apps.
rec_train

In [None]:
def remap(df, col):
    """Assign a dense integer code to every distinct value of ``df[col]``.

    Codes run from 0 upwards in order of first appearance in the column.

    Args:
        df: DataFrame holding the column to encode.
        col: Name of the column whose distinct values are encoded.

    Returns:
        dict mapping each distinct value to its integer code.
    """
    distinct_values = df[col].unique()
    dense_codes = np.arange(distinct_values.size)
    return dict(zip(distinct_values, dense_codes))

In [None]:
# Dense index maps for users and items, built from the training interactions.
user_dict = remap(rec_train, 'user_id')
item_dict = remap(rec_train, 'app_id')

In [None]:
# Replace the raw ids in rec_train with the dense indices from the two maps.
# NOTE(review): this overwrites the id columns in place, so running the cell a
# second time without rebuilding rec_train applies the mapping to values that
# have already been remapped; Series.map yields NaN for any value that is not
# a key of the dict.
rec_train['user_id'] = rec_train['user_id'].map(user_dict)
rec_train['app_id'] = rec_train['app_id'].map(item_dict)

In [None]:
# Inspect rec_train after the id columns have been remapped.
rec_train

In [None]:
class MFDataset(Dataset):
    """Torch dataset yielding ``(user_id, item_id, rating)`` triples.

    Each item is a tuple of three shape-``(1,)`` tensors: the ``user_id`` and
    ``app_id`` columns as int32 and the ``is_recommended`` column as float32.

    Args:
        data: DataFrame with ``user_id``, ``app_id`` and ``is_recommended``
            columns.
    """

    def __init__(self, data):
        self.data = data
        # Convert each column to a tensor once. Looking rows up with
        # DataFrame.iloc inside __getitem__ builds a pandas row object for
        # every single sample, which dominates the cost of iterating a large
        # frame; indexing a pre-built tensor is a cheap slice.
        # The tensors are a snapshot: later edits to `data` are not reflected.
        self.user_ids = torch.tensor(data['user_id'].to_numpy(dtype='int32'))
        self.item_ids = torch.tensor(data['app_id'].to_numpy(dtype='int32'))
        self.ratings = torch.tensor(data['is_recommended'].to_numpy(dtype='float32'))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # reshape(1) turns the 0-d element into the same (1,)-shaped tensor
        # the per-row construction used to produce.
        user_id = self.user_ids[idx].reshape(1)
        item_id = self.item_ids[idx].reshape(1)
        rating = self.ratings[idx].reshape(1)

        return user_id, item_id, rating

In [None]:
class MF(nn.Module):
    """Matrix-factorisation scorer: sigmoid of the user/item embedding dot product.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        emb_size: Dimensionality of both embedding tables.
    """

    def __init__(self, n_users, n_items,emb_size):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.u = nn.Embedding(n_users, emb_size)
        self.i = nn.Embedding(n_items, emb_size)

        # NOTE(review): these two GRUs are registered (and therefore show up in
        # parameters() / state_dict()) but forward() never calls them. They are
        # left in place so existing attribute access and checkpoints keep
        # working - confirm whether they are still needed.
        self.u_rnn = nn.GRU(input_size=18, hidden_size=16)
        self.i_rnn = nn.GRU(input_size=18, hidden_size=16)

    def forward(self, ux, ix):
        """Score user/item index pairs.

        Args:
            ux: Integer tensor of user indices.
            ix: Integer tensor of item indices, same shape as ``ux``.

        Returns:
            Tensor with the same shape as ``ux`` holding values in (0, 1).
        """
        # Reduce over the embedding axis, which is always the last one. For
        # (batch, 1) index tensors this is exactly the former dim=2; using -1
        # also lets flat (batch,) index tensors through.
        return torch.sigmoid(torch.sum(self.u(ux) * self.i(ix), dim=-1))

In [None]:
def train_model(train_data, num_epochs, batch_size, lr, embedding_size):
    """Fit an ``MF`` model on user/item interactions with binary cross-entropy.

    Args:
        train_data: DataFrame with ``user_id`` and ``app_id`` columns holding
            non-negative integer indices and an ``is_recommended`` target.
        num_epochs: Number of passes over the data.
        batch_size: Mini-batch size handed to the ``DataLoader``.
        lr: Learning rate for the Adam optimiser.
        embedding_size: Dimensionality of the user and item embeddings.

    Returns:
        The trained ``MF`` model, on CUDA when available and on CPU otherwise.

    Raises:
        ValueError: If ``train_data`` has no rows.
    """
    if len(train_data) == 0:
        raise ValueError("train_data is empty; nothing to train on")

    # An embedding table needs a row for every index it will be asked for, so
    # size it by the largest id rather than by the number of distinct ids. The
    # two agree when ids are a dense 0..n-1 range; with gaps, the distinct
    # count would be too small and the lookup would go out of range.
    num_users = int(train_data['user_id'].max()) + 1
    num_items = int(train_data['app_id'].max()) + 1

    dataset = MFDataset(train_data)
    # shuffle=False: batches are drawn in the row order of train_data.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = MF(num_users, num_items, embedding_size)
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        epoch_loss = 0
        for user_idx, item_idx, rating in tqdm(dataloader):
            user_idx, item_idx, rating = user_idx.to(device), item_idx.to(device), rating.to(device)
            optimizer.zero_grad()
            prediction = model(user_idx, item_idx)
            # Drop the singleton axis so predictions and targets line up.
            loss = criterion(prediction.squeeze(), rating.squeeze())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        # Report the mean per-batch loss for the epoch.
        print(f'Epoch {epoch+1}, loss: {epoch_loss/len(dataloader)}')

    return model

In [None]:
# Train the model and keep a handle on the result; a bare call only echoes
# the returned module and leaves nothing for later cells to use.
model = train_model(rec_train, num_epochs=10, batch_size=1024, lr=1e-3, embedding_size=32)

In [9]:
from reco_env import RecoEnv
from utils import import_data_for_env
import gym

ImportError: attempted relative import with no known parent package

In [None]:
# Instantiate the gym environment identified by RecoEnv.id; the mapping
# returned by import_data_for_env() is unpacked as its keyword arguments.
env = gym.make(RecoEnv.id, **import_data_for_env())

In [None]:
# Number of rows in `rec` per user_id.
vc = rec.user_id.value_counts()

In [None]:
# Inspect the per-user row counts.
vc

In [None]:
# Users with at least three rows in `rec`.
vc[vc >= 3]

In [None]:
# Plot the per-user row counts held in `vc`.
plt.plot(vc)