In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
from filldb import read_df
import warnings
import torch.utils.data
from model import MF, MFParams
from filldb import read_df
from train import create_target_matrix, set_seed, DL
from ncf import NCF, NCFParams, default_ncf
from train import train2, compute_f1
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def predict(model, ratings, movieIds=None, userId=1):
    """Compare predicted ratings with the recorded ratings of one user.

    Parameters
    ----------
    model
        Object exposing ``predict(userIds, movieIds)``. The returned value
        must support ``* 2``, ``.round()``, ``/ 2`` and
        ``.flatten().tolist()`` (e.g. a tensor or an ndarray).
    ratings : pandas.DataFrame
        Frame with at least ``userId``, ``movieId`` and ``rating`` columns.
    movieIds : sequence of int, optional
        Movies to score. Defaults to ``[1, 30, 177, 10]``.
    userId : int, optional
        User to score the movies for. Defaults to ``1``.

    Returns
    -------
    pandas.DataFrame
        Outer merge on ``(userId, movieId)`` of the predictions
        (``pred_rating``) with the recorded ratings (``rating``). A pair
        without a recorded rating has ``NaN`` in ``rating``.
    """
    movieIds = [1, 30, 177, 10] if movieIds is None else list(movieIds)
    userIds = [userId] * len(movieIds)

    pred = model.predict(userIds, movieIds)
    # Snap every prediction to the nearest multiple of 0.5.
    pred = (pred * 2).round() / 2
    predicted = pd.DataFrame({
        "userId": userIds,
        "movieId": movieIds,
        "pred_rating": pred.flatten().tolist(),
    })

    # One combined mask instead of two chained .loc calls, so the row
    # selection does not depend on boolean-index alignment.
    known_mask = ratings.userId.isin(userIds) & ratings.movieId.isin(movieIds)
    known = ratings.loc[known_mask, ["movieId", "rating", "userId"]]
    return predicted.merge(known, on=["userId", "movieId"], how="outer")

In [None]:
from train import main,plot
# Suppress every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Ratings table read from the CSV in the working directory.
train_df = pd.read_csv("train.csv")

# Positional arguments to main: 500 (presumably the epoch count; confirm
# against train.main), the ratings frame, and the checkpoint path.
# load_model=True asks main to load that checkpoint.
losses, f1, val_losses, val_f1 = main(500, train_df, "models/ncf1.pt", load_model=True)

# Plot the training and validation curves returned by main.
plot(losses, f1, val_losses, val_f1)

In [None]:
def get_first_of_every_user(df):
    """Split ``df`` into each user's first row and all remaining rows.

    The previous implementation looped over ``range(df.userId.nunique())``,
    which only works when the user ids are exactly ``0..n-1``; any other
    id set hit an empty selection and raised ``IndexError``. It also
    rescanned the whole frame once per user and rebuilt the held-out rows
    from ``Series`` objects, which upcast the integer id columns to float.
    This version takes a single pass and keeps the original dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``userId`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rest, firsts)``: ``rest`` is ``df`` without the first occurrence
        of each user, in the original row order; ``firsts`` holds that
        first occurrence for every user, ordered by ``userId``. Both keep
        the original index labels.
    """
    # True for the first row seen for each userId, False for later rows.
    is_first = ~df.duplicated(subset="userId", keep="first")
    firsts = df.loc[is_first].sort_values("userId")
    rest = df.loc[~is_first]
    return rest, firsts

In [None]:

# Shared random seed: passed to MF(...) and to set_seed(...) in later cells.
seed = 1337

def load(split=None):
    """Read the ratings and turn them into (user, movie, rating) tensors.

    Parameters
    ----------
    split : any, optional
        ``None`` keeps the ratings as one set. Any other value holds out
        rows via ``get_first_of_every_user``; the value itself is never
        read.

    Returns
    -------
    tuple
        ``(nu, nm, sets)``: ``nu`` and ``nm`` are the numbers of distinct
        user and movie ids, and ``sets`` is a list of
        ``(userId LongTensor, movieId LongTensor, rating FloatTensor)``
        triples, one per frame (a single triple when ``split`` is None,
        otherwise one per frame returned by ``get_first_of_every_user``).
    """
    ratings = read_df("", 200000)[["userId", "movieId", "rating"]]
    nu = ratings.userId.nunique()
    nm = ratings.movieId.nunique()

    frames = [ratings] if split is None else get_first_of_every_user(ratings)

    def as_tensors(frame):
        # Ids become int64 index tensors; ratings become float32.
        users = torch.from_numpy(frame.userId.values).long()
        movies = torch.from_numpy(frame.movieId.values).long()
        scores = torch.from_numpy(frame.rating.values).float()
        return users, movies, scores

    return nu, nm, [as_tensors(frame) for frame in frames]
# A non-None split makes load return two tensor triples:
# df1 = the remaining rows, df2 = the first row of every user.
nu, nm, (df1, df2) = load(split=0.8)

In [None]:
# Loaders over the two tensor triples returned by load(). The third positional
# argument (8192 and 256) is presumably the batch size; confirm against train.DL.
train_dl = DL(*df1, 0x1000 * 2, shuffle=True)
val_dl = DL(*df2, 0x100, shuffle=False)

# `Params` is never defined or imported in this notebook, so the original call
# raised NameError on a fresh kernel. `MFParams` is imported from `model`
# together with `MF`; it is assumed to accept the same arguments (TODO
# confirm against model.MFParams). 0.001 is presumably the learning rate, since
# `params.lr` is handed to the optimizer in this cell; the meaning of 32 is
# not visible here.
params = MFParams(
    nu,
    nm,
    0.001,
    32,
    use_bias=True,
    usePQ=False,
)
def loss_fn(pred, real, embedings, usePQ=False):
    """Return the mean squared error between ``pred`` and ``real``.

    ``embedings`` and ``usePQ`` are accepted for signature compatibility
    but are not used. The function previously carried an L1 +
    Frobenius-norm regularised variant after the ``return``; that code
    could never run and has been removed, so the result is unchanged.

    Parameters
    ----------
    pred, real : torch.Tensor
        Predictions and targets, forwarded to ``F.mse_loss``.
    embedings : any
        Ignored.
    usePQ : bool, optional
        Ignored.

    Returns
    -------
    torch.Tensor
        The value returned by ``F.mse_loss(pred, real)``.
    """
    return F.mse_loss(pred, real)
# Build the MF model from the params above and move it to the chosen device.
model = MF(params, seed=seed).to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=params.lr,
    weight_decay=.0001,
)

# No learning-rate scheduler for this run. A previously tried variant:
# torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=20, cooldown=20, min_lr=.00001)
scheduler = None
last_state = {}

# The metric callback ignores any extra positional arguments and always passes
# None as compute_f1's third argument. 500 is presumably the epoch count;
# confirm against train.train2.
losses, f1, val_losses, val_f1 = train2(
    model,
    optimizer,
    train_dl,
    loss_fn,
    lambda a, b, *_: compute_f1(a, b, None),
    500,
    device,
    scheduler=scheduler,
    val_ld=val_dl,
    last_state=last_state,
)
plot(losses, f1, val_losses, val_f1)

# Save the model together with the optimizer and the recorded curves.
model.save(
    "models/mf.pt",
    optimizer=optimizer,
    losses=losses,
    f1=f1,
    val_losses=val_losses,
    val_f1=val_f1,
)

In [None]:
# `ratings` does not exist at notebook scope (it is only a local inside `load`
# and a parameter of `predict`), so the pivot below raised NameError on a
# fresh kernel. Re-read it with the same call `load` uses.
ratings = read_df("", 200000)[["userId", "movieId", "rating"]]

# Dense user x movie table of ratings; pairs without a rating become 0.
matrix = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)
nu, nm = matrix.shape

def loss_fn(pred, real, embedings, _):
    """Mean squared error of ``pred`` against ``real``.

    The last two positional arguments are accepted but never read.
    """
    mse = F.mse_loss(pred, real)
    return mse
set_seed(seed)

# Set to True to resume from the saved checkpoint instead of starting from
# the default NCF configuration.
load_model = False
if load_model:
    ncf, ncf_opt, last_state = NCF.load("models/ncf.pt", optCls=torch.optim.AdamW)
else:
    ncf, ncf_opt, last_state = default_ncf(nu, nm, device)

# No learning-rate scheduler for this run. A previously tried variant:
# torch.optim.lr_scheduler.ReduceLROnPlateau(ncf_opt, 'min', patience=50, cooldown=50, min_lr=.00001)
scheduler = None

# Validation loader over the held-out tensors; never shuffled.
val_dl = DL(*df2, 0x100, shuffle=False)
def train_dl(ubs, mbs, ratings_matrix=None):
    """Yield blocks of the dense ratings matrix, tile by tile.

    The previous version built ``torch.arange(i, i + ubs)`` and
    ``torch.arange(j, j + mbs)`` without clamping, so the last tile ran past
    the matrix edge whenever a dimension was not a multiple of the block
    size, and ``iloc`` raised an out-of-bounds ``IndexError``. Ranges are now
    clamped to the matrix shape.

    NOTE: later in this cell the name ``train_dl`` is rebound to a ``DL``
    instance, so this generator is not what the training call receives.

    Parameters
    ----------
    ubs : int
        Number of rows (users) per block.
    mbs : int
        Number of columns (movies) per block.
    ratings_matrix : pandas.DataFrame, optional
        Matrix to iterate over. Defaults to the notebook-level ``matrix``.

    Yields
    ------
    tuple
        ``(u_ids, m_ids, ratings)``: the row positions, the column
        positions and the matching block of values as a tensor.
    """
    source = matrix if ratings_matrix is None else ratings_matrix
    n_users, n_movies = source.shape
    for i in range(0, n_users, ubs):
        u_ids = torch.arange(i, min(i + ubs, n_users))
        for j in range(0, n_movies, mbs):
            m_ids = torch.arange(j, min(j + mbs, n_movies))
            block = source.iloc[u_ids.tolist(), m_ids.tolist()]
            yield u_ids, m_ids, torch.from_numpy(block.values)
# Rebinds `train_dl` to a shuffled loader over the training tensors; this
# replaces the generator function of the same name defined above.
train_dl = DL(*df1, 0x1000 * 2, shuffle=True)

# The metric callback ignores any extra positional arguments and always passes
# None as compute_f1's third argument. 500 is presumably the epoch count;
# confirm against train.train2.
losses, f1, val_losses, val_f1 = train2(
    ncf,
    ncf_opt,
    train_dl,
    loss_fn,
    lambda a, b, *_: compute_f1(a, b, None),
    500,
    device,
    scheduler=scheduler,
    val_ld=val_dl,
    last_state=last_state,
)
plot(losses, f1, val_losses, val_f1)

# Save the model together with the optimizer and the recorded curves.
ncf.save(
    "models/ncf.pt",
    optimizer=ncf_opt,
    losses=losses,
    f1=f1,
    val_losses=val_losses,
    val_f1=val_f1,
)

In [None]:
from geturls import load_df,url
# File handed to geturls.load_df. `s=10000` is forwarded unchanged; its
# meaning is not visible in this notebook (confirm against geturls.load_df).
covers_path = "covers.csv"
df = load_df(covers_path, s=10000)
# Last expression of the cell: display the first rows.
df.head()

In [None]:
from bs4 import BeautifulSoup
import requests
# Exact `class` attribute values of the <div> expected to wrap the cover image.
possible_classes = ["sc-491663c0-7 dUfBfF", "sc-491663c0-7 jmhiib"]
def get(i, id):
    """Fetch the page for ``id`` and return the ``src`` of its cover image.

    Best-effort: ordinary errors never propagate. On failure the second
    element of the result is a negative code naming the stage that failed:

    * ``-1`` building the URL, the request itself, a non-200 status, or
      decoding/parsing the body failed
    * ``-2`` no ``<div>`` with any of ``possible_classes`` was found
    * ``-3`` that ``<div>`` did not contain exactly one ``<img>``
    * ``-4`` the ``<img>`` had no ``src`` attribute

    Parameters
    ----------
    i : int
        Running counter; it is printed whenever it is a multiple of 100.
    id
        Identifier passed to ``url(id)`` to build the page address.

    Returns
    -------
    tuple
        ``(id, src)`` on success, ``(id, error_code)`` otherwise.
    """
    err = -1
    try:
        if i % 100 == 0:
            print(i)
        resp = requests.get(url(id), headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
        }, timeout=10)
        if resp.status_code != 200:
            return (id, err)
        soup = BeautifulSoup(resp.content.decode(), "html.parser")
        err = -2
        div = None
        # Every candidate class is tried; if several match, the last one in
        # `possible_classes` wins (unchanged from the original loop).
        for c in possible_classes:
            divs = soup.find_all("div", {"class": c})
            if divs:
                div = divs[0]
        if div is None:
            # The original used `raise ""`, which is not a valid raise and
            # only worked because the resulting TypeError was swallowed below.
            return (id, err)
        err = -3
        # Unpacking fails unless there is exactly one <img>.
        [img] = div.find_all("img")
        err = -4
        return id, img["src"]
    except Exception:
        # Deliberate best-effort: report the stage reached instead of raising,
        # so one bad page cannot abort a long scraping run.
        return (id, err)
# Manual check on a single id: the cell displays the result of get(...)
# next to the URL that was requested.
imdb = 125877
get(0, imdb),url(imdb)