## Задние 1

In [6]:
import pandas as pd
from tqdm import tqdm

def get_smaple(path: str, limit=100000):

    sample_dict = {'user_id': [], 'item_id': [], 'order': []}

    with open(path, 'r') as f:
        for idx, line in tqdm(enumerate(f.readlines())):
            if idx > limit:
                break
            items = [int(item) for item in line.split()]
            sample_dict['user_id'].extend([idx for _ in items])
            sample_dict['item_id'].extend([item for item in items])
            sample_dict['order'].extend([i for i in reversed(range(len(items)))])

    return pd.DataFrame(sample_dict)


train_path = '/home/ndubrovnyi/PycharmProjects/itmo_recsys_2022/likes/likes_data/train'

interactions = get_smaple(train_path)
interactions.head()

100001it [00:01, 77743.59it/s]


Unnamed: 0,user_id,item_id,order
0,0,333396,53
1,0,267089,52
2,0,155959,51
3,0,353335,50
4,0,414000,49


In [8]:
from sklearn.model_selection import KFold
import numpy as np

class UsersKFoldPOut():
    def __init__(self, n_folds, p, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed

    def split(self, df):
        users = df['user_id'].unique()
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_seed)
        for train_index, test_index in kf.split(users):
            train_mask = df.user_id.isin(users[train_index])
            test_mask = (df.user_id.isin(users[test_index])) & (df.order < self.p)

            yield train_mask, test_mask


In [9]:
cv = UsersKFoldPOut(n_folds=3, p=50)

for i, (train_mask, test_mask) in enumerate(cv.split(interactions)):
    train = interactions[train_mask]
    test = interactions[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')
#     print(set(train.user_id).intersection(set(test.user_id)))

Fold#0 | Train: 5411643, Test: 1117720
Fold#1 | Train: 5412945, Test: 1118109
Fold#2 | Train: 5438102, Test: 1117745


## Задание 2

In [18]:
import pandas as pd
import warnings
import numpy as np

qid_query = pd.read_csv("./yandex_cup_analytics_A/hidden_task/qid_query.tsv", sep="\t", names=["qid", "query"])
qid_url_rating = pd.read_csv("./yandex_cup_analytics_A/hidden_task/qid_url_rating.tsv", sep="\t", names=["qid", "url", "rating"])
hostid_url = pd.read_csv("./yandex_cup_analytics_A/hidden_task/hostid_url.tsv", sep="\t", names=["hostid", "url"])

qid_url_rating_hostid = pd.merge(qid_url_rating, hostid_url, on="url")

In [19]:
%%time
## Исходное решение

warnings.filterwarnings("ignore")


def plook(ind, rels):
    if ind == 0:
                return 1
    return plook(ind-1, rels)*(1-rels[ind-1])*(1-0.15)


def pfound(group):
    max_by_host = group.groupby("hostid")["rating"].max() # максимальный рейтинг хоста
    top10 = max_by_host.sort_values(ascending=False)[:10] # берем топ10 урлов с наивысшим рейтингом
    pfound = 0
    for ind, val in enumerate(top10):
                pfound += val*plook(ind, top10.values)
    return pfound


qid_pfound = qid_url_rating_hostid.groupby('qid').apply(pfound)
print('Mean pFound:', np.mean(qid_pfound))


Mean pFound: 0.5822199638393889
CPU times: user 20.6 ms, sys: 23 µs, total: 20.6 ms
Wall time: 18.9 ms


In [20]:
%%time
## Решение без циклов

import pandas as pd


def pfound_by_group(group):
    qid_group = group.sort_values(ascending=False).reset_index().rename(columns={group.name: 'rating'})
    qid_group['1_pRel'] = 1 - qid_group['rating']
    qid_group['1_pBreak'] = 1 - pBreak
    qid_group['cum_1_pRel'] = qid_group['1_pRel'].cumprod().shift(1)
    qid_group['cum_1_pBreak'] = qid_group['1_pBreak'].cumprod().shift(1)
    qid_group['plook'] = qid_group['cum_1_pBreak'] * qid_group['cum_1_pRel']
    print(qid_group)
    qid_group['plook'].fillna(1, inplace=True)
    qid_group['plook_prod_rel'] = qid_group['plook'] * qid_group['rating']
    return qid_group['plook_prod_rel'].sum()


def get_pfound(df, topk=10):
    new_df = df.groupby(['qid', 'hostid'])['rating'].max().reset_index()
    new_df = new_df.sort_values(by=['qid', 'rating'], ascending=False).groupby('qid').head(topk)
    new_df['1_pRel'] = 1 - new_df['rating']
    new_df['1_pBreak'] = 1 - pBreak
    cumprod_df = new_df.groupby('qid')[['1_pRel', '1_pBreak']].cumprod().shift(1)
    cumprod_df.loc[::10] = None
    new_df[['cum_1_pRel', 'cum_1_pBreak']] = cumprod_df
    new_df['plook'] = new_df['cum_1_pBreak'] * new_df['cum_1_pRel']
    new_df['plook'].fillna(1, inplace=True)
    new_df['plook_prod_rel'] = new_df['plook'] * new_df['rating']

    return np.mean(new_df.groupby('qid')['plook_prod_rel'].sum())




pBreak = 0.15

pfound = get_pfound(qid_url_rating_hostid)
print('Mean pFound:', pfound)


Mean pFound: 0.5822199638393888
CPU times: user 11.1 ms, sys: 12 µs, total: 11.1 ms
Wall time: 10.6 ms
