In [None]:
import json
import math
import os
import time
from functools import partial
from urllib.parse import urlencode
from fastparquet import write
import faiss
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

import requests
from catboost import CatBoostRanker, Pool
from faiss import read_index, write_index
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence encoder used to embed the search queries; it is placed on the CUDA device.
# The author's original note also names "cointegrated/rubert-tiny2" -- presumably an
# alternative checkpoint that was tried (TODO confirm).
st_model = SentenceTransformer(
    "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli",
    device="cuda",
)

# CatBoost ranker restored from the "ranker.ckpt" checkpoint.
from_file = CatBoostRanker()
from_file.load_model("ranker.ckpt")

# JSON mapping that is later looked up with str(<FAISS result index>) to get a video id.
with open("ind2videoid.json", "r") as mapping_file:
    ind2videoid = json.load(mapping_file)


In [None]:
# Read the submission queries and embed the whole "query" column with the sentence encoder.
df_test = pd.read_csv("./test_dataset_submission_queries.csv")
queries = df_test["query"]
qembeddings = st_model.encode(
    queries,
    show_progress_bar=True,  # the encoder's default batch size is used
)

In [None]:
# Load the prebuilt FAISS index from disk.
search_cpu_index = read_index("candidates.index")

# Cell output for a quick sanity check: the index's `is_trained` flag and its `ntotal` count.
(search_cpu_index.is_trained, search_cpu_index.ntotal)

In [None]:
# Query the index in fixed-size slices of the embedding matrix and collect the
# per-batch results (one distances array and one indices array per batch).
batch_size = 500
topk = 300
num_batches = math.ceil(len(queries) / batch_size)

distance = []
faiss_ind = []
for batch_no, offset in enumerate(range(0, num_batches * batch_size, batch_size)):
    print(batch_no)
    query_block = qembeddings[offset:offset + batch_size]
    block_distances, block_indices = search_cpu_index.search(query_block, topk)
    distance.append(block_distances)
    faiss_ind.append(block_indices)

In [None]:
# Stack the per-batch search results row-wise into single 2-D arrays with one row
# per query and one column per retrieved neighbour.
#
# np.vstack is used instead of np.array(...).reshape(2000, 300) because it:
#   * does not hard-code the number of queries or neighbours,
#   * tolerates a shorter final batch (np.array would build a ragged/object array), and
#   * gives the same array back if this cell is executed again on the already
#     stacked result, so re-running the cell stays safe.
distance = np.vstack(distance)
faiss_ind = np.vstack(faiss_ind)

In [None]:
from tqdm import tqdm

# Build the candidate table: one row per (query, retrieved video) pair with a binary
# "top" flag marking the hits considered very close to the query.

# Not needed by the loop below any more; kept in case another cell still references it.
dtype = [('distance', float), ('index', int)]
generated_cand_name = "generated_candidates.parquet"

cand_columns = {"query": [], "video_id": [], "top": []}
for i, q in enumerate(tqdm(queries, total=len(queries))):
    vids = faiss_ind[i]
    dist = distance[i]
    # 2.5th percentile of this query's distances.
    percentel = np.percentile(dist, 2.5)

    # The flags are computed from `dist` in the same order as `vids`, so every flag
    # lines up with its own video id. (Previously the flags were taken from a
    # distance-sorted copy while the ids stayed in retrieval order, which misaligned
    # the two columns whenever the hits were not already in ascending order; for
    # already-ascending hits the output is unchanged.)
    if int(dist[0]) == 0:
        # First hit's distance truncates to 0: flag every hit with distance below 5.
        flags = [1 if d < 5 else 0 for d in dist]
    else:
        # Otherwise a hit must be below the percentile AND below twice the first distance.
        mulp_dist = dist[0] * 2
        flags = [1 if d < percentel and d < mulp_dist else 0 for d in dist]

    cand_columns["video_id"] += [ind2videoid[str(v)] for v in vids]
    cand_columns["query"] += [q] * len(vids)
    cand_columns["top"] += flags

generated_cand = pd.DataFrame(cand_columns)

In [None]:
# Stream the features parquet file batch by batch, keeping only the rows whose
# "report_date" equals `filter_date`, then drop the columns listed below.
filter_date = "2023-05-02"
features_parquet = pq.ParquetFile("features.parquet")

featuresArr = []
for record_batch in features_parquet.iter_batches():
    batch_df = record_batch.to_pandas()
    on_filter_date = batch_df["report_date"] == filter_date
    featuresArr.append(batch_df[on_filter_date])

dropped_feature_cols = [
    "v_channel_reg_datetime",
    "v_pub_datetime",
    "v_channel_type",
    "v_category",
]
features = pd.concat(featuresArr, axis=0).drop(columns=dropped_feature_cols)

In [None]:
# Per-video median of the remaining automarkup columns, reduced to a single
# "percent_of_watch" column computed as watchtime * 1000 / duration.
automarkup = pd.read_parquet("automarkup.parquet", engine="fastparquet")

# Columns removed before aggregating ('position' appears twice, exactly as in the
# original drop call).
automarkup_dropped_cols = [
    'query', 'datetime', 'is_authorized', 'position', 'position', 'vtop',
    'comment', 'channel', 'tv_show', 'season', 'emotion',
]
automarkup_percent = automarkup.drop(columns=automarkup_dropped_cols)

percent_of_watch = (
    automarkup_percent
    .groupby(['video_id'])
    .median()
    .assign(percent_of_watch=lambda med: (med['watchtime'] * 1000) / med['duration'])
    .drop(columns=['duration', 'watchtime'])
)

In [None]:
# Attach the per-video features to every candidate row (left join on "video_id"),
# then remove the "report_date" column and any fully duplicated rows.
full_df = (
    generated_cand
    .merge(features, how="left", left_on="video_id", right_on="video_id")
    .drop(columns="report_date")
    .drop_duplicates()
)

In [None]:
from catboost import CatBoostClassifier

# Restore the clickbait classifier from "clickbait_clf.ckpt" and store its prediction
# for every candidate row in a new "clickbait" column.
# NOTE(review): `clickbait_dataset_cols` is not defined in the visible cells -- it must
# be provided elsewhere before this cell runs.
clickbait_clf = CatBoostClassifier()
clickbait_clf.load_model("clickbait_clf.ckpt")

clickbait_inputs = full_df[clickbait_dataset_cols]
is_clickbait = clickbait_clf.predict(clickbait_inputs)
full_df['clickbait'] = is_clickbait

In [None]:
# Left-join the per-video "percent_of_watch" values onto the candidate table.
full_df = pd.merge(
    full_df,
    percent_of_watch,
    how="left",
    left_on="video_id",
    right_on="video_id",
)