In [1]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ndcg_score, roc_auc_score
import numpy as np

In [8]:
user_interest_df = pd.read_feather('../../datasets/raw/user_interest.feather')
user_df = pd.read_feather('../../datasets/raw/user.feather')
video_df = pd.read_feather('../../datasets/raw/video.feather')
vote_df = pd.read_feather('../../datasets/raw/vote.feather')
season_df = pd.read_feather('../../datasets/raw/season.feather')
post_feed_df = pd.read_feather('../../datasets/raw/post_feed.feather')  

In [9]:
user_interest_df = user_df["id"].to_frame().merge(user_interest_df, left_on="id", right_on="user_id", how="left", suffixes=["_user", "_interaction"])
user_interest_df["count"] = 1
user_interest_df = user_interest_df.pivot(index="id", columns="name", values="count")
user_interest_df = user_interest_df.loc[:, user_interest_df.columns.notna()].fillna(0)
user_interest_df

name,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00009c76-592e-46a3-8754-59d68e1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000a373-24ff-4bb3-8ff1-f924161a2b3c,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
00011b54-e070-4d0b-b8db-76cb9f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00013ce3-6800-4f34-914b-6c99171a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffc61cb-db48-450d-a721-30fc8a1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffce31b-6034-4d11-8593-a576061a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffeb5ee-8285-4a08-806d-d07ff91a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
upvotes_df = vote_df[vote_df["created_at"] > '2023-07-01'].groupby(["voter_id", "video_id"])["id"].nunique().reset_index(name="upvotes")
upvotes_df

Unnamed: 0,voter_id,video_id,upvotes
0,000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1
1,000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1
2,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12
3,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1
4,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1
...,...,...,...
142299,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1
142300,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2
142301,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1
142302,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2


In [49]:
upvotes_df = upvotes_df.merge(video_df[["id", "season_id", "created_at"]], left_on="video_id", right_on="id", how="left", suffixes=["_upvote", "_video"])
upvotes_df

Unnamed: 0,voter_id,video_id,upvotes,id,season_id,created_at
0,000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19
1,000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,2023-04-22 02:04:19
2,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-14 17:18:31
3,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1,c7716a3c-5a03-4563-8364-823ebd1a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-15 06:24:14
4,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19
...,...,...,...,...,...,...
142299,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:38:07
142300,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 02:54:38
142301,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1,95c9270c-e6db-4520-823a-447aee1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:36:59
142302,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2,c05f4ba5-8830-4be5-808c-8706e81a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:31:12


In [50]:
upvotes_category_df = upvotes_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
upvotes_category_df

Unnamed: 0,voter_id,video_id,upvotes,id_video,season_id,created_at,id_season,category
0,000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,DANCE
1,000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,2023-04-22 02:04:19,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,DANCE
2,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-14 17:18:31,43fda735-831b-41e7-9100-39657e1a2b3c,OTHERS
3,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1,c7716a3c-5a03-4563-8364-823ebd1a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-15 06:24:14,43fda735-831b-41e7-9100-39657e1a2b3c,OTHERS
4,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,DANCE
...,...,...,...,...,...,...,...,...
142299,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:38:07,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
142300,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 02:54:38,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
142301,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1,95c9270c-e6db-4520-823a-447aee1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:36:59,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
142302,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2,c05f4ba5-8830-4be5-808c-8706e81a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:31:12,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY


In [51]:
upvotes_category_df = upvotes_category_df.pivot_table(columns="category", index="voter_id", values="upvotes", aggfunc=sum, fill_value=0)
upvotes_category_df["CRYPTO"] = 0
upvotes_category_df["FINANCE"] = 0
upvotes_category_df

  upvotes_category_df = upvotes_category_df.pivot_table(columns="category", index="voter_id", values="upvotes", aggfunc=sum, fill_value=0)


category,ART&DESIGN,COMEDY,DANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,CRYPTO,FINANCE
voter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
000064da-cce1-4dec-929f-ea2d871a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0
000068ac-49ae-421a-86ee-83ea321a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0
00009c76-592e-46a3-8754-59d68e1a2b3c,0,0,0,0,0,0,0,0,0,13,0,0,0,0
000233b9-d92e-4c07-9bf7-f5d8911a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0
00025abe-b96e-4a10-afd5-ec79161a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcfcb4-ceb8-458f-992e-79c71e1a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0
fffd3fe2-3c84-4d3d-b85d-5eebdf1a2b3c,0,0,3,0,0,0,0,0,0,0,0,0,0,0
fffeb5ee-8285-4a08-806d-d07ff91a2b3c,0,0,0,0,0,0,0,0,0,0,0,2,0,0
ffff4d00-0004-4aae-a6db-47ffef1a2b3c,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [52]:
user_interest_df = user_interest_df.add(upvotes_category_df, fill_value=0)
user_interest_df

Unnamed: 0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY
000064da-cce1-4dec-929f-ea2d871a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000068ac-49ae-421a-86ee-83ea321a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00009c76-592e-46a3-8754-59d68e1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0
0000a373-24ff-4bb3-8ff1-f924161a2b3c,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcfcb4-ceb8-458f-992e-79c71e1a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffd3fe2-3c84-4d3d-b85d-5eebdf1a2b3c,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fffeb5ee-8285-4a08-806d-d07ff91a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
ffff4d00-0004-4aae-a6db-47ffef1a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
user_interest_df.reset_index(names="user_id", inplace=True)
interaction_df = user_interest_df.merge(upvotes_df[["voter_id", "video_id", "upvotes"]], left_on="user_id", right_on="voter_id", how="left", suffixes=["_user", "_upvotes"])
interaction_df

Unnamed: 0,user_id,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,voter_id,video_id,upvotes
0,000064da-cce1-4dec-929f-ea2d871a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1.0
1,000068ac-49ae-421a-86ee-83ea321a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1.0
2,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,00009c76-592e-46a3-8754-59d68e1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12.0
4,00009c76-592e-46a3-8754-59d68e1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165497,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1.0
165498,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2.0
165499,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1.0
165500,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2.0


In [54]:
interaction_df.drop(columns="voter_id", inplace=True)
no_likes = interaction_df[interaction_df["video_id"].isna()]
interaction_df.dropna(subset="video_id", axis=0, inplace=True)
no_likes = no_likes.drop(columns=["video_id", "upvotes"]).merge(pd.DataFrame(interaction_df["video_id"].unique(), columns=["video_id"]), how="cross", suffixes=["_no_like", ""])
no_likes

Unnamed: 0,user_id,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,upvotes,video_id
0,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,fd4e8fd6-9524-4790-b325-7170411a2b3c
1,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c
2,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,00033275-cbd8-4ee5-8c1f-7a50491a2b3c
3,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,c7716a3c-5a03-4563-8364-823ebd1a2b3c
4,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,8442dd86-17b2-4887-983d-bc62c21a2b3c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60830863,fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,27ae2edb-6f94-42d9-97b6-ccd7a01a2b3c
60830864,fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,925312b2-00c3-4307-8e72-a86ba41a2b3c
60830865,fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,d5329147-f77d-474c-af4b-4c2eab1a2b3c
60830866,fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,653a33c2-298b-4de5-a1cd-a8488b1a2b3c


In [57]:
no_likes = no_likes.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
no_likes = no_likes.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
no_likes.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
no_likes.set_index(["user_id", "video_id"], inplace=True)
no_likes

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,upvotes,created_at,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,DANCE
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,DANCE
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-14 17:18:31,OTHERS
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-15 06:24:14,OTHERS
00009b00-b3ae-458a-841e-e3ad5f1a2b3c,8442dd86-17b2-4887-983d-bc62c21a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,DANCE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,27ae2edb-6f94-42d9-97b6-ccd7a01a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,925312b2-00c3-4307-8e72-a86ba41a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,d5329147-f77d-474c-af4b-4c2eab1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,653a33c2-298b-4de5-a1cd-a8488b1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-04-22 02:04:19,DANCE


In [55]:
interaction_df = interaction_df.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
interaction_df = interaction_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
interaction_df.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
interaction_df.set_index(["user_id", "video_id"], inplace=True)
interaction_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,upvotes,created_at,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2023-04-22 02:04:19,DANCE
000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2023-04-22 02:04:19,DANCE
00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,12.0,2023-06-14 17:18:31,OTHERS
00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,1.0,2023-06-15 06:24:14,OTHERS
000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2023-04-22 02:04:19,DANCE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,2023-05-02 03:38:07,STYLE&BEAUTY
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,2.0,2023-05-02 02:54:38,STYLE&BEAUTY
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,2023-05-02 03:36:59,STYLE&BEAUTY
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,2.0,2023-05-02 03:31:12,STYLE&BEAUTY


In [14]:
end_date = interaction_df["created_at"].max() - timedelta(weeks=2)
train = interaction_df[interaction_df["created_at"] < end_date].drop(columns="created_at")
train

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,interaction,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DANCE
000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DANCE
00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,1,OTHERS
00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,1,OTHERS
000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DANCE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,27ae2edb-6f94-42d9-97b6-ccd7a01a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,925312b2-00c3-4307-8e72-a86ba41a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,d5329147-f77d-474c-af4b-4c2eab1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,OTHERS
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,653a33c2-298b-4de5-a1cd-a8488b1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,DANCE


In [15]:
enc = OrdinalEncoder(encoded_missing_value=-1)
enc.fit(interaction_df["category"].to_frame())
train["category"] = enc.transform(train["category"].to_frame())
train

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,interaction,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0
000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0
00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,1,9.0
00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,1,9.0
000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,27ae2edb-6f94-42d9-97b6-ccd7a01a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,925312b2-00c3-4307-8e72-a86ba41a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,d5329147-f77d-474c-af4b-4c2eab1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,653a33c2-298b-4de5-a1cd-a8488b1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2.0


In [17]:
model = RandomForestClassifier()
model.fit(train.loc[:, train.columns != "interaction"], train["interaction"])

In [18]:
test = interaction_df[interaction_df["created_at"] >= end_date].drop(columns="created_at") 
test["category"] = enc.transform(test["category"].to_frame())
test

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,interaction,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01074566-50bd-4c00-8092-1a22431a2b3c,56e03cea-a1de-445f-ab0b-b5d3581a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0
01074566-50bd-4c00-8092-1a22431a2b3c,8a9a9eba-cb2d-4781-b591-3f61ab1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0
01074566-50bd-4c00-8092-1a22431a2b3c,8daea8ee-0e99-4e02-a79e-49f5051a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0
01074566-50bd-4c00-8092-1a22431a2b3c,8ea4abeb-766b-4b36-ab05-d94d9f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0
01074566-50bd-4c00-8092-1a22431a2b3c,ce1aeb08-0c12-40ef-9214-27c1e21a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,b261bb87-08d6-48da-976a-6217ba1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,ba2d2a63-52c7-4c11-9ba0-abd27b1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,11.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,24afe606-cbd4-496d-9c3f-a14aa41a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,2d8f252f-96ec-4fac-912d-398fe11a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9.0


In [19]:
test["prediction"] = model.predict(test.loc[:, test.columns != "interaction"])
test

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,interaction,category,prediction
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
01074566-50bd-4c00-8092-1a22431a2b3c,56e03cea-a1de-445f-ab0b-b5d3581a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,1
01074566-50bd-4c00-8092-1a22431a2b3c,8a9a9eba-cb2d-4781-b591-3f61ab1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,1
01074566-50bd-4c00-8092-1a22431a2b3c,8daea8ee-0e99-4e02-a79e-49f5051a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,1
01074566-50bd-4c00-8092-1a22431a2b3c,8ea4abeb-766b-4b36-ab05-d94d9f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,1
01074566-50bd-4c00-8092-1a22431a2b3c,ce1aeb08-0c12-40ef-9214-27c1e21a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,1,-1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,b261bb87-08d6-48da-976a-6217ba1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.0,0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,ba2d2a63-52c7-4c11-9ba0-abd27b1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,11.0,0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,24afe606-cbd4-496d-9c3f-a14aa41a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,2d8f252f-96ec-4fac-912d-398fe11a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9.0,0


In [24]:
results = {}
results["roc_auc"] = roc_auc_score(test["interaction"], test["prediction"])
results["accuracy"] = accuracy_score(test["interaction"], test["prediction"])
results["precision"] = precision_score(test["interaction"], test["prediction"])
results["recall"] = recall_score(test["interaction"], test["prediction"])
results["f1"] = f1_score(test["interaction"], test["prediction"])
top_20_indices = np.argsort(-np.array(test["prediction"]))[:20]
results["hit_ratio"] = int(any(test["interaction"][i] == 1 for i in top_20_indices))
results["ndcg"] = ndcg_score([np.array(test["interaction"])], [np.array(test["prediction"])], k=20)
print(results)

  results["hit_ratio"] = int(any(test["interaction"][i] == 1 for i in top_20_indices))


{'roc_auc': 0.9687739367851901, 'accuracy': 0.9999308658535325, 'precision': 0.9551122194513716, 'recall': 0.9375764993880049, 'f1': 0.9462631253860407, 'hit_ratio': 1, 'ndcg': 0.9551122194513715}


In [19]:
test["prediction"].nunique()

2

In [3]:
def get_end_date() -> str:
    # Calculate end date as 2 weeks before today
    today = datetime.now()
    end_date = (today - timedelta(weeks=2)).strftime('%Y-%m-%d')
    return end_date

def get_num_cycles(start_date: str) -> int:
    # Get today's date
    today_date = datetime.now().strftime('%Y-%m-%d')
    start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
    end_datetime = datetime.strptime(today_date, '%Y-%m-%d')
    date_difference = (end_datetime - start_datetime).days

    return date_difference

def train_test_split_for_data(data: pd.DataFrame, date_col: str, start_date: str):
    train_data = data[data[date_col] <= start_date]
    test_data = data[(data[date_col] > start_date) & (data[date_col] > get_end_date())]
    return train_data, test_data

def create_interaction_df(user_interest_df: pd.DataFrame, user_df: pd.DataFrame, season_df: pd.DataFrame, 
                              video_df: pd.DataFrame, vote_df: pd.DataFrame, date: str):
    user_interest_df = user_df["id"].to_frame().merge(user_interest_df, left_on="id", right_on="user_id", how="left", suffixes=["_user", "_interaction"])
    user_interest_df["count"] = 1
    user_interest_df = user_interest_df.pivot(index="id", columns="name", values="count")
    user_interest_df = user_interest_df.loc[:, user_interest_df.columns.notna()].fillna(0)

    upvotes_df = vote_df[vote_df["created_at"] > datetime.strptime(date, '%Y-%m-%d') - timedelta(weeks=1)].groupby(["voter_id", "video_id"])["id"].nunique().reset_index(name="upvotes")
    upvotes_df = upvotes_df.merge(video_df[["id", "season_id", "created_at"]], left_on="video_id", right_on="id", how="left", suffixes=["", "_video"])

    upvotes_category_df = upvotes_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
    upvotes_category_df = upvotes_category_df.pivot_table(columns="category", index="voter_id", values="upvotes", aggfunc=sum, fill_value=0)
    upvotes_category_df["CRYPTO"] = 0
    upvotes_category_df["FINANCE"] = 0

    user_interest_df = user_interest_df.add(upvotes_category_df, fill_value=0)
    user_interest_df.reset_index(names="user_id", inplace=True)

    interaction_df = user_interest_df.merge(upvotes_df[["voter_id", "video_id", "upvotes"]], left_on="user_id", right_on="voter_id", how="left", suffixes=["_user", "_upvotes"])
    interaction_df.drop(columns="voter_id", inplace=True)
    no_likes = interaction_df[interaction_df["video_id"].isna()]
    interaction_df.dropna(subset="video_id", axis=0, inplace=True)

    no_likes = no_likes.drop(columns=["video_id"]).merge(pd.DataFrame(interaction_df["video_id"].unique(), columns=["video_id"]), how="cross", suffixes=["_no_like", ""])
    no_likes = no_likes.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
    no_likes = no_likes.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
    no_likes.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
    no_likes.set_index(["user_id", "video_id"], inplace=True)

    interaction_df = interaction_df.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
    interaction_df = interaction_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
    interaction_df.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
    interaction_df.set_index(["user_id", "video_id"], inplace=True)

    train, _ = train_test_split_for_data(interaction_df, 'created_at', date)
    enc = OrdinalEncoder(encoded_missing_value=-1)
    enc.fit(interaction_df["category"].to_frame())
    train["category"] = enc.transform(train["category"].to_frame())

    no_likes["category"] = enc.transform(no_likes["category"].to_frame())

    train.drop(columns="created_at", inplace=True)
    no_likes.drop(columns="created_at", inplace=True)
    
    return train, no_likes

def find_top_k_videos(user_id, k, prediction_df):
    return prediction_df.nlargest(k, "prediction")

def hit_ratio_at_k(y_true, y_pred, K):
    top_k_indices = np.argsort(-np.array(y_pred))[:K]
    return int(any(y_true[i] == 1 for i in top_k_indices))  # 1 if at least one relevant item is in top-K, 0 otherwise

def ndcg_at_k(y_true, y_pred, K):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred) 
    return ndcg_score([y_true], [y_pred], k=K)

def get_summary_statistics(vote_df, train_df, test_df, date, K):
    _, vote_test = train_test_split_for_data(vote_df, 'created_at', date)
    vote_test['created_at'] = vote_test['created_at'].dt.date
    model_statistics = pd.DataFrame(columns=['dt', 'roc_auc_score', 'accuracy', 'precision', 'recall', 'f1_score', 'hit_ratio_k', 'ndcg_k'])
    model = RandomForestClassifier()
    model.fit(train_df.loc[:, train_df.columns != "upvotes"], train_df["upvotes"])
    test_df["prediction"] = model.predict(test_df.loc[:, test_df.columns != "upvotes"])

    for day in sorted(vote_test['created_at'].unique()):
        print(day)
        voted_videos_for_day = vote_test[vote_test['created_at'] == day]
        summary_statistics = pd.DataFrame(columns=['user_id', 'roc_auc_score', 'accuracy', 'precision', 'recall', 'f1_score', 'hit_ratio_k', 'ndcg_k'])

        for user_id in voted_videos_for_day['voter_id'].unique():
            if user_id not in train_df.index:
                continue
            
            # create dataframe to calculate confusion matrix
            user_voted_videos = voted_videos_for_day[voted_videos_for_day['voter_id'] == user_id]
            y_true_and_pred = pd.DataFrame(index=test_df.index.get_level_values("video_id"))
            y_true_and_pred['true'] = np.where(y_true_and_pred.index.isin(user_voted_videos['video_id']), 1, 0)

            recommendations = find_top_k_videos(user_id, 20, test_df)
            y_true_and_pred['pred'] = np.where(y_true_and_pred.index.isin(recommendations.index), 1, 0)

            try:
                roc_auc = roc_auc_score(y_true_and_pred['true'], y_true_and_pred['pred'])
                accuracy = accuracy_score(y_true_and_pred['true'], y_true_and_pred['pred'])
                precision = precision_score(y_true_and_pred['true'], y_true_and_pred['pred'])
                recall = recall_score(y_true_and_pred['true'], y_true_and_pred['pred'])
                f1 = f1_score(y_true_and_pred['true'], y_true_and_pred['pred'])
                hit_ratio = hit_ratio_at_k(y_true_and_pred['true'], y_true_and_pred['pred'], K)
                ndcg = ndcg_at_k(y_true_and_pred['true'], y_true_and_pred['pred'], K)
            except ValueError:
                print(f'ROC AUC for {user_id} not valid')

            summary_statistics.loc[len(summary_statistics)] = [user_id, roc_auc, accuracy, precision, recall, f1, hit_ratio, ndcg]
        
        model_statistics.loc[len(model_statistics)] = np.append(np.array(day), summary_statistics.iloc[:,1:].mean().values)

    return model_statistics

def run_random_forest(date, K, num_cycles):
    user_interest_df = pd.read_feather('datasets/raw/user_interest.feather')
    user_df = pd.read_feather('datasets/raw/user.feather')
    season_df = pd.read_feather('datasets/raw/season.feather')
    video_df = pd.read_feather('datasets/raw/video.feather')
    vote_df = pd.read_feather('datasets/raw/vote.feather')

    model_statistics = pd.DataFrame(columns=['dt', 'roc_auc_score', 'accuracy', 'precision', 'recall', 'f1_score', 'hit_ratio_k', 'ndcg_k'])
    # for cycle in range(num_cycles):
    train_df, test_df = create_interaction_df(user_interest_df, user_df, season_df, video_df, vote_df, date)
    model_statistics_for_training_cycle = get_summary_statistics(vote_df, train_df, test_df, date, K)
    model_statistics = pd.concat([model_statistics, model_statistics_for_training_cycle])
    # date = get_end_date()

    model_statistics['model'] = 'random_forest'
    model_statistics.to_csv('datasets/final/random_forest_video.csv', index=False)

In [9]:
date = "2023-07-01"
user_interest_df = user_df["id"].to_frame().merge(user_interest_df, left_on="id", right_on="user_id", how="left", suffixes=["_user", "_interaction"])
user_interest_df["count"] = 1
user_interest_df = user_interest_df.pivot(index="id", columns="name", values="count")
user_interest_df = user_interest_df.loc[:, user_interest_df.columns.notna()].fillna(0)

upvotes_df = vote_df[vote_df["created_at"] > datetime.strptime(date, '%Y-%m-%d') - timedelta(weeks=1)].groupby(["voter_id", "video_id"])["id"].nunique().reset_index(name="upvotes")
upvotes_df = upvotes_df.merge(video_df[["id", "season_id", "created_at"]], left_on="video_id", right_on="id", how="left", suffixes=["", "_video"])

upvotes_category_df = upvotes_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
upvotes_category_df = upvotes_category_df.pivot_table(columns="category", index="voter_id", values="upvotes", aggfunc=sum, fill_value=0)
upvotes_category_df["CRYPTO"] = 0
upvotes_category_df["FINANCE"] = 0

user_interest_df = user_interest_df.add(upvotes_category_df, fill_value=0)
user_interest_df.reset_index(names="user_id", inplace=True)

interaction_df = user_interest_df.merge(upvotes_df[["voter_id", "video_id", "upvotes"]], left_on="user_id", right_on="voter_id", how="left", suffixes=["_user", "_upvotes"])
interaction_df.drop(columns="voter_id", inplace=True)
no_likes = interaction_df[interaction_df["video_id"].isna()]
interaction_df.dropna(subset="video_id", axis=0, inplace=True)

no_likes = no_likes.drop(columns=["video_id"]).merge(pd.DataFrame(interaction_df["video_id"].unique(), columns=["video_id"]), how="cross", suffixes=["_no_like", ""])
no_likes = no_likes.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
no_likes = no_likes.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
no_likes.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
no_likes.set_index(["user_id", "video_id"], inplace=True)
no_likes = no_likes[(no_likes["created_at"] > datetime.strptime(date, '%Y-%m-%d') - timedelta(weeks=1)) & (no_likes["created_at"] <= date)]

interaction_df = interaction_df.merge(video_df[["id", "created_at", "season_id"]], left_on="video_id", right_on="id", how="left")
interaction_df = interaction_df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left", suffixes=["_video", "_season"])
interaction_df.drop(columns=["id_video", "season_id", "id_season"], inplace=True)
interaction_df.set_index(["user_id", "video_id"], inplace=True)

interaction_df = pd.concat([interaction_df, no_likes])

  upvotes_category_df = upvotes_category_df.pivot_table(columns="category", index="voter_id", values="upvotes", aggfunc=sum, fill_value=0)


In [10]:
interaction_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ART&DESIGN,COMEDY,CRYPTO,DANCE,FINANCE,FOOD&DRINKS,GAMING,HACKS&PRODUCTIVITY,LIFESTYLE,MUSIC,NFT,OTHERS,SPORTS&FITNESS,STYLE&BEAUTY,upvotes,created_at,category
user_id,video_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0,2023-06-14 17:18:31,OTHERS
0046973b-47ef-4f80-83fc-fbdedf1a2b3c,662fdb13-370d-4d3e-9384-4539b71a2b3c,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,2023-07-21 16:56:21,LIFESTYLE
006ff3e5-d41b-40e5-8696-46bf9e1a2b3c,f57e6a19-147b-4219-8985-823b601a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2023-07-12 14:28:49,OTHERS
00bd451c-5857-4d25-acca-224de81a2b3c,65301db2-1c9e-4fae-b65c-30a0881a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,2023-07-22 09:50:27,LIFESTYLE
00d577db-ee03-4c58-ba87-eb6cac1a2b3c,00a4d834-a252-44e8-b78c-fe9bcc1a2b3c,0.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,2023-05-19 07:14:19,GAMING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,dcca9fe9-b872-4c4c-ac0b-39339c1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-27 05:16:03,STYLE&BEAUTY
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,ff39372d-09bd-4608-af7d-9892d51a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-30 20:41:22,STYLE&BEAUTY
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,1fbc06bc-e7a0-464a-abd7-247e0f1a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-26 04:46:51,OTHERS
ffff7f3c-c253-45e0-a881-e7988c1a2b3c,5df69a83-c4c7-40d3-aed1-225cb91a2b3c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,2023-06-26 02:15:38,SPORTS&FITNESS


In [7]:
video_df.loc[:, video_df.columns not in ["season_id", "created_at"]]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()