In [8]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ndcg_score, roc_auc_score

In [9]:
user_interest_df = pd.read_feather('../../datasets/raw/user_interest.feather')
user_df = pd.read_feather('../../datasets/raw/user.feather')
video_df = pd.read_feather('../../datasets/raw/video.feather')
vote_df = pd.read_feather('../../datasets/raw/vote.feather')
season_df = pd.read_feather('../../datasets/raw/season.feather')
post_feed_df = pd.read_feather('../../datasets/raw/post_feed.feather')  

In [10]:
user_interest_df = user_interest_df.groupby("user_id")["name"].apply(", ".join).reset_index(name="interests")

df = user_df["id"].to_frame().merge(user_interest_df, left_on="id", right_on="user_id", how="left", suffixes=["_user", "_interaction"])
df

Unnamed: 0,id,user_id,interests
0,00009b00-b3ae-458a-841e-e3ad5f1a2b3c,,
1,00009c76-592e-46a3-8754-59d68e1a2b3c,,
2,0000a373-24ff-4bb3-8ff1-f924161a2b3c,0000a373-24ff-4bb3-8ff1-f924161a2b3c,"ART&DESIGN, CRYPTO, FINANCE, GAMING, NFT"
3,00011b54-e070-4d0b-b8db-76cb9f1a2b3c,,
4,00013ce3-6800-4f34-914b-6c99171a2b3c,,
...,...,...,...
67783,fffc61cb-db48-450d-a721-30fc8a1a2b3c,,
67784,fffcc67a-b6b3-4246-a554-7efc2f1a2b3c,,
67785,fffce31b-6034-4d11-8593-a576061a2b3c,,
67786,fffeb5ee-8285-4a08-806d-d07ff91a2b3c,,


In [11]:
upvotes_df = vote_df.groupby(["voter_id", "video_id"])["id"].nunique().reset_index(name="upvotes")
upvotes_df

Unnamed: 0,voter_id,video_id,upvotes
0,000064da-cce1-4dec-929f-ea2d871a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1
1,000068ac-49ae-421a-86ee-83ea321a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1
2,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12
3,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1
4,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1
...,...,...,...
142299,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1
142300,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2
142301,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1
142302,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2


In [13]:
df = df.merge(upvotes_df, left_on="id", right_on="voter_id", how="left")
df = df[df["video_id"].notna()]
df

Unnamed: 0,id,user_id,interests,voter_id,video_id,upvotes
1,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12.0
2,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1.0
6,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,,,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1.0
8,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,,,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,8442dd86-17b2-4887-983d-bc62c21a2b3c,1.0
9,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,,,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1.0
...,...,...,...,...,...,...
130814,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1.0
130815,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2.0
130816,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1.0
130817,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2.0


In [14]:
df = df.merge(video_df[["id", "season_id", "created_at"]], left_on="video_id", right_on="id", how="left", suffixes=["_user", "_video"])
df

Unnamed: 0,id_user,user_id,interests,voter_id,video_id,upvotes,id_video,season_id,created_at
0,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12.0,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-14 17:18:31
1,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1.0,c7716a3c-5a03-4563-8364-823ebd1a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-15 06:24:14
2,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,,,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1.0,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19
3,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,,,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,8442dd86-17b2-4887-983d-bc62c21a2b3c,1.0,8442dd86-17b2-4887-983d-bc62c21a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19
4,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,,,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1.0,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,2023-04-22 02:04:19
...,...,...,...,...,...,...,...,...,...
107525,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1.0,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:38:07
107526,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2.0,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 02:54:38
107527,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1.0,95c9270c-e6db-4520-823a-447aee1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:36:59
107528,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2.0,c05f4ba5-8830-4be5-808c-8706e81a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:31:12


In [15]:
df = df.merge(season_df[["id", "category"]], left_on="season_id", right_on="id", how="left")
df

Unnamed: 0,id_user,user_id,interests,voter_id,video_id,upvotes,id_video,season_id,created_at,id,category
0,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,12.0,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-14 17:18:31,43fda735-831b-41e7-9100-39657e1a2b3c,OTHERS
1,00009c76-592e-46a3-8754-59d68e1a2b3c,,,00009c76-592e-46a3-8754-59d68e1a2b3c,c7716a3c-5a03-4563-8364-823ebd1a2b3c,1.0,c7716a3c-5a03-4563-8364-823ebd1a2b3c,43fda735-831b-41e7-9100-39657e1a2b3c,2023-06-15 06:24:14,43fda735-831b-41e7-9100-39657e1a2b3c,OTHERS
2,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,,,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,fd4e8fd6-9524-4790-b325-7170411a2b3c,1.0,fd4e8fd6-9524-4790-b325-7170411a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,DANCE
3,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,,,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,8442dd86-17b2-4887-983d-bc62c21a2b3c,1.0,8442dd86-17b2-4887-983d-bc62c21a2b3c,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,2023-04-22 02:04:19,46b1f1e7-b2ab-4e16-a275-fb63081a2b3c,DANCE
4,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,,,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,1.0,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,2023-04-22 02:04:19,046a6b76-bb19-4619-bbf5-2ad9e71a2b3c,DANCE
...,...,...,...,...,...,...,...,...,...,...,...
107525,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,1.0,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:38:07,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
107526,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,2.0,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 02:54:38,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
107527,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,95c9270c-e6db-4520-823a-447aee1a2b3c,1.0,95c9270c-e6db-4520-823a-447aee1a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:36:59,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY
107528,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,c05f4ba5-8830-4be5-808c-8706e81a2b3c,2.0,c05f4ba5-8830-4be5-808c-8706e81a2b3c,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,2023-05-02 03:31:12,9f6786f4-fc20-45c2-a8c5-944e941a2b3c,STYLE&BEAUTY


In [16]:
end_date = df["created_at"].max() - timedelta(weeks=2)
train = df[df["created_at"] < end_date][["id_user", "interests", "video_id", "category", "upvotes"]] 
train

Unnamed: 0,id_user,interests,video_id,category,upvotes
0,00009c76-592e-46a3-8754-59d68e1a2b3c,,00033275-cbd8-4ee5-8c1f-7a50491a2b3c,OTHERS,12.0
1,00009c76-592e-46a3-8754-59d68e1a2b3c,,c7716a3c-5a03-4563-8364-823ebd1a2b3c,OTHERS,1.0
2,000233b9-d92e-4c07-9bf7-f5d8911a2b3c,,fd4e8fd6-9524-4790-b325-7170411a2b3c,DANCE,1.0
3,0002a16e-70bc-4fb9-9fd8-2199841a2b3c,,8442dd86-17b2-4887-983d-bc62c21a2b3c,DANCE,1.0
4,0002bcf0-3a0e-4850-81a9-7961d61a2b3c,,ebe8ccd7-caa4-40d5-bcbd-1d21c11a2b3c,DANCE,1.0
...,...,...,...,...,...
107525,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,67dad23b-08c9-4097-98ac-dfd13d1a2b3c,STYLE&BEAUTY,1.0
107526,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,8ae653d5-ba1c-4a17-92ce-f973681a2b3c,STYLE&BEAUTY,2.0
107527,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,95c9270c-e6db-4520-823a-447aee1a2b3c,STYLE&BEAUTY,1.0
107528,ffff7f3c-c253-45e0-a881-e7988c1a2b3c,,c05f4ba5-8830-4be5-808c-8706e81a2b3c,STYLE&BEAUTY,2.0


In [17]:
enc = OrdinalEncoder(encoded_missing_value=-1)
enc.fit(df[["id_user", "interests", "video_id", "category"]])
train[["id_user", "interests", "video_id", "category"]] = enc.transform(train[["id_user", "interests", "video_id", "category"]])
train

Unnamed: 0,id_user,interests,video_id,category,upvotes
0,0.0,-1.0,0.0,9.0,12.0
1,0.0,-1.0,2043.0,9.0,1.0
2,1.0,-1.0,2580.0,2.0,1.0
3,2.0,-1.0,1354.0,2.0,1.0
4,3.0,-1.0,2423.0,2.0,1.0
...,...,...,...,...,...
107525,44498.0,-1.0,1072.0,11.0,1.0
107526,44498.0,-1.0,1421.0,11.0,2.0
107527,44498.0,-1.0,1547.0,11.0,1.0
107528,44498.0,-1.0,1961.0,11.0,2.0


In [18]:
model = RandomForestClassifier()
model.fit(train.loc[:, train.columns != "upvotes"], train["upvotes"])

In [19]:
test_upvotes = df[df["created_at"] >= end_date][["id_user", "interests", "video_id", "category", "upvotes"]] 
test = df[["id_user", "interests"]].drop_duplicates("id_user").merge(df[["video_id", "category"]].drop_duplicates("category"), how="cross")
test = pd.DataFrame(enc.transform(test), columns=test.columns)
test = pd.merge(test, train[["id_user", "interests", "video_id", "category"]], indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
test

Unnamed: 0,id_user,interests,video_id,category
1,0.0,-1.0,2580.0,2.0
2,0.0,-1.0,1944.0,10.0
3,0.0,-1.0,1419.0,11.0
4,0.0,-1.0,475.0,-1.0
5,0.0,-1.0,1771.0,6.0
...,...,...,...,...
578482,44498.0,-1.0,391.0,1.0
578483,44498.0,-1.0,2081.0,8.0
578484,44498.0,-1.0,7.0,4.0
578485,44498.0,-1.0,37.0,0.0


In [20]:
test["prediction"] = model.predict(test)
test

Unnamed: 0,id_user,interests,video_id,category,prediction
1,0.0,-1.0,2580.0,2.0,1.0
2,0.0,-1.0,1944.0,10.0,1.0
3,0.0,-1.0,1419.0,11.0,1.0
4,0.0,-1.0,475.0,-1.0,1.0
5,0.0,-1.0,1771.0,6.0,1.0
...,...,...,...,...,...
578482,44498.0,-1.0,391.0,1.0,2.0
578483,44498.0,-1.0,2081.0,8.0,1.0
578484,44498.0,-1.0,7.0,4.0,2.0
578485,44498.0,-1.0,37.0,0.0,1.0


In [21]:
test.sort_values("prediction", ascending=False)

Unnamed: 0,id_user,interests,video_id,category,prediction
376247,28942.0,-1.0,2580.0,2.0,18.0
376325,28948.0,-1.0,2580.0,2.0,18.0
376312,28947.0,-1.0,2580.0,2.0,18.0
376221,28940.0,-1.0,2580.0,2.0,18.0
376273,28944.0,-1.0,2580.0,2.0,18.0
...,...,...,...,...,...
212326,16332.0,-1.0,7.0,4.0,1.0
212324,16332.0,-1.0,391.0,1.0,1.0
212323,16332.0,-1.0,2532.0,3.0,1.0
212321,16332.0,-1.0,1771.0,6.0,1.0


In [24]:
test[test["prediction"] == 18]["video_id"].nunique()

1