In [1]:
import pickle
from bertopic import BERTopic
import numpy as np
import pandas as pd
import xgboost as xgb


# --- Trained artefacts -------------------------------------------------------
# XGBoost classifier: create an empty estimator, then restore its saved state.
xg = xgb.XGBClassifier()
xg.load_model('xgboost_model.json')

# BERTopic model used to turn transcript text into model features.
topic_model = BERTopic.load('TopicModel')

# --- Data ---------------------------------------------------------------------
# Transcript rows to be scored in this notebook.
testing_df = pd.read_feather('test.feather')

In [2]:
# Discard transcript rows that carry no video id.
testing_df.dropna(subset=['videoID'], inplace=True)

# Every row is scored on a forward-looking window: the row itself plus the
# next `n` rows, i.e. n + 1 rows in total (fewer at the very end of the frame).
# NOTE(review): the window is positional over the whole frame, so the last rows
# of one video pick up text and labels from the first rows of the next video.
# Left as-is because the features the classifier was trained on are not visible
# from this notebook -- confirm whether windows should be built per videoID.
n = 5
window = n + 1

# Pull the two columns out once as plain lists. Slicing a list is much cheaper
# than a pandas column lookup + `.iloc` slice per row, and this frame has over
# a million rows. The resulting values are the same.
row_texts = testing_df['text'].tolist()
row_labels = testing_df['sponsored'].tolist()
row_count = len(testing_df)

# Window text: the texts in the window joined with single spaces.
testing_df['combined_text'] = [' '.join(row_texts[i:i + window]) for i in range(row_count)]
# Window label: fraction of rows in the window that are flagged as sponsored.
testing_df['expected'] = [np.mean(row_labels[i:i + window]) for i in range(row_count)]

texts = testing_df['combined_text'].tolist()
# NOTE(review): the `expected` *variable* is the raw per-row 0/1 flag, not the
# windowed 'expected' *column* computed above -- confirm this is intended.
expected = testing_df['sponsored'].astype(int).tolist()

In [3]:
# transform() returns a pair; only the second element is used downstream as the
# classifier's feature input (presumably per-topic scores -- TODO confirm).
topic_ids, topics = topic_model.transform(texts)

Batches:   0%|          | 0/36924 [00:00<?, ?it/s]

2025-04-04 20:08:49,098 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [4]:
from sklearn.metrics import root_mean_squared_error

# Decision cut-off; set below 0.5 so that more rows are classified as positive.
threshold = 0.3

# Column 1 of predict_proba holds the probability of the positive class.
class_probabilities = xg.predict_proba(topics)
y_pred_prob = class_probabilities[:, 1]

# Hard 0/1 labels: 1 wherever the probability is strictly above the cut-off.
above_cutoff = y_pred_prob > threshold
y_pred = above_cutoff.astype(int)

# Both inputs are 0/1 here, so this RMSE is the square root of the error rate.
rmse = root_mean_squared_error(expected, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.2701


In [5]:
# Store the hard 0/1 label and the underlying positive-class probability
# next to each row (column order: 'prediction', then 'prediction_raw').
testing_df = testing_df.assign(
    prediction=y_pred,
    prediction_raw=y_pred_prob,
)

In [6]:
# Display the scored frame (pandas truncates the rendering of long frames).
testing_df

Unnamed: 0,videoID,UUID,category,start,duration,text,sponsored,text_without_stopwords,combined_text,expected,prediction,prediction_raw
807,--bcgPU87Hk,661fb9820d57f249502270489a6c8532442a9e9c205e45...,sponsor,0.03,2.820,this video sponsored by sheets and,True,video sponsored sheets,this video sponsored by sheets and giggles mor...,0.500000,0,0.125202
808,--bcgPU87Hk,661fb9820d57f249502270489a6c8532442a9e9c205e45...,sponsor,1.50,3.990,giggles more on them at the end of the,True,giggles end,giggles more on them at the end of the sketch ...,0.333333,0,0.005730
809,--bcgPU87Hk,661fb9820d57f249502270489a6c8532442a9e9c205e45...,sponsor,2.85,3.390,sketch hey hey man hey hey you look,True,sketch hey hey man hey hey look,sketch hey hey man hey hey you look different ...,0.166667,0,0.220844
810,--bcgPU87Hk,,sponsor,5.49,2.369,different,False,different,different oh yeah yeah see im actually not cov...,0.000000,0,0.181572
811,--bcgPU87Hk,,sponsor,6.24,3.029,oh yeah yeah see im actually not,False,oh yeah yeah see im actually,oh yeah yeah see im actually not covered in fi...,0.000000,0,0.010176
...,...,...,...,...,...,...,...,...,...,...,...,...
5837851,zzVwq6T5xo0,af202ba338583ffc82b7ec43ef59c84581071466206398...,sponsor,784.37,2.580,it always hooks the layman and well see,True,always hooks layman well see,it always hooks the layman and well see you gu...,0.600000,0,0.022876
5837852,zzVwq6T5xo0,af202ba338583ffc82b7ec43ef59c84581071466206398...,sponsor,785.78,3.620,you guys next time,True,guys next time,you guys next time leymah nap music you,0.500000,0,0.031211
5837853,zzVwq6T5xo0,af202ba338583ffc82b7ec43ef59c84581071466206398...,sponsor,786.95,2.450,leymah nap,True,leymah nap,leymah nap music you,0.333333,0,0.041813
5837854,zzVwq6T5xo0,,sponsor,790.88,15.149,music,False,music,music you,0.000000,0,0.050317


In [7]:
# Subset that already carries the LLM results; its 'combined_text' column is
# read from the file rather than rebuilt here.
subset_df = pd.read_feather('llm_results_with_deepseek.feather')

# Rebind the shared names so the scoring cell below works on the subset.
expected = subset_df['sponsored'].astype(int).tolist()
texts = subset_df['combined_text'].tolist()

# Only the second element of transform()'s result is fed to the classifier.
topic_ids, topics = topic_model.transform(texts)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2025-04-04 20:08:53,512 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [8]:
# Decision cut-off; set below 0.5 so that more rows are classified as positive.
threshold = 0.3

# Column 1 of predict_proba holds the probability of the positive class.
class_probabilities = xg.predict_proba(topics)
y_pred_prob = class_probabilities[:, 1]

# Hard 0/1 labels: 1 wherever the probability is strictly above the cut-off.
above_cutoff = y_pred_prob > threshold
y_pred = above_cutoff.astype(int)

# Both inputs are 0/1 here, so this RMSE is the square root of the error rate.
rmse = root_mean_squared_error(expected, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.5385


In [9]:
# Add the XGBoost verdict as a boolean column, plus the raw probability
# it was derived from.
subset_df = subset_df.assign(
    xg_prediction=y_pred.astype(bool),
    xg_prediction_raw=y_pred_prob,
)

In [10]:
# Display the subset with the XGBoost columns added.
subset_df

Unnamed: 0,videoID,UUID,category,start,duration,text,sponsored,text_without_stopwords,combined_text,expected,...,granite-3.2-8b-instruct_One_Shot,granite-3.2-8b-instruct_CARP-LESS,gemma-3-27b-it_Zero_Shot,gemma-3-27b-it_One_Shot,gemma-3-27b-it_CARP-LESS,deepseek-r1-distill-qwen-7b_Zero_Shot,deepseek-r1-distill-qwen-7b_One_Shot,deepseek-r1-distill-qwen-7b_CARP-LESS,xg_prediction,xg_prediction_raw
0,th_KQOeh-Co,db67f847cbf9593e5dd0643f4ffaf91ebdee8263a1c56b...,sponsor,7.880,5.509,welcome welcome everyone and congratulations,True,welcome welcome everyone congratulations,welcome welcome everyone and congratulations i...,0.166667,...,True,True,True,True,True,True,True,True,True,0.894055
1,5VWTfXfm47g,f0801d6cffddc590a7732fdb0270fd8d73143ce4146320...,sponsor,300.639,3.521,again thank you to trade coffee for,True,thank trade coffee,again thank you to trade coffee for sponsoring...,0.333333,...,True,True,True,True,True,True,True,False,False,0.011293
2,A0Lkh02_Ik4,,sponsor,777.740,3.810,havent seen gigging like that since i,False,havent seen gigging like since,havent seen gigging like that since i was a fr...,0.000000,...,False,False,True,False,False,True,False,True,False,0.011425
3,sRrEkZ8OqiQ,,sponsor,455.509,3.480,its gone boom and its gone it doesnt,False,gone boom gone doesnt,its gone boom and its gone it doesnt tell me i...,0.000000,...,True,False,True,True,False,True,True,True,True,0.320752
4,5_3BUU9ZmNo,,sponsor,610.060,3.709,if you could just add more hotbar slots it\nwo...,False,could add hotbar slots would cut way amount time,if you could just add more hotbar slots it\nwo...,0.000000,...,False,False,True,False,False,False,False,False,False,0.006862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,6HKI35P-m-w,437c57864759f62683d9851603566f4d8a98ff6167f570...,sponsor,192.540,4.650,verve just by going to verve co to,True,verve going verve co,verve just by going to verve co to jacksfilms...,1.000000,...,True,False,True,True,True,True,True,True,False,0.136493
196,wYWGf2rKTCY,89630073f16e6ab44c414d1557c4e7c467eec05033bf2f...,sponsor,602.000,5.120,by design a guide to elevating your,True,design guide elevating,by design a guide to elevating your drawing sk...,1.000000,...,True,False,True,False,True,True,True,False,False,0.069150
197,oTtYPB5h47E,08a89748af8d746ce205e765091813fa1296db8371661f...,sponsor,620.740,3.570,the charges can either show up as what,True,charges either show,the charges can either show up as what actuall...,1.000000,...,True,False,True,False,False,True,False,False,False,0.209666
198,17oZPYcpPnQ,6cf9b637e525869d190dc86875245563f5aaf77ab860cd...,sponsor,0.599,2.421,this video was made possible by brilliant,True,video made possible brilliant,this video was made possible by brilliant lear...,0.500000,...,True,False,True,True,True,True,True,True,False,0.034510


In [11]:
# Remove identifier / helper columns before the frame is saved.
# Note: re-running this cell raises KeyError because the columns are already gone.
unneeded_columns = ['videoID', 'UUID', 'category', 'text_without_stopwords']
subset_df = subset_df.drop(columns=unneeded_columns)
subset_df

Unnamed: 0,start,duration,text,sponsored,combined_text,expected,hermes-3-llama-3.2-3b_Zero_Shot,hermes-3-llama-3.2-3b_One_Shot,hermes-3-llama-3.2-3b_CARP-LESS,meta-llama-3.1-8b-instruct_Zero_Shot,...,granite-3.2-8b-instruct_One_Shot,granite-3.2-8b-instruct_CARP-LESS,gemma-3-27b-it_Zero_Shot,gemma-3-27b-it_One_Shot,gemma-3-27b-it_CARP-LESS,deepseek-r1-distill-qwen-7b_Zero_Shot,deepseek-r1-distill-qwen-7b_One_Shot,deepseek-r1-distill-qwen-7b_CARP-LESS,xg_prediction,xg_prediction_raw
0,7.880,5.509,welcome welcome everyone and congratulations,True,welcome welcome everyone and congratulations i...,0.166667,True,True,False,True,...,True,True,True,True,True,True,True,True,True,0.894055
1,300.639,3.521,again thank you to trade coffee for,True,again thank you to trade coffee for sponsoring...,0.333333,True,True,False,True,...,True,True,True,True,True,True,True,False,False,0.011293
2,777.740,3.810,havent seen gigging like that since i,False,havent seen gigging like that since i was a fr...,0.000000,True,False,False,,...,False,False,True,False,False,True,False,True,False,0.011425
3,455.509,3.480,its gone boom and its gone it doesnt,False,its gone boom and its gone it doesnt tell me i...,0.000000,,False,False,,...,True,False,True,True,False,True,True,True,True,0.320752
4,610.060,3.709,if you could just add more hotbar slots it\nwo...,False,if you could just add more hotbar slots it\nwo...,0.000000,True,False,False,,...,False,False,True,False,False,False,False,False,False,0.006862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,192.540,4.650,verve just by going to verve co to,True,verve just by going to verve co to jacksfilms...,1.000000,False,,False,,...,True,False,True,True,True,True,True,True,False,0.136493
196,602.000,5.120,by design a guide to elevating your,True,by design a guide to elevating your drawing sk...,1.000000,True,,False,True,...,True,False,True,False,True,True,True,False,False,0.069150
197,620.740,3.570,the charges can either show up as what,True,the charges can either show up as what actuall...,1.000000,False,False,False,False,...,True,False,True,False,False,True,False,False,False,0.209666
198,0.599,2.421,this video was made possible by brilliant,True,this video was made possible by brilliant lear...,0.500000,True,False,False,False,...,True,False,True,True,True,True,True,True,False,0.034510


In [12]:
# Remove identifier / helper columns before the frame is saved.
# Note: re-running this cell raises KeyError because the columns are already gone.
unneeded_columns = ['videoID', 'UUID', 'category', 'text_without_stopwords']
testing_df = testing_df.drop(columns=unneeded_columns)
testing_df

Unnamed: 0,start,duration,text,sponsored,combined_text,expected,prediction,prediction_raw
807,0.03,2.820,this video sponsored by sheets and,True,this video sponsored by sheets and giggles mor...,0.500000,0,0.125202
808,1.50,3.990,giggles more on them at the end of the,True,giggles more on them at the end of the sketch ...,0.333333,0,0.005730
809,2.85,3.390,sketch hey hey man hey hey you look,True,sketch hey hey man hey hey you look different ...,0.166667,0,0.220844
810,5.49,2.369,different,False,different oh yeah yeah see im actually not cov...,0.000000,0,0.181572
811,6.24,3.029,oh yeah yeah see im actually not,False,oh yeah yeah see im actually not covered in fi...,0.000000,0,0.010176
...,...,...,...,...,...,...,...,...
5837851,784.37,2.580,it always hooks the layman and well see,True,it always hooks the layman and well see you gu...,0.600000,0,0.022876
5837852,785.78,3.620,you guys next time,True,you guys next time leymah nap music you,0.500000,0,0.031211
5837853,786.95,2.450,leymah nap,True,leymah nap music you,0.333333,0,0.041813
5837854,790.88,15.149,music,False,music you,0.000000,0,0.050317


In [13]:
# Write both scored frames to disk, full test set first, then the subset.
frames_to_save = (
    (testing_df, 'xg_results.feather'),
    (subset_df, 'subset_full_results.feather'),
)
for frame, out_path in frames_to_save:
    frame.to_feather(out_path)

In [14]:
# Class balance of the test set: number of rows per value of 'sponsored'.
testing_df['sponsored'].value_counts()

sponsored
False    1111801
True       69750
Name: count, dtype: int64

In [15]:
# Ratio of sponsored rows to non-sponsored rows (positives per negative, not
# the share of all rows). Derived from the frame instead of hand-copied counts
# so the figure stays correct if the input data changes.
n_sponsored = int(testing_df['sponsored'].sum())
n_unsponsored = len(testing_df) - n_sponsored
n_sponsored / n_unsponsored

0.06273604718830078