In [None]:
import html
import logging
import random
from typing import Final

import numpy as np
import pandas as pd
from IPython.core.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from rarc_utils.log import setup_logger, LOG_FMT
from youtube_recommender.io_methods import io_methods
from youtube_recommender.settings import VIDEOS_PATH, YOUTUBE_VIDEO_PREFIX

# Notebook-wide logger built by the rarc_utils helper.
# NOTE(review): cmdLevel is presumably the console log level and the 0/1
# arguments on/off flags -- confirm against rarc_utils.log.setup_logger.
logger = setup_logger(
    cmdLevel=logging.INFO,
    saveFile=0,
    savePandas=1,
    color=1,
    fmt=LOG_FMT,
)


### 1.0 Get data

In [None]:
# Load the videos table from VIDEOS_PATH via the project's feather helper.
# NOTE(review): the result is presumably a pandas DataFrame (later cells call
# .sample / .groupby on it); the meaning of the "video" argument is defined by
# io_methods.load_feather -- confirm there.
df = io_methods.load_feather(VIDEOS_PATH, "video")


In [None]:
# Peek at the description of the first row. `.iloc[0]` is positional, so it
# works whatever the index labels are (`[0]` would be a label lookup and fails
# or falls back to deprecated positional access when label 0 is absent).
print(df["description"].iloc[0])


### 1.1 Define labels

### 1.2 Split test-train-validate dataset
Make sure to keep a balanced distribution of videos per channel.

In [None]:
# %%black
FRAC: Final[float] = 0.2
# Fixed seed so that a fresh "Restart & Run All" reproduces the same shuffle
# and the same samples.
SEED: Final[int] = 42
HEAD: Final[int] = 6

# reshuffle entire dataset
df = df.sample(frac=1, random_state=SEED)
# naive sample: FRAC of all rows, drawn without regard to channel
reshuffled_sample = df.sample(frac=FRAC, random_state=SEED)
by_channel = df.groupby("channel_id")


def sample_data(group, frac: float, random_state=None):
    """Return a random sample containing ``frac`` of the rows of ``group``.

    Args:
        group: object exposing pandas' ``sample(frac=..., random_state=...)``
            method, e.g. the rows of a single channel.
        frac: fraction of rows to draw.
        random_state: optional seed forwarded to ``sample``. The default
            ``None`` leaves the draw unseeded, as before.

    Returns:
        Whatever ``group.sample`` returns for the given arguments.
    """
    return group.sample(frac=frac, random_state=random_state)


# Stratified sample: draw FRAC of the rows of every channel.
# `GroupBy.sample` keeps every column (including `channel_id`) and the original
# row index. The previous `by_channel.apply(sample_data, ...)` relied on the
# grouping column being passed to the callable, which pandas >= 2.2 deprecates;
# `sample_data` stays defined above for per-group use.
balanced_sample = by_channel.sample(frac=FRAC, random_state=SEED)

# now you can see that the sample is balanced by channel_id
view = pd.DataFrame(
    {
        "dataset": by_channel["video_id"].count(),
        "sample_20": balanced_sample.groupby("channel_id")["video_id"].count(),
        "reshuffle_20": reshuffled_sample.groupby("channel_id")["video_id"]
        .count()
        .sort_index(),
    }
)
print("dataset: \n\n", view.head(HEAD))
# print(f"\nshould be {FRAC:.2%} sample: \n\n", view.sample_20.head(HEAD))
# print(f"\nshould be avg {FRAC:.2%} sample: \n\n", view.reshuffle_20.head(HEAD))
# a bit more unbalanced, but still OK


In [None]:
# add dependent variable column
# df["is_educational"] = 1
# super lazy data scientist way: assume that FreeCodeCamp videos are courses
# and let the model figure out if other videos are like that
COURSE_CHANNEL_ID: Final[str] = "UC8butISFwT-Wl7EV0hUK0BQ"
TOP_CHANNELS: Final[int] = 15

# per channel: number of videos and the last channel name seen in the group,
# ordered from the channel with the most videos downwards
channel_stats = by_channel[["video_id", "channel_name"]].agg(
    {"video_id": "count", "channel_name": "last"}
)
headTopChannels = channel_stats.sort_values("video_id", ascending=False).head(
    TOP_CHANNELS
)
# print(headTopChannels)

# label: 1 for every video of the course channel, 0 for all others
from_course_channel = df.channel_id == COURSE_CHANNEL_ID
df["is_course"] = np.where(from_course_channel, 1, 0)
df.is_course.sum()


In [None]:
# Display the 15 channels with the most videos (built in the labelling cell).
headTopChannels

In [None]:
# TODO: tokenize the title and description

# TODO: manually label videos as tutorial / course, or find a dataset that did this

In [None]:
# split train, test, validate
ratio_train: Final[float] = 0.7
ratio_val: Final[float] = 0.15
ratio_test: Final[float] = 0.15
# The three shares must add up to exactly one dataset. `isclose` absorbs float
# rounding while still rejecting sums that are too large as well as too small
# (the previous `> 0.99` check let e.g. 1.5 through).
assert np.isclose(
    ratio_train + ratio_val + ratio_test, 1.0
), "ratio_train + ratio_val + ratio_test must sum to 1"

# Produces test split.
# x_remaining, x_test, y_remaining, y_test = train_test_split(x, y, test_size=ratio_test)


In [None]:
# train_test_split?


In [None]:
# TODO: start with simplest model: binary logistic regression that predicts `is_educational`, then try multinomial logistic regression later


### Open video in browser

In [None]:
def visit_url(df: pd.DataFrame, index: int) -> str:
    """Build the URL of the video stored at row position ``index``.

    The function only assembles the address; it does not open a browser.

    Args:
        df: frame with a ``video_id`` column.
        index: positional (``iloc``) row number of the video.

    Returns:
        ``YOUTUBE_VIDEO_PREFIX`` followed by the row's ``video_id``.
    """
    selected_row = df.iloc[index]
    return YOUTUBE_VIDEO_PREFIX + selected_row.video_id

In [None]:
# sel_index: Final[int] = 0
# pick a random row position in df
sel_index: Final[int] = random.randrange(len(df))
sel_title: Final[str] = df.iloc[sel_index].title
# TODO: not working in chrome? always opens new tab
reuseTab: Final[bool] = True

url: Final[str] = visit_url(df, sel_index)

# open url in browser
targetStr: Final[str] = "_self" if reuseTab else "_blank"
# The title is free text and may contain <, >, & or quotes; escape it (and the
# URL, which sits inside a single-quoted attribute) so the link markup cannot
# be broken or injected into.
safeUrl: Final[str] = html.escape(url, quote=True)
safeTitle: Final[str] = html.escape(str(sel_title))
htmlStr: Final[str] = (
    f"<a href='{safeUrl}' target='{targetStr}'> Open YouTube video: {safeTitle}</a>"
)
# print(htmlStr)
display(HTML(htmlStr))