In [None]:
import ast
import re

import numpy as np
import pandas as pd
import scipy
import scipy.signal as signal
import seaborn as sns
import torch
from sklearn.preprocessing import scaler

In [2]:
# Read the media and user tables from the CSV files in ../data.
media_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/media.csv", "../data/users.csv")
)

In [3]:
def _clean_description(text):
    """Flatten newlines and remove ``<...>`` tags from one description value.

    Args:
        text: A description cell. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The description with newlines replaced by spaces, every ``<...>``
        span removed, and leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return text
    # Replace newlines before removing tags: "." in the pattern does not match
    # "\n", so a tag containing a line break would otherwise be left in place.
    flattened = text.replace("\n", " ")
    return re.sub(r"<.*?>", "", flattened).strip()


# Keep anime rows that have a description. The explicit copy makes media_df an
# independent frame, so the column assignment below does not raise pandas'
# SettingWithCopyWarning for writing to a filtered slice.
media_df = media_df[
    (media_df["type"] == "ANIME") & media_df["description"].notna()
].copy()
media_df["description"] = media_df["description"].apply(_clean_description)

In [4]:
# List the column labels currently present on media_df.
media_df.columns

Index(['id', 'title', 'description', 'genres', 'tags', 'isAdult', 'format',
       'meanScore', 'popularity', 'relations', 'startDate', 'endDate',
       'season', 'updatedAt', 'stats', 'type'],
      dtype='object')

In [5]:
# Preview the first row of media_df to see the raw value stored in each column.
media_df.head(1)

Unnamed: 0,id,title,description,genres,tags,isAdult,format,meanScore,popularity,relations,startDate,endDate,season,updatedAt,stats,type
0,1,"{'native': 'カウボーイビバップ', 'romaji': 'Cowboy Bebo...","Enter a world in the distant future, where Bou...","['Action', 'Adventure', 'Drama', 'Sci-Fi']","[{'id': 63, 'name': 'Space', 'category': 'Sett...",False,TV,86.0,400454,"{'nodes': [{'id': 5, 'title': {'english': ""Cow...",{'year': 1998},{'year': 1999},SPRING,1751556013,"{'scoreDistribution': [{'amount': 549, 'score'...",ANIME


In [None]:
"""
id --> used in vector db to identify anime
title --> not used
description --> N most recently watched anime descriptions are avg pooled to get a single anime embedding vector
genres --> combine with tags
tags --> combine with genres -> embed each tag/genre and avg pool to get a single feature vector
isAdult --> maybe use number of adult anime as a feature
format --> not used?
meanScore --> scale to 0-1 and use as a feature
popularity --> scale to 0-1 and use as a feature
relations --> could be used to share embeddings between related anime
startDate --> not used?
endDate --> not used?
season --> not used?
updatedAt --> not used?
stats --> could use softmax to create a feature vector of a cumulative score distribution of watched anime
type --> only used to filter for anime
"""

In [6]:
def _score_distribution(raw_stats):
    """Return the ``scoreDistribution`` entry of one ``stats`` cell, or ``None``.

    Args:
        raw_stats: A ``stats`` cell. Strings are parsed with
            ``ast.literal_eval`` first.

    Returns:
        The value stored under ``"scoreDistribution"`` when the cell is a
        dict, the cell itself when it is already a list, and ``None`` for
        anything else (missing key, NaN, ``None``).
    """
    if isinstance(raw_stats, str):
        raw_stats = ast.literal_eval(raw_stats)
    if isinstance(raw_stats, list):
        # Already extracted, e.g. when this cell is re-run on processed data.
        return raw_stats
    if not isinstance(raw_stats, dict):
        # Missing cells (float NaN / None) cannot be indexed by key.
        return None
    return raw_stats.get("scoreDistribution")


# assign() builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
media_df = media_df.assign(stats=media_df["stats"].apply(_score_distribution))

# Drop rows that have no score distribution.
media_df = media_df[media_df["stats"].notna()]

In [None]:
def _parse_list_or_none(raw):
    """Parse one ``genres``/``tags`` cell, mapping empty or missing to ``None``.

    Args:
        raw: A cell value. Strings are parsed with ``ast.literal_eval``;
            other values are used as they are.

    Returns:
        The parsed value when it has a non-zero length, otherwise ``None``.
        Values without a length (float NaN, ``None``) also yield ``None``.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    try:
        return raw if len(raw) > 0 else None
    except TypeError:
        # NaN / None have no len(); treat them like an empty list.
        return None


# assign() builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning.
media_df = media_df.assign(
    genres=media_df["genres"].apply(_parse_list_or_none),
    tags=media_df["tags"].apply(_parse_list_or_none),
)

In [None]:
def combine_genres_tags(row):
    """Merge a row's genres and tag names into one de-duplicated label list.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``"genres"``
            entry that is an iterable of genre names or ``None``, and a
            ``"tags"`` entry that is an iterable of dicts with a ``"name"``
            key or ``None``.

    Returns:
        A sorted list of the unique genre and tag names, or ``None`` when the
        row has neither.
    """
    labels = set()
    if row["genres"] is not None:
        labels.update(row["genres"])
    if row["tags"] is not None:
        labels.update(tag["name"] for tag in row["tags"])

    # Sort so the output is reproducible: iterating a set of strings follows
    # hash order, which can differ between interpreter runs.
    return sorted(labels) or None


# Compute the merged genre/tag labels row by row and store them in a new column.
media_df["genres_tags"] = media_df.apply(func=combine_genres_tags, axis="columns")

# Keep only the rows for which a label list was produced.
media_df = media_df.loc[media_df["genres_tags"].notna()]

In [None]:
# Keep only rows whose format is one of the listed release formats.
media_df = media_df.loc[
    media_df["format"].isin(
        [
            "TV",
            "MOVIE",
            "OVA",
            "TV_SHORT",
            "SPECIAL",
            "ONA",
        ]
    )
]

In [55]:
# List the column labels present on media_df at this point in the notebook.
media_df.columns

Index(['id', 'title', 'description', 'genres', 'tags', 'isAdult', 'format',
       'meanScore', 'popularity', 'relations', 'startDate', 'endDate',
       'season', 'updatedAt', 'stats', 'type', 'genres_tags'],
      dtype='object')

In [56]:
# Drop every row that lacks a meanScore or a popularity value.
media_df = media_df.dropna(subset=["meanScore", "popularity"], how="any")