In [2]:
import boto3
import os
from dotenv import load_dotenv
# Load variables from a local .env file (python-dotenv) before they are read below.
load_dotenv()

# Create the S3 client; credentials and region are read from the environment.
# A variable that is not set is passed through as None.
s3 = boto3.client(
    service_name="s3",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    region_name=os.environ.get("AWS_DEFAULT_REGION"),
)

In [3]:
# Fetch the raw metadata snapshot from the bucket into the working directory.
s3.download_file(
    Bucket='manga-recs',
    Key='raw/2025-12-29/manga_metadata.json',
    Filename='manga_metadata.json',
)

In [4]:
import pandas as pd

# Parse the downloaded JSON file into a DataFrame and preview its first rows.
df = pd.read_json(path_or_buf="manga_metadata.json", encoding="utf-8")
df.head(n=5)

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description
0,{'english': 'Monster'},"[{'category': 'Demographic', 'description': 'T...",97627,162.0,90,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",9895,91,False,30001,18.0,Everyone faces uncertainty at some point in th...
1,{'english': 'Berserk'},"[{'category': 'Theme-Drama', 'description': 'C...",222675,,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",35411,93,False,30002,,"His name is Guts, the Black Swordsman, a feare..."
2,{'english': '20th Century Boys'},"[{'category': 'Theme-Drama', 'description': 'C...",97106,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7656,88,False,30003,22.0,"Humanity, having faced extinction at the end o..."
3,{'english': 'Yokohama Kaidashi Kikou'},"[{'category': 'Theme-Slice of Life', 'descript...",34159,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3072,86,False,30004,14.0,Set hundreds of years in the future after an e...
4,{'english': 'Hajime no Ippo: Fighting Spirit!'},"[{'category': 'Theme-Game-Sport', 'description...",32281,,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3214,87,False,30007,,Makunouchi Ippo has been bullied his entire li...


In [5]:
# 2/2/2026: handle NaNs

# Fraction of rows with no value, for each of the two count columns.
df.loc[:, ["volumes", "chapters"]].isnull().mean()

volumes     0.408163
chapters    0.364431
dtype: float64

Interesting: volumes and chapters have different missing rates (about 41% vs. 36%), so some titles report a chapter count but no volume count.

I think that's still fine: the dataset is large, so it should be okay to impute the missing values from the remaining (non-missing) values.

In [6]:
# Reference only: this expression is not rendered because it is not the last
# one in the cell. The author recorded the mean as 133 chapters / 15 volumes.
df[["volumes", "chapters"]].mean()

# Impute missing counts with the column *median* (not the mean noted above) and
# keep a boolean "<col>_missing" flag so imputed rows stay identifiable.
# NOTE(review): the recorded head() output shows 133.0 / 15.0 as fill values,
# which matches the quoted mean -- confirm whether the median really equals
# those numbers or whether the saved output predates the switch to median.
for col in ["volumes", "chapters"]:
    flag = col + "_missing"
    was_missing = df[col].isna()
    # On a re-run the NaNs are already filled, so isna() alone would reset
    # every flag to False. OR with the existing flag to keep the cell idempotent.
    if flag in df.columns:
        was_missing = was_missing | df[flag].astype(bool)
    df[flag] = was_missing
    # fillna is a no-op on a re-run because no NaNs remain.
    df[col] = df[col].fillna(df[col].median())

df.head()


Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,volumes_missing,chapters_missing
0,{'english': 'Monster'},"[{'category': 'Demographic', 'description': 'T...",97627,162.0,90,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",9895,91,False,30001,18.0,Everyone faces uncertainty at some point in th...,False,False
1,{'english': 'Berserk'},"[{'category': 'Theme-Drama', 'description': 'C...",222675,133.0,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",35411,93,False,30002,15.0,"His name is Guts, the Black Swordsman, a feare...",True,True
2,{'english': '20th Century Boys'},"[{'category': 'Theme-Drama', 'description': 'C...",97106,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7656,88,False,30003,22.0,"Humanity, having faced extinction at the end o...",False,False
3,{'english': 'Yokohama Kaidashi Kikou'},"[{'category': 'Theme-Slice of Life', 'descript...",34159,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3072,86,False,30004,14.0,Set hundreds of years in the future after an e...,False,False
4,{'english': 'Hajime no Ippo: Fighting Spirit!'},"[{'category': 'Theme-Game-Sport', 'description...",32281,133.0,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3214,87,False,30007,15.0,Makunouchi Ippo has been bullied his entire li...,True,True


In [8]:
import numpy as np


def _date_part(dates, part):
    """Extract one field (e.g. "year") from a column of date dicts as nullable Int64.

    Entries that are not dicts, or whose field is None, become <NA>.
    """
    values = dates.apply(lambda d: d.get(part) if isinstance(d, dict) else pd.NA)
    return values.astype("Int64")


# chapters/volumes are floats (e.g. 162.0); cast them to plain ints so they are
# easier to work with later. This relies on the NaNs having been imputed in the
# earlier cell -- astype(int) raises if any NaN remains.
df["chapters"] = df["chapters"].round().astype(int)
df["volumes"] = df["volumes"].round().astype(int)

# ids must be present and unique. Bare expressions in the middle of a cell are
# never rendered, so these are asserted to make the check actually fire.
assert not df["id"].isna().any(), "found rows with a missing id"
assert not df["id"].duplicated().any(), "found duplicated ids"

# check outliers (printed explicitly: only a cell's last expression is rendered)
print(df[["chapters", "volumes", "popularity", "favourites"]].describe(percentiles=[.01, .05, .95, .99]))

# log-transform popularity and favourites to reduce skew;
# the original columns are kept for now in case they are needed later
df["popularity_log"] = np.log1p(df["popularity"])
df["favourites_log"] = np.log1p(df["favourites"])

# Dates: startDate and endDate are dicts, so pull the actual year and month
# out into their own nullable-integer columns.
for prefix, source in (("start", "startDate"), ("end", "endDate")):
    for part in ("year", "month"):
        df[f"{prefix}_{part}"] = _date_part(df[source], part)

# A title is treated as finished when an end year is recorded.
df["is_finished"] = df["end_year"].notna()
df["tags"].head()

0    [{'category': 'Demographic', 'description': 'T...
1    [{'category': 'Theme-Drama', 'description': 'C...
2    [{'category': 'Theme-Drama', 'description': 'C...
3    [{'category': 'Theme-Slice of Life', 'descript...
4    [{'category': 'Theme-Game-Sport', 'description...
Name: tags, dtype: object