In [1]:
import boto3
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv, then build the S3 client
# from the AWS credentials/region found in the process environment.
load_dotenv()

s3 = boto3.client(
    "s3",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    region_name=os.environ.get("AWS_DEFAULT_REGION"),
)

In [4]:
# Local destination for the raw files (created if it is missing)
folder = "../data/raw"
os.makedirs(folder, exist_ok=True)

# Where each raw file lands on disk
manga_file_path = os.path.join(folder, 'manga_metadata.json')
user_file_path = os.path.join(folder, 'user_readdata.json')

# Fetch both objects from the same bucket: (S3 key, local destination)
for s3_key, local_path in (
    ('raw/2026-02-09/manga_metadata.json', manga_file_path),
    ('raw/2026-02-09/user_readdata.json', user_file_path),
):
    s3.download_file(Bucket='manga-recs', Key=s3_key, Filename=local_path)


# Manga metadata

In [19]:
import pandas as pd

# Load the raw manga metadata JSON (read as UTF-8) into a DataFrame
manga_metadata_path = "../data/raw/manga_metadata.json"
manga_df = pd.read_json(manga_metadata_path, encoding="utf-8")

# Helper: extract english title from nested title dicts or return string as-is
def extract_english_title(title):
    """Return a lowercased title string, or None when no usable title exists.

    Parameters
    ----------
    title : dict or str or any
        A title mapping whose 'english', 'romaji' and 'native' keys are
        tried in that order, or a plain title string. Any other type
        (None, NaN, numbers, ...) yields None.

    Returns
    -------
    str or None
        The first non-empty text value found, lowercased; None otherwise.
    """
    if isinstance(title, str):
        return title.lower()

    if isinstance(title, dict):
        # Prefer 'english', then fall back to the other keys in order.
        for key in ('english', 'romaji', 'native'):
            candidate = title.get(key)
            # Skip missing/empty values and anything that is not text:
            # calling .lower() on e.g. a number would raise and abort the
            # whole column transform.
            if isinstance(candidate, str) and candidate:
                return candidate.lower()

    return None

# Helper: extract tag names (lowercased) from list of tag dicts
def extract_tag_names(tags):
    """Return the lowercased tag names found in ``tags``.

    Parameters
    ----------
    tags : list or str or any
        A list of tag dicts (each read through its 'name' key), or a
        single tag given as a plain string. Any other type yields [].

    Returns
    -------
    list of str
        Lowercased names in their original order. Empty when there are no
        tags or none of the entries carries a usable name.
    """
    # A single string is treated as one tag.
    if isinstance(tags, str):
        return [tags.lower()]

    if isinstance(tags, list):
        # Keep only dict entries whose 'name' is non-empty text; a non-string
        # name would otherwise raise on .lower() and abort the whole column
        # transform.
        return [
            entry['name'].lower()
            for entry in tags
            if isinstance(entry, dict)
            and isinstance(entry.get('name'), str)
            and entry['name']
        ]

    # No tags available.
    return []

# Derive a "<column>_clean" column for each raw column using its cleaner.
# title_clean is None wherever no title could be extracted.
column_cleaners = {
    'title': extract_english_title,
    'tags': extract_tag_names,
}
for raw_column, cleaner in column_cleaners.items():
    manga_df[f'{raw_column}_clean'] = manga_df[raw_column].apply(cleaner)

# Preview the raw columns side by side with their cleaned versions
preview_columns = ['title', 'title_clean', 'tags', 'tags_clean']
manga_df[preview_columns].head()

Unnamed: 0,title,title_clean,tags,tags_clean
0,{'english': 'Monster'},monster,"[{'name': 'Seinen', 'category': 'Demographic',...","[seinen, conspiracy, philosophy, crime, traged..."
1,{'english': 'Berserk'},berserk,"[{'name': 'Tragedy', 'category': 'Theme-Drama'...","[tragedy, seinen, revenge, male protagonist, d..."
2,{'english': '20th Century Boys'},20th century boys,"[{'name': 'Coming of Age', 'category': 'Theme-...","[coming of age, ensemble cast, politics, seine..."
3,{'english': 'Yokohama Kaidashi Kikou'},yokohama kaidashi kikou,"[{'name': 'Iyashikei', 'category': 'Theme-Slic...","[iyashikei, female protagonist, post-apocalypt..."
4,{'english': 'Hajime no Ippo: Fighting Spirit!'},hajime no ippo: fighting spirit!,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...","[boxing, shounen, male protagonist, primarily ..."


In [23]:
# Number of missing values in each column of the manga frame
null_counts = manga_df.isna().sum()
null_counts

title             0
tags              0
popularity        0
chapters        326
averageScore      0
startDate         0
endDate           0
genres            0
favourites        0
meanScore         0
isAdult           0
id                0
volumes         388
description       0
title_clean      81
tags_clean        0
dtype: int64

In [25]:
# Rows for which no cleaned title could be extracted
missing_title_mask = manga_df['title_clean'].isna()
null_rows = manga_df.loc[missing_title_mask]
null_rows

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,title_clean,tags_clean
79,{'english': None},"[{'name': 'Martial Arts', 'category': 'Theme-A...",14421,338.0,74,"{'month': 11, 'year': 1998}","{'month': 1, 'year': 2015}","[Action, Drama, Psychological, Sports]",486,75,False,30727,34.0,At the age of sixteen Ryo Narushima was a geni...,,"[martial arts, male protagonist, anti-hero, re..."
88,{'english': None},"[{'name': 'Surreal Comedy', 'category': 'Theme...",16330,89.0,80,"{'month': 2, 'year': 1993}","{'month': 3, 'year': 2000}","[Action, Comedy, Romance]",635,80,False,30838,15.0,Seiichirou Kitano is a naive and kind boy with...,,"[surreal comedy, delinquents, shounen, school,..."
95,{'english': None},"[{'name': 'Baseball', 'category': 'Theme-Game-...",10691,233.0,82,"{'month': 6, 'year': 1998}","{'month': 11, 'year': 2003}","[Comedy, Drama, Sports]",407,83,False,30915,24.0,Koichi Kawato is the new Japanese teacher at t...,,"[baseball, delinquents, coming of age, shounen..."
100,{'english': None},"[{'name': 'Martial Arts', 'category': 'Theme-A...",14325,584.0,76,"{'month': 8, 'year': 2002}","{'month': 9, 'year': 2014}","[Action, Comedy, Drama, Ecchi]",538,77,False,30988,61.0,"The story is focused on Kenichi Shirahama, an ...",,"[martial arts, shounen, delinquents, male prot..."
104,{'english': None},"[{'name': 'Ensemble Cast', 'category': 'Cast-M...",10359,178.0,81,"{'month': 10, 'year': 2001}","{'month': 5, 'year': 2017}","[Adventure, Drama, Horror, Mystery, Psychologi...",535,83,False,31075,35.0,"Scientists had predicted this disaster, and in...",,"[ensemble cast, female protagonist, survival, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,{'english': None},"[{'name': 'Time Skip', 'category': 'Setting-Ti...",12505,3.0,83,"{'month': 2, 'year': 2021}","{'month': 2, 'year': 2021}",[Comedy],636,83,False,129574,,A Twitter comic by Ishida Sui about a gorilla ...,,"[time skip, tragedy, meta, animals, philosophy..."
751,{'english': None},"[{'name': 'Crime', 'category': 'Theme-Other', ...",25997,,82,"{'month': 2, 'year': 2023}","{'month': None, 'year': None}","[Action, Adventure, Supernatural]",824,82,False,138072,,The stage is set on a volcanic island in the P...,,"[crime, super power, foreign, male protagonist..."
758,{'english': None},"[{'name': 'Time Loop', 'category': 'Theme-Sci-...",10783,172.0,82,"{'month': 5, 'year': 2021}","{'month': 7, 'year': 2024}","[Comedy, Fantasy, Romance]",745,83,False,138705,,Na Kang-Rim is a high school student who gener...,,"[time loop, female harem, school, full color, ..."
809,{'english': None},"[{'name': 'Tsundere', 'category': 'Cast-Traits...",13050,106.0,73,"{'month': 7, 'year': 2022}","{'month': 8, 'year': 2023}","[Comedy, Ecchi, Romance, Slice of Life]",377,74,False,152855,2.0,A tsundere girl getting less and less tsun day...,,"[tsundere, heterosexual, female protagonist, p..."


# User read history

In [8]:
# Load the raw user read data JSON (read as UTF-8) and preview the first rows
user_readdata_path = "../data/raw/user_readdata.json"
user_df = pd.read_json(user_readdata_path, encoding="utf-8")
user_df.head()

Unnamed: 0,mediaId,userId,user,progress,progressVolumes,score,notes,priority,private,repeat,status
0,30698,1,{'name': 'Josh'},40,6,3,,0,False,0,PAUSED
1,33500,1,{'name': 'Josh'},7,1,1,,0,False,0,PAUSED
2,35178,1,{'name': 'Josh'},6,0,2,,0,False,0,PAUSED
3,31158,1,{'name': 'Josh'},0,0,0,,0,False,0,PLANNING
4,53390,1,{'name': 'Josh'},64,10,2,,0,False,0,DROPPED
