In [1]:
import boto3
import os
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv), then open the S3 client
load_dotenv()

s3 = boto3.client(
    service_name="s3",
    region_name=os.environ.get("AWS_DEFAULT_REGION"),
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

In [2]:
# Local directory that receives the raw exports (created on first run)
folder = "../data/raw"
os.makedirs(folder, exist_ok=True)

# Destination paths on disk
manga_file_path = os.path.join(folder, 'manga_metadata.json')
user_file_path = os.path.join(folder, 'user_readdata.json')

# Fetch each raw export from the bucket: manga metadata first, then user data
for s3_key, local_path in (
    ('raw/2026-02-10/manga_metadata.json', manga_file_path),
    ('raw/2026-02-10/user_readdata.json', user_file_path),
):
    s3.download_file(Bucket='manga-recs', Key=s3_key, Filename=local_path)


# Manga metadata

In [23]:
import pandas as pd

# Load the raw manga metadata export into a DataFrame
manga_df = pd.read_json(
    "../data/raw/manga_metadata.json",
    encoding="utf-8",
)

# Helper: pick a lowercased title from a nested title dict or a plain string
def extract_english_title(title):
    """Return the preferred title in lowercase, or None when none is usable.

    Parameters
    ----------
    title : dict | str | object
        Either a dict that may carry 'english', 'romaji' and 'native'
        entries, or a plain title string. Any other type yields None.

    Returns
    -------
    str | None
        For a string input, the string lowercased. For a dict, the first
        non-empty string found under 'english', then 'romaji', then
        'native', lowercased. None when nothing usable is found.
    """
    if isinstance(title, str):
        return title.lower()

    if isinstance(title, dict):
        # Preference order: english first, then the two fallbacks.
        for key in ('english', 'romaji', 'native'):
            candidate = title.get(key)
            # Skip missing/empty entries and non-string values (e.g. numbers),
            # which have no .lower() and would otherwise raise AttributeError.
            if isinstance(candidate, str) and candidate:
                return candidate.lower()

    return None

# Helper: extract tag names (lowercased) from a list of tag dicts
def extract_tag_names(tags):
    """Return the lowercased tag names found in ``tags``.

    Parameters
    ----------
    tags : list | str | object
        A list whose dict items may carry a 'name' entry, or a single tag
        given as a plain string. Any other type yields an empty list.

    Returns
    -------
    list[str]
        Lowercased names in their original order. A single string input
        becomes a one-item list. An empty list is returned when no usable
        name is present.
    """
    # A bare string is treated as one tag.
    if isinstance(tags, str):
        return [tags.lower()]

    if isinstance(tags, list):
        raw_names = (item.get('name') for item in tags if isinstance(item, dict))
        # Keep only non-empty strings: a non-string name (e.g. a number) has
        # no .lower() and would otherwise raise AttributeError.
        return [name.lower() for name in raw_names if isinstance(name, str) and name]

    # No tags available
    return []

# Derive the cleaned title / tag columns from the raw nested fields
manga_df['title_clean'] = manga_df['title'].apply(extract_english_title)
manga_df['tags_clean'] = manga_df['tags'].apply(extract_tag_names)

# Side-by-side preview of the raw and cleaned values
preview_cols = ['title', 'title_clean', 'tags', 'tags_clean']
manga_df[preview_cols].head()

Unnamed: 0,title,title_clean,tags,tags_clean
0,"{'english': 'Monster', 'romaji': 'MONSTER', 'n...",monster,"[{'name': 'Seinen', 'category': 'Demographic',...","[seinen, conspiracy, philosophy, crime, traged..."
1,"{'english': 'Berserk', 'romaji': 'Berserk', 'n...",berserk,"[{'name': 'Tragedy', 'category': 'Theme-Drama'...","[tragedy, seinen, revenge, male protagonist, d..."
2,"{'english': '20th Century Boys', 'romaji': '20...",20th century boys,"[{'name': 'Coming of Age', 'category': 'Theme-...","[coming of age, ensemble cast, politics, seine..."
3,"{'english': 'Yokohama Kaidashi Kikou', 'romaji...",yokohama kaidashi kikou,"[{'name': 'Iyashikei', 'category': 'Theme-Slic...","[iyashikei, female protagonist, post-apocalypt..."
4,{'english': 'Hajime no Ippo: Fighting Spirit!'...,hajime no ippo: fighting spirit!,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...","[boxing, shounen, male protagonist, primarily ..."


In [24]:
# Count missing values per column
null_counts = manga_df.isna().sum()
null_counts

title             0
tags              0
popularity        0
chapters        326
averageScore      0
startDate         0
endDate           0
genres            0
favourites        0
meanScore         0
isAdult           0
id                0
volumes         388
description       0
title_clean       0
tags_clean        0
dtype: int64

In [25]:
# Preview the first rows of the manga frame, including the cleaned columns
manga_df.head()

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,title_clean,tags_clean
0,"{'english': 'Monster', 'romaji': 'MONSTER', 'n...","[{'name': 'Seinen', 'category': 'Demographic',...",99352,162.0,91,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",10058,91,False,30001,18.0,Everyone faces uncertainty at some point in th...,monster,"[seinen, conspiracy, philosophy, crime, traged..."
1,"{'english': 'Berserk', 'romaji': 'Berserk', 'n...","[{'name': 'Tragedy', 'category': 'Theme-Drama'...",226590,,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",36017,93,False,30002,,"His name is Guts, the Black Swordsman, a feare...",berserk,"[tragedy, seinen, revenge, male protagonist, d..."
2,"{'english': '20th Century Boys', 'romaji': '20...","[{'name': 'Coming of Age', 'category': 'Theme-...",98859,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7780,88,False,30003,22.0,"Humanity, having faced extinction at the end o...",20th century boys,"[coming of age, ensemble cast, politics, seine..."
3,"{'english': 'Yokohama Kaidashi Kikou', 'romaji...","[{'name': 'Iyashikei', 'category': 'Theme-Slic...",34640,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3139,86,False,30004,14.0,Set hundreds of years in the future after an e...,yokohama kaidashi kikou,"[iyashikei, female protagonist, post-apocalypt..."
4,{'english': 'Hajime no Ippo: Fighting Spirit!'...,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...",32860,,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3283,87,False,30007,,Makunouchi Ippo has been bullied his entire li...,hajime no ippo: fighting spirit!,"[boxing, shounen, male protagonist, primarily ..."


In [26]:
# Create is_complete column as 1 (complete) or 0 (not complete)
def has_end_date(end_date):
    """Return 1 when ``end_date`` is a fully populated date dict, else 0.

    Parameters
    ----------
    end_date : dict | object
        Expected to be a dict such as ``{'month': 12, 'year': 2001}``.

    Returns
    -------
    int
        1 if ``end_date`` is a non-empty dict and none of its values is
        None; 0 if any value is None, the dict is empty, or the input is
        not a dict at all (including None).
    """
    # An empty dict carries no date information, so it must not count as
    # complete (a bare all()/any() over no values would say otherwise).
    if not isinstance(end_date, dict) or not end_date:
        return 0
    return int(all(value is not None for value in end_date.values()))

# Store the 0/1 result of has_end_date for every row's endDate
manga_df['is_complete'] = manga_df['endDate'].apply(has_end_date)


In [30]:
# Use -1 as the "unknown" marker for missing chapter and volume counts
count_cols = ['chapters', 'volumes']
manga_df[count_cols] = manga_df[count_cols].fillna(-1)

In [31]:
# Preview the frame again to check the -1 fill and the is_complete column
manga_df.head()

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,title_clean,tags_clean,is_complete
0,"{'english': 'Monster', 'romaji': 'MONSTER', 'n...","[{'name': 'Seinen', 'category': 'Demographic',...",99352,162.0,91,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",10058,91,False,30001,18.0,Everyone faces uncertainty at some point in th...,monster,"[seinen, conspiracy, philosophy, crime, traged...",1
1,"{'english': 'Berserk', 'romaji': 'Berserk', 'n...","[{'name': 'Tragedy', 'category': 'Theme-Drama'...",226590,-1.0,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",36017,93,False,30002,-1.0,"His name is Guts, the Black Swordsman, a feare...",berserk,"[tragedy, seinen, revenge, male protagonist, d...",0
2,"{'english': '20th Century Boys', 'romaji': '20...","[{'name': 'Coming of Age', 'category': 'Theme-...",98859,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7780,88,False,30003,22.0,"Humanity, having faced extinction at the end o...",20th century boys,"[coming of age, ensemble cast, politics, seine...",1
3,"{'english': 'Yokohama Kaidashi Kikou', 'romaji...","[{'name': 'Iyashikei', 'category': 'Theme-Slic...",34640,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3139,86,False,30004,14.0,Set hundreds of years in the future after an e...,yokohama kaidashi kikou,"[iyashikei, female protagonist, post-apocalypt...",1
4,{'english': 'Hajime no Ippo: Fighting Spirit!'...,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...",32860,-1.0,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3283,87,False,30007,-1.0,Makunouchi Ippo has been bullied his entire li...,hajime no ippo: fighting spirit!,"[boxing, shounen, male protagonist, primarily ...",0


In [33]:
def parse_date_to_datetime(date_dict):
    """Convert a ``{'month': ..., 'year': ...}`` dict to a Timestamp.

    The day is always set to the 1st of the month.

    Parameters
    ----------
    date_dict : dict | object
        Expected to be a dict with 'month' and 'year' entries whose values
        are convertible with ``int()``.

    Returns
    -------
    pandas.Timestamp | pandas.NaT
        The first day of the given month, or ``pd.NaT`` when the input is
        not a dict, month or year is missing/falsy, either value cannot be
        converted to an integer, the month is outside 1-12, the year is
        below 1, or pandas cannot represent the date.
    """
    if not isinstance(date_dict, dict):
        return pd.NaT

    month, year = date_dict.get('month'), date_dict.get('year')
    if not (month and year):
        return pd.NaT

    try:
        year_num, month_num = int(year), int(month)
        # Validate before parsing so an impossible month can never be
        # reinterpreted by a permissive date parser.
        if year_num < 1 or not 1 <= month_num <= 12:
            return pd.NaT
        # Zero-padded ISO form keeps the parse unambiguous.
        return pd.to_datetime(f"{year_num:04d}-{month_num:02d}-01")
    except (ValueError, TypeError, OverflowError):
        # Only conversion/parsing failures map to NaT; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return pd.NaT

# Add the parsed start date, then compare it with the raw dict for ten rows
manga_df['startDate_parsed'] = manga_df['startDate'].apply(parse_date_to_datetime)
manga_df.loc[:, ['startDate', 'startDate_parsed']].head(10)

Unnamed: 0,startDate,startDate_parsed
0,"{'month': 12, 'year': 1994}",1994-12-01
1,"{'month': 8, 'year': 1989}",1989-08-01
2,"{'month': 9, 'year': 1999}",1999-09-01
3,"{'month': 4, 'year': 1994}",1994-04-01
4,"{'month': 9, 'year': 1989}",1989-09-01
5,"{'month': 5, 'year': 2003}",2003-05-01
6,"{'month': 2, 'year': 2003}",2003-02-01
7,"{'month': 9, 'year': 1999}",1999-09-01
8,"{'month': 8, 'year': 2001}",2001-08-01
9,"{'month': 7, 'year': 1997}",1997-07-01


In [35]:
# Preview the frame with the startDate_parsed column added
manga_df.head()

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,title_clean,tags_clean,is_complete,startDate_parsed
0,"{'english': 'Monster', 'romaji': 'MONSTER', 'n...","[{'name': 'Seinen', 'category': 'Demographic',...",99352,162.0,91,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",10058,91,False,30001,18.0,Everyone faces uncertainty at some point in th...,monster,"[seinen, conspiracy, philosophy, crime, traged...",1,1994-12-01
1,"{'english': 'Berserk', 'romaji': 'Berserk', 'n...","[{'name': 'Tragedy', 'category': 'Theme-Drama'...",226590,-1.0,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",36017,93,False,30002,-1.0,"His name is Guts, the Black Swordsman, a feare...",berserk,"[tragedy, seinen, revenge, male protagonist, d...",0,1989-08-01
2,"{'english': '20th Century Boys', 'romaji': '20...","[{'name': 'Coming of Age', 'category': 'Theme-...",98859,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7780,88,False,30003,22.0,"Humanity, having faced extinction at the end o...",20th century boys,"[coming of age, ensemble cast, politics, seine...",1,1999-09-01
3,"{'english': 'Yokohama Kaidashi Kikou', 'romaji...","[{'name': 'Iyashikei', 'category': 'Theme-Slic...",34640,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3139,86,False,30004,14.0,Set hundreds of years in the future after an e...,yokohama kaidashi kikou,"[iyashikei, female protagonist, post-apocalypt...",1,1994-04-01
4,{'english': 'Hajime no Ippo: Fighting Spirit!'...,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...",32860,-1.0,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3283,87,False,30007,-1.0,Makunouchi Ippo has been bullied his entire li...,hajime no ippo: fighting spirit!,"[boxing, shounen, male protagonist, primarily ...",0,1989-09-01


In [38]:
# Display the manga frame (the rendered output elides the middle rows)
manga_df

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,endDate,genres,favourites,meanScore,isAdult,id,volumes,description,title_clean,tags_clean,is_complete,startDate_parsed
0,"{'english': 'Monster', 'romaji': 'MONSTER', 'n...","[{'name': 'Seinen', 'category': 'Demographic',...",99352,162.0,91,"{'month': 12, 'year': 1994}","{'month': 12, 'year': 2001}","[Drama, Mystery, Psychological, Thriller]",10058,91,False,30001,18.0,Everyone faces uncertainty at some point in th...,monster,"[seinen, conspiracy, philosophy, crime, traged...",1,1994-12-01
1,"{'english': 'Berserk', 'romaji': 'Berserk', 'n...","[{'name': 'Tragedy', 'category': 'Theme-Drama'...",226590,-1.0,93,"{'month': 8, 'year': 1989}","{'month': None, 'year': None}","[Action, Adventure, Drama, Fantasy, Horror, Ps...",36017,93,False,30002,-1.0,"His name is Guts, the Black Swordsman, a feare...",berserk,"[tragedy, seinen, revenge, male protagonist, d...",0,1989-08-01
2,"{'english': '20th Century Boys', 'romaji': '20...","[{'name': 'Coming of Age', 'category': 'Theme-...",98859,249.0,88,"{'month': 9, 'year': 1999}","{'month': 4, 'year': 2006}","[Drama, Mystery, Psychological, Sci-Fi, Thriller]",7780,88,False,30003,22.0,"Humanity, having faced extinction at the end o...",20th century boys,"[coming of age, ensemble cast, politics, seine...",1,1999-09-01
3,"{'english': 'Yokohama Kaidashi Kikou', 'romaji...","[{'name': 'Iyashikei', 'category': 'Theme-Slic...",34640,144.0,86,"{'month': 4, 'year': 1994}","{'month': 2, 'year': 2006}","[Sci-Fi, Slice of Life]",3139,86,False,30004,14.0,Set hundreds of years in the future after an e...,yokohama kaidashi kikou,"[iyashikei, female protagonist, post-apocalypt...",1,1994-04-01
4,{'english': 'Hajime no Ippo: Fighting Spirit!'...,"[{'name': 'Boxing', 'category': 'Theme-Game-Sp...",32860,-1.0,87,"{'month': 9, 'year': 1989}","{'month': None, 'year': None}","[Action, Comedy, Drama, Sports]",3283,87,False,30007,-1.0,Makunouchi Ippo has been bullied his entire li...,hajime no ippo: fighting spirit!,"[boxing, shounen, male protagonist, primarily ...",0,1989-09-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,"{'english': 'Shadow of the Supreme', 'romaji':...","[{'name': 'Male Protagonist', 'category': 'Cas...",11717,-1.0,74,"{'month': 8, 'year': 2024}","{'month': None, 'year': None}","[Action, Adventure, Fantasy]",321,75,False,180890,-1.0,"Yang-Cheon Ku, born to the virtuous House of K...",shadow of the supreme,"[male protagonist, martial arts, full color, f...",0,2024-08-01
863,"{'english': 'Absolute Regression', 'romaji': '...","[{'name': 'Martial Arts', 'category': 'Theme-A...",11672,-1.0,80,"{'month': 7, 'year': 2024}","{'month': None, 'year': None}","[Action, Fantasy]",432,81,False,180891,-1.0,"The past has changed, but like a fiery comet, ...",absolute regression,"[martial arts, age regression, wuxia, revenge,...",0,2024-07-01
864,{'english': 'The Regressed Mercenary Has a Pla...,"[{'name': 'Kingdom Management', 'category': 'T...",15728,-1.0,80,"{'month': 9, 'year': 2024}","{'month': None, 'year': None}","[Action, Fantasy]",616,80,False,182066,-1.0,"Cecil Perdium, the Mercenary King, was killed ...",the regressed mercenary has a plan,"[kingdom management, age regression, male prot...",0,2024-09-01
865,"{'english': 'My Bias Gets on the Last Train', ...","[{'name': 'Heterosexual', 'category': 'Theme-R...",11532,-1.0,86,"{'month': 12, 'year': 2024}","{'month': None, 'year': None}","[Drama, Romance]",834,87,False,187944,-1.0,"Every night, Yeo-Un takes the last train home—...",my bias gets on the last train,"[heterosexual, primarily adult cast, work, lon...",0,2024-12-01


# User read history

In [29]:
# Load the raw per-user reading history export and preview it
user_df = pd.read_json(
    "../data/raw/user_readdata.json",
    encoding="utf-8",
)
user_df.head()

Unnamed: 0,mediaId,userId,user,progress,progressVolumes,score,notes,priority,private,repeat,status
0,30698,1,{'name': 'Josh'},40,6,3,,0,False,0,PAUSED
1,33500,1,{'name': 'Josh'},7,1,1,,0,False,0,PAUSED
2,35178,1,{'name': 'Josh'},6,0,2,,0,False,0,PAUSED
3,31158,1,{'name': 'Josh'},0,0,0,,0,False,0,PLANNING
4,53390,1,{'name': 'Josh'},64,10,2,,0,False,0,DROPPED


In [36]:
# Summarise column dtypes and non-null counts for the user frame
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9950 entries, 0 to 9949
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mediaId          9950 non-null   int64  
 1   userId           9950 non-null   int64  
 2   user             9950 non-null   object 
 3   progress         9950 non-null   int64  
 4   progressVolumes  9950 non-null   int64  
 5   score            9950 non-null   int64  
 6   notes            0 non-null      float64
 7   priority         9950 non-null   int64  
 8   private          9950 non-null   bool   
 9   repeat           9950 non-null   int64  
 10  status           9950 non-null   object 
dtypes: bool(1), float64(1), int64(7), object(2)
memory usage: 787.2+ KB


In [37]:
# Flatten the nested `user` dict into a plain `name` column.
# Rows whose `user` value is not a dict get None.
user_df['name'] = user_df['user'].apply(
    lambda entry: entry.get('name') if isinstance(entry, dict) else None
)

# Drop `notes`, which has no non-null values in this dataset.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
user_df = user_df.drop(columns=['notes'], errors='ignore')
user_df.head()

Unnamed: 0,mediaId,userId,user,progress,progressVolumes,score,priority,private,repeat,status,name
0,30698,1,{'name': 'Josh'},40,6,3,0,False,0,PAUSED,Josh
1,33500,1,{'name': 'Josh'},7,1,1,0,False,0,PAUSED,Josh
2,35178,1,{'name': 'Josh'},6,0,2,0,False,0,PAUSED,Josh
3,31158,1,{'name': 'Josh'},0,0,0,0,False,0,PLANNING,Josh
4,53390,1,{'name': 'Josh'},64,10,2,0,False,0,DROPPED,Josh
