In [1]:
import pandas as pd 
import numpy as np

# Per-title feature table (scaled numeric columns plus 0/1 tag and genre flags,
# as shown by df.head() below); this is the input to the similarity helpers.
df = pd.read_parquet('../data/features/manga_features.parquet')
# Cleaned, human-readable metadata (title, tags, genres, ...) used to display results.
manga_info = pd.read_parquet('../data/cleaned/cleaned_manga_metadata.parquet')

Load features

In [2]:
# Preview the first rows of the feature table to check its columns and scaling.
df.head()

Unnamed: 0,popularity,chapters,averageScore,isAdult,id,has_end_date,release_year,4-koma,achromatic,achronological order,...,Mecha,Music,Mystery,Psychological,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Thriller
0,2.365659,0.974305,2.726926,0,30001,1,-2.204892,0,0,1,...,0,0,1,1,0,0,0,0,0,1
1,3.624993,-1.198447,3.149549,0,30002,0,-2.774917,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,2.358061,1.156746,2.092991,0,30003,1,-1.634867,0,0,1,...,0,0,1,1,0,1,0,0,0,1
3,0.756273,0.924391,1.670368,0,30004,1,-2.204892,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0.675697,-1.198447,1.88168,0,30007,0,-2.774917,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Cosine similarity

In [3]:
def cosine_similarity_matrix(df, columns=None, top_k=None, as_dataframe=True):
    """
    Compute pairwise cosine similarity between the rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to compare. An ``id`` column, if present, is dropped first so the
        identifier never contributes to the similarity. The caller's frame is
        not modified.
    columns : list of str, optional
        Numeric column names to use. If None, all numeric columns are used.
    top_k : int, optional
        If set, each row keeps its self-similarity plus its ``top_k`` highest
        similarities to other rows; every other entry of that row is set to 0.
        Filtering is done row by row, so the result is generally not symmetric.
        ``top_k=0`` keeps only the diagonal.
    as_dataframe : bool, default True
        If True, return a DataFrame labelled by ``df.index`` on both axes;
        otherwise return the raw numpy array.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        Square matrix with one row and one column per row of `df`.

    Raises
    ------
    ValueError
        If `top_k` is negative.
    """
    # Kept function-local so this definition does not depend on another cell
    # having imported scikit-learn.
    from sklearn.metrics.pairwise import cosine_similarity

    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be a non-negative integer")

    # drop() returns a new frame, so no defensive copy is needed;
    # errors="ignore" lets frames without an id column through.
    features = df.drop(columns=["id"], errors="ignore")

    if columns is None:
        num_df = features.select_dtypes(include=[np.number])
    else:
        num_df = features[columns]

    # Missing values are replaced by 0 before the similarity is computed.
    sim = cosine_similarity(num_df.fillna(0).values)

    if top_k is not None:
        n = sim.shape[0]
        for i in range(n):
            # Column positions ordered from most to least similar.
            order = np.argsort(sim[i])[::-1]
            # Skip the row itself, then take the best top_k remaining columns.
            neighbours = order[order != i][:top_k]
            drop_mask = np.ones(n, dtype=bool)
            drop_mask[i] = False
            drop_mask[neighbours] = False
            sim[i, drop_mask] = 0.0

    if as_dataframe:
        return pd.DataFrame(sim, index=df.index, columns=df.index)
    return sim

def top_k_similar_items(df, item, k=5, columns=None):
    """
    Find the `k` rows of `df` most similar to `item` by cosine similarity.

    - `item`: an index label of `df` (preferred). If it is not a label, it is
      converted with `int()` and treated as an integer position instead.
    - `k`: number of neighbours to return.
    - `columns`: forwarded to `cosine_similarity_matrix`.
    - Returns a pandas Series of similarity scores, highest first, indexed by
      the neighbours' labels; the entry for `item` itself is removed.
    - Raises KeyError when `item` is neither a label nor a usable position.
    """
    similarities = cosine_similarity_matrix(df, columns=columns, as_dataframe=True)
    labels = similarities.index

    if item not in labels:
        # Not a known label: fall back to interpreting it as a position.
        try:
            item = labels[int(item)]
        except Exception:
            raise KeyError("item not found as index or position")

    scores = similarities.loc[item]
    others = scores.drop(item)
    return others.sort_values(ascending=False).head(k)

In [4]:
# Find the metadata row for this title; the index label shown in the left-most
# column of the output is the value passed as `item` to the similarity helpers.
manga_info.loc[manga_info['title'].eq('jujutsu kaisen')]

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,genres,favourites,meanScore,isAdult,id,volumes,description,has_end_date
493,jujutsu kaisen,"[urban fantasy, demons, super power, shounen, ...",233000,272.0,80,2018-03-01,"[Action, Drama, Supernatural]",21088,80,False,101517,30.0,Although Yuji Itadori looks like your average ...,1


In [5]:
# Full pairwise similarity matrix over every row of the feature table.
# NOTE(review): `sims` is not referenced again in the cells shown here, and
# top_k_similar_items() recomputes the matrix internally on every call —
# confirm whether this line is still needed.
sims = cosine_similarity_matrix(df)

# Five nearest neighbours of index label 493 (the 'jujutsu kaisen' row found above).
top_k_similar_items(df, item=493, k=5)

531    0.634767
536    0.592479
481    0.549898
215    0.548825
8      0.548097
Name: 493, dtype: float64

In [6]:
# Index labels of the ten rows most similar to label 493, best match first.
top_matches = top_k_similar_items(df, item=493, k=10).index.tolist()

In [7]:
# top_matches holds index *labels* taken from the similarity result, so the
# metadata rows are selected by label (.loc), not by integer position (.iloc).
# The two only coincide while manga_info keeps a default 0..n-1 index.
# NOTE(review): assumes manga_info and df share index labels, as the lookup of
# label 493 above already does — confirm.
manga_info.loc[top_matches]

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,genres,favourites,meanScore,isAdult,id,volumes,description,has_end_date
531,jujutsu kaisen 0,"[shounen, demons, male protagonist, school, tr...",67316,4.0,80,2017-04-01,"[Action, Supernatural]",1999,80,False,105469,1.0,Yuta Okkotsu is a nervous high school student ...,1
536,chainsaw man,"[demons, tragedy, urban fantasy, dissociative ...",307416,-1.0,85,2018-12-01,"[Action, Comedy, Drama, Horror, Supernatural]",42848,85,False,105778,-1.0,The name says it all! Denji's life of poverty ...,0
481,hell’s paradise: jigokuraku,"[survival, samurai, philosophy, mythology, nin...",91160,127.0,81,2018-01-01,"[Action, Adventure, Mystery, Supernatural]",5752,81,False,100994,13.0,Gabimaru the Hollow is one of the most vicious...,1
215,attack on titan,"[dystopian, anti-hero, war, philosophy, traged...",217759,141.0,84,2009-09-01,"[Action, Drama, Fantasy, Mystery]",19740,84,False,53390,34.0,"In this post-apocalyptic sci-fi story, humanit...",1
8,bleach,"[shounen, swordplay, super power, ghost, male ...",111421,706.0,79,2001-08-01,"[Action, Adventure, Supernatural]",10396,79,False,30012,74.0,Ichigo Kurosaki has always been able to see gh...,1
295,mob psycho 100,"[coming of age, philosophy, super power, urban...",46577,113.0,85,2012-04-01,"[Action, Comedy, Drama, Psychological, Slice o...",3073,85,False,85189,16.0,Do you or someone you know need an exorcist wh...,1
268,one-punch man,"[superhero, super power, urban fantasy, parody...",163745,-1.0,85,2012-06-01,"[Action, Comedy, Sci-Fi, Supernatural]",13338,85,False,74347,-1.0,"In this new action-comedy, everything about a ...",0
107,dorohedoro,"[dissociative identities, magic, ensemble cast...",82205,191.0,86,2000-12-01,"[Action, Adventure, Comedy, Fantasy, Horror, M...",7926,86,False,31133,23.0,"In a city so dismal it's known only as ""the Ho...",1
324,tokyo ghoul:re,"[tragedy, dissociative identities, body horror...",95999,181.0,83,2014-10-01,"[Action, Drama, Horror, Mystery, Psychological...",8453,83,False,85611,16.0,"In Tokyo, an unchanging despair is lurking. My...",1
128,jojo's bizarre adventure part 6: stone ocean,"[female protagonist, super power, prison, trag...",69483,158.0,82,1999-12-01,"[Action, Adventure, Drama, Supernatural]",4579,82,False,33009,17.0,Jolyne Cujoh—daughter of Jotaro Kujo—is senten...,1


# Collaborative filtering

In [8]:
# User–manga interaction table (mediaId, userId, score, status,
# interaction_strength — see the preview below).
user_interactions = pd.read_parquet('../data/features/user_features.parquet')

In [9]:
# Preview the first interaction rows to check the columns and value ranges.
user_interactions.head()

Unnamed: 0,mediaId,userId,score,status,interaction_strength
0,30698,1,3,0.5,0.15
1,33500,1,1,0.5,0.05
2,35178,1,2,0.5,0.1
3,31158,1,0,0.4,0.0
4,53390,1,2,0.1,0.02


In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# One encoder per identifier column; both are kept as notebook-level names so
# the integer codes can be mapped back to the original ids later.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Add integer-coded versions of the raw user and manga ids as new columns.
user_interactions["user_idx"] = user_encoder.fit_transform(user_interactions["userId"])
user_interactions["item_idx"] = item_encoder.fit_transform(user_interactions["mediaId"])

# Number of distinct users and manga present in the interaction table.
n_users, n_items = (
    user_interactions[col].nunique() for col in ("user_idx", "item_idx")
)

print("Users:", n_users)
print("Manga:", n_items)


Users: 6
Manga: 47


In [17]:
# Sanity check: the number of distinct raw user ids should match the
# "Users:" count printed from the encoded column above.
user_interactions['userId'].nunique()

6