import pandas as pd 
import numpy as np

# Feature table that the similarity functions below operate on.
df = pd.read_parquet('../data/features/manga_features.parquet')
# Cleaned metadata (title, tags, genres, ...) used to display the matched rows.
manga_info = pd.read_parquet('../data/cleaned/cleaned_manga_metadata.parquet')

Load features

In [2]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,popularity,chapters,averageScore,id,has_end_date,release_year,4-koma,achromatic,achronological order,acrobatics,...,Mecha,Music,Mystery,Psychological,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Thriller
0,2.354564,0.977141,2.72116,30001,1,-2.192731,0,0,1,0,...,0,0,1,1,0,0,0,0,0,1
1,3.611489,-1.190957,3.143884,30002,0,-2.760574,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,2.34698,1.159191,2.087075,30003,1,-1.624887,0,0,1,0,...,0,0,1,1,0,1,0,0,0,1
3,0.748257,0.927334,1.664351,30004,1,-2.192731,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0.667836,-1.190957,1.875713,30007,0,-2.760574,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Cosine similarity

In [4]:
def _pairwise_cosine(X):
    """Return the (n, n) cosine similarity between the rows of a dense 2-D float array."""
    try:
        from sklearn.metrics.pairwise import cosine_similarity
    except ImportError:
        # NumPy fallback for environments without scikit-learn: scale every row
        # to unit length and take dot products. All-zero rows keep a divisor of 1,
        # so their similarity to every row (including themselves) is 0.
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        unit_rows = X / norms
        return unit_rows @ unit_rows.T
    return cosine_similarity(X)


def _keep_top_k(sim, top_k):
    """Zero every entry except the diagonal and each row's top_k largest off-diagonal values."""
    n = sim.shape[0]
    n_keep = int(min(top_k, n - 1))
    keep = np.eye(n, dtype=bool)  # the self-similarity is always kept
    if n_keep > 0:
        ranked = sim.astype(float)  # astype copies, so `sim` itself is not modified
        np.fill_diagonal(ranked, -np.inf)  # a row must never pick itself as a neighbour
        # Stable sort on the negated scores: ties go to the earlier row.
        best = np.argsort(-ranked, axis=1, kind="stable")[:, :n_keep]
        keep[np.arange(n)[:, None], best] = True
    return np.where(keep, sim, 0.0)


def cosine_similarity_matrix(df, columns=None, top_k=None, as_dataframe=True):
    """
    Compute cosine similarity between rows.
    - df: DataFrame with rows to compare. It is not modified.
    - columns: list of numeric column names to use. If None, uses all numeric columns
      except 'id' (an identifier, not a feature); a missing 'id' column is fine.
    - top_k: if set, zero out all but the top_k highest similarities per row. The row's
      similarity to itself is always kept and does not count towards top_k, so top_k=0
      leaves only the diagonal. Must be non-negative.
    - as_dataframe: return a pandas DataFrame (indexed by df.index) if True, else numpy array.
    Missing values are treated as 0. Raises ValueError for a negative top_k, for
    infinite feature values, or when there are rows but no feature columns.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative")

    if columns is None:
        features = df.drop(columns=["id"], errors="ignore").select_dtypes(include=[np.number])
    else:
        features = df[columns]

    X = features.fillna(0).to_numpy(dtype=float)

    if X.shape[0] == 0:
        # Nothing to compare: return an empty matrix instead of failing downstream.
        sim = np.zeros((0, 0), dtype=float)
    else:
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError("no feature columns to compare")
        if not np.isfinite(X).all():
            raise ValueError("feature columns contain infinite values")
        sim = _pairwise_cosine(X)

    if top_k is not None:
        sim = _keep_top_k(sim, top_k)

    if as_dataframe:
        return pd.DataFrame(sim, index=features.index, columns=features.index)
    return sim

def top_k_similar_items(df, item, k=5, columns=None):
    """
    Return the k rows most similar to `item` by cosine similarity.
    - df: DataFrame with rows to compare.
    - item: index label of the query row (preferred). If it is not a label in the index,
      it is treated as an integer position instead; a value that is a label always
      wins over the same value as a position.
    - k: number of neighbours to return.
    - columns: forwarded to cosine_similarity_matrix.
    - returns a pandas Series of similarity scores sorted descending (excludes the item itself).
    Raises KeyError if `item` is neither an index label nor a valid position.
    Note: the full pairwise similarity matrix is recomputed on every call.
    """
    sim_df = cosine_similarity_matrix(df, columns=columns, as_dataframe=True)
    if item not in sim_df.index:
        # Fall back to positional lookup. Only the failures this lookup can produce are
        # translated into KeyError, so unrelated errors are not masked.
        try:
            item = sim_df.index[int(item)]
        except (TypeError, ValueError, OverflowError, IndexError) as err:
            raise KeyError("item not found as index or position") from err
    scores = sim_df.loc[item].drop(item)
    return scores.sort_values(ascending=False).head(k)
# ...existing code...

In [5]:
# Find the metadata row (and its index label) for the query title via an exact title match.
manga_info.loc[manga_info['title'].eq('uzumaki: spiral into horror')]


Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,genres,favourites,meanScore,isAdult,id,volumes,description,has_end_date
53,uzumaki: spiral into horror,"[tragedy, body horror, cosmic horror, seinen, ...",82862,20.0,79,1998-01-01,"[Drama, Horror, Mystery, Psychological, Romanc...",4522,79,False,30436,3.0,"Kurozu-cho, a small fogbound town on the coast...",1


In [6]:
# Full pairwise similarity matrix over all rows of df.
# NOTE(review): `sims` is not read by any cell visible here — confirm a later cell uses it.
sims = cosine_similarity_matrix(df) 
# 53 is the index label shown for 'uzumaki: spiral into horror' in the manga_info lookup
# above. top_k_similar_items builds its own similarity matrix and does not reuse `sims`.
top_k_similar_items(df, item=53, k=5)

94     0.646712
242    0.610982
98     0.543087
324    0.541062
695    0.515857
Name: 53, dtype: float64

Find top matches

In [7]:
# Index labels of the 10 rows most similar to row 53.
top_matches = list(top_k_similar_items(df, item=53, k=10).index)
# top_matches holds index *labels*, so select with .loc; .iloc would read them as
# positions and only agrees while both frames keep a default 0..n-1 index.
# Assumes df and manga_info share index labels (item=53 above is a manga_info label) — confirm.
manga_info.loc[top_matches]

Unnamed: 0,title,tags,popularity,chapters,averageScore,startDate,genres,favourites,meanScore,isAdult,id,volumes,description,has_end_date
94,tomie,"[body horror, monster girl, gore, tragedy, fem...",39710,13.0,76,1987-02-01,"[Drama, Fantasy, Horror, Supernatural, Thriller]",1585,76,False,30912,1.0,Tomie Kawakami is a femme fatale with long bla...,1
242,tokyo ghoul,"[tragedy, cannibalism, urban fantasy, gore, ma...",181136,144.0,84,2011-09-01,"[Action, Horror, Mystery, Psychological, Super...",15534,84,False,63327,14.0,Shy Ken Kaneki is thrilled to go on a date wit...,1
98,homunculus,"[philosophy, seinen, psychosexual, male protag...",65736,166.0,83,2003-03-01,"[Drama, Horror, Mystery, Psychological, Supern...",4322,83,False,30936,15.0,"Nakoshi Susumu, age 34, is living out of his c...",1
324,tokyo ghoul:re,"[tragedy, dissociative identities, body horror...",95999,181.0,83,2014-10-01,"[Action, Drama, Horror, Mystery, Psychological...",8453,83,False,85611,16.0,"In Tokyo, an unchanging despair is lurking. My...",1
695,bibliomania,"[post-apocalyptic, philosophy, female protagon...",21033,12.0,78,2016-09-01,"[Adventure, Drama, Fantasy, Horror, Psychologi...",1124,78,False,126135,1.0,Alice had woken up in Room 431 of a mysterious...,1
479,sweet home,"[post-apocalyptic, survival, demons, tragedy, ...",51760,141.0,81,2017-10-01,"[Drama, Horror, Psychological, Supernatural, T...",2929,82,False,100954,12.0,"After an unexpected family tragedy, a reclusiv...",1
15,death note,"[detective, police, anti-hero, tragedy, urban ...",93049,108.0,84,2003-12-01,"[Drama, Mystery, Psychological, Supernatural, ...",5372,84,False,30021,12.0,Light Yagami is an ace student with great pros...,1
45,blame!,"[cyborg, seinen, post-apocalyptic, cyberpunk, ...",59141,66.0,81,1997-01-01,"[Action, Drama, Horror, Psychological, Sci-Fi]",3734,81,False,30149,10.0,"In a future version of Earth, there is a city ...",1
50,parasyte,"[male protagonist, seinen, body horror, aliens...",30646,64.0,82,1989-11-01,"[Action, Drama, Horror, Psychological, Sci-Fi]",1548,82,False,30401,10.0,They arrive in silence and darkness. They desc...,1
396,the promised neverland,"[survival, primarily child cast, body horror, ...",127432,181.0,79,2016-08-01,"[Drama, Fantasy, Horror, Mystery, Psychologica...",7113,79,False,87423,20.0,"Emma, Norman and Ray are the brightest kids at...",1


# Collaborative filtering

In [8]:
# Per-user interaction records; the preview below shows mediaId, userId, progress, score and status flags.
user_interactions = pd.read_parquet('../data/features/user_features.parquet')

In [9]:
# Preview the first interaction rows.
user_interactions.head()

Unnamed: 0,mediaId,userId,progress,score,status_COMPLETED,status_CURRENT,status_DROPPED,status_PAUSED,status_PLANNING
0,30698,1,40,3,False,False,False,True,False
1,33500,1,7,1,False,False,False,True,False
2,35178,1,6,2,False,False,False,True,False
3,31158,1,0,0,False,False,False,False,True
4,53390,1,64,2,False,False,True,False,False


In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per id column; each maps the raw ids to integer codes.
user_encoder = LabelEncoder().fit(user_interactions["userId"])
item_encoder = LabelEncoder().fit(user_interactions["mediaId"])

# Store the encoded ids next to the raw ones.
user_interactions["user_idx"] = user_encoder.transform(user_interactions["userId"])
user_interactions["item_idx"] = item_encoder.transform(user_interactions["mediaId"])

# Number of distinct users and items present in the interactions.
n_users = user_interactions["user_idx"].nunique()
n_items = user_interactions["item_idx"].nunique()

print("Users:", n_users)
print("Manga:", n_items)


Users: 6
Manga: 47


In [11]:
# Cross-check: distinct raw user ids, to compare with the encoded user count printed above.
user_interactions['userId'].nunique()

6