# Anime Recommendation System (Cosine Similarity)
This notebook loads and cleans the dataset, builds a content-based recommender, evaluates it with a proxy metric, and saves artifacts used by the Streamlit app.

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
import joblib, os, json

DATA_PATH = "anime.csv"
CLEAN_PATH = "anime_cleaned.csv"
ART_DIR = "anime_reco_artifacts"
os.makedirs(ART_DIR, exist_ok=True)

#load dataset
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Cleaning

In [2]:

# Standardize column names and ensure required columns exist
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
if "name" not in df.columns and "title" in df.columns:
    df["name"] = df["title"]
df = df.dropna(subset=["name"])

def clean_genres(g):
    if isinstance(g, str) and g.strip():
        parts = [p.strip() for p in g.replace(";", ",").split(",") if p.strip()]
        seen = set(); out=[]
        for p in parts:
            if p.lower() not in seen:
                out.append(p); seen.add(p.lower())
        return ", ".join(out)
    return ""

if "genre" in df.columns:
    df["genre"] = df["genre"].apply(clean_genres)
else:
    df["genre"] = ""

if "type" not in df.columns:
    df["type"] = "Unknown"
df["type"] = df["type"].fillna("Unknown").replace("", "Unknown")

if "episodes" not in df.columns:
    df["episodes"] = np.nan
df["episodes"] = pd.to_numeric(df["episodes"], errors="coerce")
med_eps = int(np.nanmedian(df["episodes"])) if np.isfinite(np.nanmedian(df["episodes"])) else 12
df["episodes"] = df["episodes"].fillna(med_eps).astype(int)

if "rating" not in df.columns:
    df["rating"] = np.nan
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
mean_rating = df["rating"].mean(skipna=True)
if np.isnan(mean_rating):
    mean_rating = 6.5
df["rating"] = df["rating"].fillna(round(mean_rating, 2))

if "members" not in df.columns:
    df["members"] = 0
df["members"] = pd.to_numeric(df["members"], errors="coerce").fillna(0)

keep_cols = [c for c in ["anime_id", "name", "genre", "type", "episodes", "rating", "members"] if c in df.columns]
df_clean = df[keep_cols].drop_duplicates(subset=["name"]).reset_index(drop=True)
df_clean.to_csv(CLEAN_PATH, index=False)
df_clean.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Feature Extraction & Similarity

In [3]:

from scipy.sparse import hstack

tfidf = TfidfVectorizer(token_pattern=r"[^,]+", lowercase=True, strip_accents="unicode")
X_genre = tfidf.fit_transform(df_clean["genre"].fillna(""))
# ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

X_type = ohe.fit_transform(df_clean[["type"]])

num_feats = np.vstack([
    df_clean["rating"].values.astype(float),
    np.log1p(df_clean["members"].values.astype(float)),
    df_clean["episodes"].values.astype(float)
]).T
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
X_num = scaler.fit_transform(csr_matrix(num_feats))

X_all = hstack([X_genre, X_type, X_num]).tocsr()
similarity_matrix = cosine_similarity(X_all, dense_output=False)


## Recommend Function

In [4]:

name_to_index = {n.lower(): i for i, n in enumerate(df_clean["name"])}

def recommend(title, top_n=10, min_sim=0.15):
    idx = name_to_index.get(title.lower())
    if idx is None:
        # fallback to simple substring search
        lower = title.lower()
        candidates = [i for i, n in enumerate(df_clean["name"]) if lower in str(n).lower()]
        if not candidates:
            return pd.DataFrame(columns=["name", "similarity", "genre", "type", "episodes", "rating", "members"])
        idx = candidates[0]
    sims = similarity_matrix.getrow(idx).toarray().ravel()
    order = np.argsort(-sims)
    rec_ids = [j for j in order if j != idx and sims[j] >= min_sim][:top_n]
    out = df_clean.loc[rec_ids, ["name", "genre", "type", "episodes", "rating", "members"]].copy()
    out.insert(1, "similarity", sims[rec_ids])
    return out.reset_index(drop=True)

recommend(df_clean.loc[0, "name"], top_n=5, min_sim=0.15)


Unnamed: 0,name,similarity,genre,type,episodes,rating,members
0,Kokoro ga Sakebitagatterunda.,0.998463,"Drama, Romance, School",Movie,1,8.32,59652
1,Harmonie,0.998075,"Drama, School, Supernatural",Movie,1,7.52,29029
2,Air Movie,0.997085,"Drama, Romance, Supernatural",Movie,1,7.39,44179
3,Momo e no Tegami,0.996823,"Drama, Supernatural",Movie,1,7.78,30519
4,Hotarubi no Mori e,0.996681,"Drama, Romance, Shoujo, Supernatural",Movie,1,8.61,197439


## Proxy Evaluation (Precision@10, Recall@10)

In [5]:

def precision_recall_at_k(k=10, min_sim=0.15, sample_size=200, random_state=42):
    rng = np.random.default_rng(random_state)
    indices = rng.choice(len(df_clean), size=min(sample_size, len(df_clean)), replace=False)
    precs, recs = [], []
    genre_sets = [set([g.strip().lower() for g in s.split(",") if g.strip()]) for s in df_clean["genre"].fillna("")]
    for idx in indices:
        sims = similarity_matrix.getrow(idx).toarray().ravel()
        order = np.argsort(-sims)
        rec_ids = [j for j in order if j != idx and sims[j] >= min_sim][:k]
        target_g = genre_sets[idx]
        if not target_g:
            continue
        relevant = [j for j in range(len(df_clean)) if j != idx and (genre_sets[j] & target_g)]
        if not relevant:
            continue
        hits = sum(1 for j in rec_ids if genre_sets[j] & target_g)
        prec = hits / max(len(rec_ids), 1)
        rec = hits / len(relevant)
        precs.append(prec); recs.append(rec)
    return float(np.mean(precs)) if precs else np.nan, float(np.mean(recs)) if recs else np.nan

p_at_10, r_at_10 = precision_recall_at_k(k=10, min_sim=0.15, sample_size=300)
p_at_10, r_at_10


(0.9878378378378377, 0.004201031351493855)

## Save Artifacts for the Streamlit App

In [6]:

meta = {
    "display_cols": list(df_clean.columns),
    "index_to_name": df_clean["name"].tolist(),
    "evaluation": {
        "precision_at_10": p_at_10,
        "recall_at_10": r_at_10,
        "k": 10,
        "min_sim": 0.15
    }
}
joblib.dump(tfidf, os.path.join(ART_DIR, "tfidf.pkl"))
joblib.dump(ohe, os.path.join(ART_DIR, "type_ohe.pkl"))
joblib.dump(scaler, os.path.join(ART_DIR, "num_scaler.pkl"))
joblib.dump(X_all, os.path.join(ART_DIR, "X_all.npz"))
joblib.dump(similarity_matrix, os.path.join(ART_DIR, "similarity_matrix.npz"))
joblib.dump(df_clean, os.path.join(ART_DIR, "df_clean.pkl"))
import json
with open(os.path.join(ART_DIR, "meta.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
"Artifacts saved."


'Artifacts saved.'

### 1. Can you explain the difference between user-based and item-based collaborative filtering?

**User-Based Collaborative Filtering**  
- Focuses on finding similarity **between users**.  
- Idea: If two users have rated or interacted with items similarly, they are considered similar.  
- Recommendation: A user is recommended items that similar users liked.  
- **Example:** If User A and User B both liked the same 5 movies, and User A also liked a 6th movie, then that movie may be recommended to User B.  

**Item-Based Collaborative Filtering**  
- Focuses on finding similarity **between items**.  
- Idea: If two items are rated similarly by many users, they are considered similar.  
- Recommendation: A user is recommended items that are similar to the ones they already liked.  
- **Example:** If most users who liked "Harry Potter" also liked "Lord of the Rings", then "Lord of the Rings" will be recommended to a new user who liked "Harry Potter".  

**Key Difference:**  
- User-based → “Find people similar to me, and recommend what they liked.”  
- Item-based → “Find items similar to what I liked, and recommend those.”  

---

### 2. What is collaborative filtering, and how does it work?

**Collaborative Filtering**  
- A recommendation system technique that makes predictions based on **past behavior and preferences of users**.  
- It does not require explicit knowledge of item features (like genre, category, etc.).  
- Works on the assumption that **users with similar behavior in the past will have similar preferences in the future**.  

**How it works:**  
1. Collect user-item interaction data (ratings, clicks, purchases).  
2. Build a **user-item matrix** (rows = users, columns = items).  
3. Calculate similarities (either between users or between items).  
4. Generate recommendations using those similarities.  

**Types of Collaborative Filtering:**  
- **User-based Collaborative Filtering**  
- **Item-based Collaborative Filtering**  

**Advantages:** Simple and effective, especially when user behavior data is rich.  
**Limitations:** Struggles with *cold-start problem* (new users/items) and *sparsity* (when user-item matrix is mostly empty).  
