In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# <p style="background-color:#F8F1E8; font-family:newtimeroman;color:#602F44; font-size:150%; text-align:center; border-radius: 15px 50px;"> ⇣ Reading and Cleaning Data ⇣</p>

In [2]:
anime = pd.read_csv("/kaggle/input/anime-recommendations-database/anime.csv")
anime.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
anime_id,32281,5114,28977,9253,9969,32935,11061,820,15335,15417
name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,Hunter x Hunter (2011),Ginga Eiyuu Densetsu,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,Gintama&#039;: Enchousen
genre,"Drama, Romance, School, Supernatural","Action, Adventure, Drama, Fantasy, Magic, Mili...","Action, Comedy, Historical, Parody, Samurai, S...","Sci-Fi, Thriller","Action, Comedy, Historical, Parody, Samurai, S...","Comedy, Drama, School, Shounen, Sports","Action, Adventure, Shounen, Super Power","Drama, Military, Sci-Fi, Space","Action, Comedy, Historical, Parody, Samurai, S...","Action, Comedy, Historical, Parody, Samurai, S..."
type,Movie,TV,TV,TV,TV,TV,TV,OVA,Movie,TV
episodes,1,64,51,24,51,10,148,110,1,13
rating,9.37,9.26,9.25,9.17,9.16,9.15,9.13,9.11,9.1,9.11
members,200630,793665,114262,673572,151266,93351,425855,80679,72534,81109


In [3]:
rating = pd.read_csv("/kaggle/input/anime-recommendations-database/rating.csv")
rating.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:100%; text-align:left">

<h3 align="left"><font color='#4E5672'>👀 Features</font></h3>


**Anime Data Features**
    
1. **anime_id**: Unique identifier for each anime.
2. **name**: The title or name of the anime.
3. **genre**: The genre or genres associated with the anime.
4. **type**: The type of the anime (e.g., TV, Movie, OVA).
5. **episodes**: The number of episodes the anime has.
6. **rating**: The average rating of the anime.
7. **members**: The number of members who have added the anime to their list.

**Rating Data Features**
    
1. **user_id**: Unique identifier for each user providing a rating.
2. **anime_id**: Unique identifier for the anime being rated.
3. **rating**: The rating given by the user to the specific anime, **-1 if the user watched it but didn't assign a rating**
    

In [4]:
rating["rating"].value_counts()

rating
 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: count, dtype: int64

In [5]:
rating = rating[rating["rating"] >= 8]

In [6]:
rating.shape

(3855830, 3)

<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>

*  Dropped ratings under 0 which means the user watched but didn't rate the anime

In [7]:
df = pd.merge(anime, rating, on="anime_id", how="inner", suffixes = ("_anime","_user"))
df.head(5).T

Unnamed: 0,0,1,2,3,4
anime_id,32281,32281,32281,32281,32281
name,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.
genre,"Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural"
type,Movie,Movie,Movie,Movie,Movie
episodes,1,1,1,1,1
rating_anime,9.37,9.37,9.37,9.37,9.37
members,200630,200630,200630,200630,200630
user_id,152,244,271,322,398
rating_user,10,10,10,10,10


<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>

*  Merged the anime and rating data, since both has rating feature, renamed it with suffix parameter

In [8]:
df["anime_id"].value_counts()

anime_id
1535     29541
16498    21529
1575     21439
5114     20500
6547     18993
         ...  
5022         1
5833         1
30813        1
33508        1
4266         1
Name: count, Length: 8265, dtype: int64

In [9]:
import statsmodels.stats.api as sms
low_conf, up_conf = sms.DescrStatsW(df["anime_id"].value_counts()).tconfint_mean()
print(f"Lower Confidence Interval: {low_conf:.0f}")
print(f"Upper Confidence Interval: {up_conf:.0f}")

Lower Confidence Interval: 436
Upper Confidence Interval: 497


<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>

The terms "lower confidence interval" and "upper confidence interval" are related to statistical inference and confidence intervals.

1. **Lower Confidence Interval:**
   - **Meaning:** In statistics, the lower confidence interval is the lower bound of a range of values within which a population parameter, such as the mean, is believed to lie with a certain level of confidence. It represents the lower limit of a range estimated from sample data.
    
2. **Upper Confidence Interval:**
   - **Meaning:** Similarly, the upper confidence interval is the upper bound of a range of values within which a population parameter is believed to lie with a certain level of confidence. It represents the upper limit of a range estimated from sample data.
    
* **So, we will consider the lower confidence interval as our minimum rated count and filter our data to include only those entries with more than 603 votes.**
 

In [10]:
values_pd = df["anime_id"].value_counts()

rare_animes = values_pd[values_pd < low_conf].index

df_ = df[~df["anime_id"].isin(rare_animes)]

df_["anime_id"].value_counts()

anime_id
1535     29541
16498    21529
1575     21439
5114     20500
6547     18993
         ...  
9751       437
20449      436
6948       436
7222       436
1847       436
Name: count, Length: 1592, dtype: int64

<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>
    
* So, we have cleaned approximately 8000 low-rated animes; this way, we will reduce the system requirements for our recommendation system
 

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#006600; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">Association Rule Learning</p>

In [11]:
user_anime_matrix = df_.groupby(["user_id","anime_id"])["rating_anime"].count().unstack().notnull()
user_anime_matrix

anime_id,1,5,6,7,15,16,18,19,20,22,...,32380,32542,32681,32729,32828,32935,32998,33028,34103,34240
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
73513,True,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
73514,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
73515,True,True,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>
    
* So, we have created our user-by-anime matrix, indicating whether a user has rated an anime or not. In this context, we are not concerned with how many points a user has assigned to an anime; the crucial aspect for us is whether a user has watched and rated an anime or not.
 

In [12]:
frequent_itemsets = apriori(user_anime_matrix,min_support=0.08,use_colnames=True,verbose=1)
frequent_itemsets.sort_values("support", ascending=False)
rules = association_rules(frequent_itemsets,metric="support",min_threshold=0.01)
rules

Processing 105 combinations | Sampling itemset size 543


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(1),(1535),0.170501,0.429612,0.094965,0.556977,1.296465,0.021716,1.287490,0.275674
1,(1535),(1),0.429612,0.170501,0.094965,0.221049,1.296465,0.021716,1.064892,0.400906
2,(1),(1575),0.170501,0.311786,0.084945,0.498209,1.597921,0.031785,1.371516,0.451100
3,(1575),(1),0.311786,0.170501,0.084945,0.272447,1.597921,0.031785,1.140122,0.543707
4,(1),(2001),0.170501,0.217547,0.082415,0.483367,2.221894,0.045323,1.514524,0.662971
...,...,...,...,...,...,...,...,...,...,...
1145,"(9253, 1575)","(2904, 16498)",0.135496,0.131584,0.080088,0.591070,4.491950,0.062259,2.123630,0.899221
1146,(2904),"(16498, 9253, 1575)",0.275966,0.086850,0.080088,0.290209,3.341482,0.056120,1.286504,0.967816
1147,(16498),"(2904, 9253, 1575)",0.313080,0.122320,0.080088,0.255806,2.091280,0.041792,1.179370,0.759658
1148,(9253),"(2904, 16498, 1575)",0.234723,0.126771,0.080088,0.341202,2.691491,0.050332,1.325489,0.821217


In [18]:
def arl_recommender(rules_df, id, rec=1):
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for i, k in enumerate(sorted_rules["antecedents"]):
        for j in list(k):
            if j == id :
                for k in list(sorted_rules.iloc[i]["consequents"]):
                    if k not in recommendation_list:
                        recommendation_list.append(k)

    return recommendation_list[0:rec]

In [19]:
df_[["anime_id","name"]][df_["name"].str.contains(r"^Naruto$", regex=True)].drop_duplicates()

Unnamed: 0,anime_id,name
2118630,20,Naruto


In [20]:
suggest_list = arl_recommender(rules,20,5)
suggest_list

[11757, 16498, 1535, 5114, 1575]

In [21]:
def check_id(data,id):
    name = data["name"][data["anime_id"] == id].iloc[0]
    return name

In [22]:
for suggest in suggest_list:
    print(check_id(anime,id=suggest))

Sword Art Online
Shingeki no Kyojin
Death Note
Fullmetal Alchemist: Brotherhood
Code Geass: Hangyaku no Lelouch


### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#006600; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">Content Based Filtering</p>

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
anime = pd.read_csv("/kaggle/input/anime-recommendations-database/anime.csv")
anime.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
anime_id,32281,5114,28977,9253,9969,32935,11061,820,15335,15417
name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,Hunter x Hunter (2011),Ginga Eiyuu Densetsu,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,Gintama&#039;: Enchousen
genre,"Drama, Romance, School, Supernatural","Action, Adventure, Drama, Fantasy, Magic, Mili...","Action, Comedy, Historical, Parody, Samurai, S...","Sci-Fi, Thriller","Action, Comedy, Historical, Parody, Samurai, S...","Comedy, Drama, School, Shounen, Sports","Action, Adventure, Shounen, Super Power","Drama, Military, Sci-Fi, Space","Action, Comedy, Historical, Parody, Samurai, S...","Action, Comedy, Historical, Parody, Samurai, S..."
type,Movie,TV,TV,TV,TV,TV,TV,OVA,Movie,TV
episodes,1,64,51,24,51,10,148,110,1,13
rating,9.37,9.26,9.25,9.17,9.16,9.15,9.13,9.11,9.1,9.11
members,200630,793665,114262,673572,151266,93351,425855,80679,72534,81109


In [32]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [35]:
anime.dropna(subset="genre",inplace=True)

In [36]:
anime.isnull().sum()

anime_id      0
name          0
genre         0
type         22
episodes      0
rating      215
members       0
dtype: int64

<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>
    
* Dropped the animes which has no Genre
 

In [38]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(anime['genre'])
tfidf.get_feature_names_out()

array(['action', 'adventure', 'ai', 'arts', 'cars', 'comedy', 'dementia',
       'demons', 'drama', 'ecchi', 'fantasy', 'fi', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'life', 'magic',
       'martial', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'power', 'psychological', 'romance', 'samurai', 'school',
       'sci', 'seinen', 'shoujo', 'shounen', 'slice', 'space', 'sports',
       'super', 'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype=object)

<div style="border-radius:10px; border:#484366 solid; padding: 15px; background-color: #FFEBCC; font-size:100%; text-align:left">

<h3 align="left"><font color='#484366'>💬 Comment</font></h3>
    
* Cleaned genres from their stop words

In [56]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.29450574, 0.31749916, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25046406, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:100%; text-align:left">

<h3 align="left"><font color='#4E5672'>📄 TF-IDF</font></h3>

TF-IDF, which stands for Term Frequency-Inverse Document Frequency, is a numerical statistic used in natural language processing and information retrieval to evaluate the importance of a word in a document relative to a collection of documents (corpus). Here's a breakdown of how TF-IDF works:

1. **Term Frequency (TF):**
   - TF measures how often a term (word) appears in a document. It is calculated as the ratio of the number of times a term appears in a document to the total number of terms in that document.
    

2. **Inverse Document Frequency (IDF):**
   - IDF measures the importance of a term across a collection of documents. It is calculated as the logarithm of the ratio of the total number of documents to the number of documents containing the term.
    

3. **TF-IDF Score:**
   - The TF-IDF score for a term in a document is obtained by multiplying the TF and IDF values.
    

4. **Application:**
   - Higher TF-IDF scores are given to terms that are frequent in a document but rare in the entire corpus. This helps to identify terms that are distinctive and important for describing the content of a particular document.
   - TF-IDF is often used in text mining, information retrieval, and document categorization to rank and filter terms based on their significance in individual documents within a larger collection.

In summary, TF-IDF is a technique that assigns weights to terms based on their frequency in a document and their rarity across a collection of documents, aiming to highlight terms that are both relevant and distinctive to each document.

In [67]:
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)
cosine_sim

array([[1.        , 0.14778251, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14778251, 1.        , 0.17849957, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.17849957, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:100%; text-align:left">

<h3 align="left"><font color='#4E5672'>📄 Cosine Similarity </font></h3>

Cosine Similarity, often used in text mining and information retrieval, measures the cosine of the angle between two non-zero vectors. In the context of text data, these vectors typically represent the term frequency or TF-IDF representation of documents. 

1. **Calculate Dot Product (A · B):**
  
2. **Calculate Magnitude (||A|| and ||B||):**
    
   - Calculate the magnitude (Euclidean norm) of each vector
    
  
3. **Compute Cosine Similarity:**
   - Cosine Similarity (cos θ) is then calculated as the dot product divided by the product of the magnitudes:

*    1 indicates perfect similarity, 0 indicates no similarity


### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#BE5F78; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #EDCCAF">Suggest - One Piece</p>

<img src="https://i.imgur.com/RI5DdLh.jpg" style ="text-align: center;">

In [76]:
index = anime[anime["name"].str.contains(r"^One Piece$", regex=True)].drop_duplicates().index[0]
index

74

In [77]:
similarity_scores = pd.DataFrame(cosine_sim[index],
                                 columns=["score"])

movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

anime['name'].iloc[movie_indices]

241     One Piece: Episode of Nami - Koukaishi no Nami...
74                                              One Piece
896     One Piece: Episode of Sabo - 3 Kyoudai no Kizu...
2723    One Piece Movie 3: Chinjuu-jima no Chopper Oukoku
1793                 One Piece Movie 5: Norowareta Seiken
352                One Piece Film: Strong World Episode 0
753     One Piece: Episode of Luffy - Hand Island no B...
2161                                      One Piece Recap
1795              One Piece: Umi no Heso no Daibouken-hen
1171    One Piece Movie 9: Episode of Chopper Plus - F...
Name: name, dtype: object

### <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#BE5F78; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #EDCCAF">Suggest - Naruto</p>

<img src="https://i.imgur.com/9qcvGGU.png" style ="text-align: center;">

In [82]:
index = anime[anime["name"].str.contains(r"^Naruto$", regex=True)].drop_duplicates().index[0]
index

841

In [83]:
similarity_scores = pd.DataFrame(cosine_sim[index],
                                 columns=["score"])

movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

anime['name'].iloc[movie_indices]

1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
486                              Boruto: Naruto the Movie
1343                                          Naruto x UT
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
2458                 Naruto Shippuuden: Sunny Side Battle
615                                    Naruto: Shippuuden
7628                              Kyutai Panic Adventure!
784            Naruto: Shippuuden Movie 6 - Road to Ninja
Name: name, dtype: object