In [19]:
import pandas as pd
import re
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


'''Import data files'''

# Anime title, categorical ratings (as percentages), and genres
anime_genre = pd.read_csv('/Users/nickburkhalter/Desktop/Data Science/Projects/Data Sets/Anime/anime-data-score-staff-synopsis-and-genre/datagenre-all-share-new.csv', 
                          sep='|')

anime_score = pd.read_csv('/Users/nickburkhalter/Desktop/Data Science/Projects/Data Sets/Anime/anime-data-score-staff-synopsis-and-genre/datascorehist-all-share-new.csv', 
                          sep='|')

anime_title = pd.read_csv('/Users/nickburkhalter/Desktop/Data Science/Projects/Data Sets/Anime/anime-data-score-staff-synopsis-and-genre/datatitle-all-share-new.csv', 
                          sep='|')


# Anime title, user ratings, and genres
anime_list = pd.read_csv('/Users/nickburkhalter/Desktop/Data Science/Projects/Data Sets/Anime/anime-recommendations-database/anime.csv')

user_rating = pd.read_csv('/Users/nickburkhalter/Desktop/Data Science/Projects/Data Sets/Anime/anime-recommendations-database/rating.csv')



print(anime_genre.shape)
print(anime_score.shape)
print(anime_title.shape)
print(anime_title.shape)
print(user_rating.shape)

(4029, 2)
(4029, 12)
(4029, 2)
(4029, 2)
(7813737, 3)


In [20]:
'''Analyze table columns to identify similarities'''

# From the results, we notice that the Dataframe anime_list seems to provide everything we need we want
# We will investigate this DataFrame further

print(anime_genre.columns)
print(anime_score.columns)
print(anime_title.columns)
print(anime_list.columns)
print(user_rating.columns)

Index(['Anime_ID', 'Genres'], dtype='object')
Index(['Anime_ID', 'Master-piece', 'Excellent', 'Very good', 'Good', 'Decent',
       'So-so', 'Not really good', 'Weak', 'Bad', 'Awful', 'Worst ever'],
      dtype='object')
Index(['Anime_ID', 'Anime_name'], dtype='object')
Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')
Index(['user_id', 'anime_id', 'rating'], dtype='object')


In [21]:
'''Analysis of the anime_list DataFrame'''
from collections import Counter

# Count the null values in each column of the dataset
print(anime_list.isnull().sum())

'''We see that there are 62 elements in the genre column without entries. Because the dataset contains 12294 anime,
and we are only interested in the top 50 Shounen anime, we can safely assume these 62 unclassified anime can be
dropped without adversely affecting our research'''
anime_list = anime_list.dropna(subset=['genre','type','rating'])

# View the shape of the dataset
print(anime_list.shape)

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
(12017, 7)


In [22]:
'''We will further clean the data by excluding the columns that are of no interest to us'''

anime_list = anime_list[['name', 'genre', 'type', 'rating', 'members']]

anime_list

Unnamed: 0,name,genre,type,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266
5,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,9.15,93351
6,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,9.13,425855
7,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,9.11,80679
8,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,9.10,72534
9,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.11,81109


In [23]:
'''And the last bit of cleaning we need to do before we can start having fun with the data is to sort the anime 
by their rating (though it looks like this has already been done, we're just going to double-check)'''

anime_list.sort_values('rating', ascending=False)        # Looks like we were wrong!

Unnamed: 0,name,genre,type,rating,members
10464,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,10.00,13
10400,Spoon-hime no Swing Kitchen,"Adventure, Kids",TV,9.60,47
9595,Mogura no Motoro,Slice of Life,Movie,9.50,62
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.37,200630
9078,Kahei no Umi,Historical,Movie,9.33,44
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
10786,Yakusoku: Africa Mizu to Midori,"Drama, Kids",OVA,9.25,53
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266


In [24]:
'''It looks like rating is not the best metric to base our search on. The members column seems to play a larger factor
in the popularity of an anime, so we will sort the data based on members next'''




'''From this, it looks like the members column will give us more honest results on the top 50 shounen anime. While 
rating seemed like a good metric at first, it turned out to be not so reliable simply because it showed only the 
"average" rating, and not how many people rated it (though the top rated anime in the dataset was a solid 10.0, it 
only received 13 reviews, compared to the most-watched Death Note, which received an 8.71 rating with over 1 million 
reviews)'''


anime_list.sort_values('members', ascending=False)        # MUCH better results!

Unnamed: 0,name,genre,type,rating,members
40,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,8.71,1013917
86,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,8.54,896229
804,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,7.83,893100
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
159,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,8.39,717796
19,Code Geass: Hangyaku no Lelouch,"Action, Mecha, Military, School, Sci-Fi, Super...",TV,8.83,715151
841,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,7.81,683297
3,Steins;Gate,"Sci-Fi, Thriller",TV,9.17,673572
445,Mirai Nikki (TV),"Action, Mystery, Psychological, Shounen, Super...",TV,8.07,657190
131,Toradora!,"Comedy, Romance, School, Slice of Life",TV,8.45,633817


In [25]:
'''Next, we will filter out the dataset so that the only ones remaining are those with \shounen\ contained in their
genre and filter out the top 50'''

anime_list = anime_list[anime_list['genre'].str.contains('Shounen')]

anime_list[:50]

Unnamed: 0,name,genre,type,rating,members
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266
5,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,9.15,93351
6,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,9.13,425855
8,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,9.1,72534
9,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.11,81109
11,Koe no Katachi,"Drama, School, Shounen",Movie,9.05,102733
12,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.04,336376
14,Haikyuu!! Second Season,"Comedy, Drama, School, Shounen, Sports",TV,8.93,179342


In [26]:
'''There seem to be a lot of duplicate titles in this set. We will further filter to show only TV shows'''

anime_list = anime_list.loc[anime_list['type'] == 'TV']

anime_list[:50]

Unnamed: 0,name,genre,type,rating,members
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266
5,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,9.15,93351
6,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,9.13,425855
9,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.11,81109
12,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.04,336376
14,Haikyuu!! Second Season,"Comedy, Drama, School, Shounen, Sports",TV,8.93,179342
16,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,8.92,416397
20,Hajime no Ippo,"Comedy, Drama, Shounen, Sports",TV,8.83,157670


In [29]:
'''We can see that there are multiple instances where the same title has many different shows. We will need to 
further edit the data so that we get the top 50 UNIQUE shows. To do this, we will utilize COSINE SIMILARITY to find
similar elements, then remove the similar elements based on a specified threshold. The tricl here is to make sure the
FIRST instance of the anime title remains in tact (if it gets deleted, it may inadvertently remove a top 50 anime from
our list)'''

# Use sklearn to find cosine similarity between anime names
tfidf = TfidfVectorizer().fit_transform(anime_list['name'])    # Convert strings in column 'name' to Tf-idf vectors
print(cosine_similarity(tfidf))                                # Calculate cosine similarity between all rows in df

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.76064177 ... 0.         0.         0.        ]
 [0.         0.76064177 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
