In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/CSCE670_ISR/project/Animendations/
!ls

/content/drive/MyDrive/CSCE670_ISR/project/Animendations
'Animendations Project Proposal.pdf'   app.py   LICENSE		      README.md
 Anushka_Preprocessing.ipynb	       data     Preprocessing.ipynb


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import ast

animes_data_path = "./data/animes.csv"
profiles_data_path = "./data/profiles.csv"
reviews_data_path = "./data/reviews.csv"

In [159]:
animes_data = pd.read_csv(animes_data_path)
profiles_data = pd.read_csv(profiles_data_path)
reviews_data = pd.read_csv(reviews_data_path, engine='python', sep=',', error_bad_lines=False)



  reviews_data = pd.read_csv(reviews_data_path, engine='python', sep=',', error_bad_lines=False)


In [160]:
animes = animes_data.copy()
profiles = profiles_data.copy()
reviews = reviews_data.copy()

In [161]:
# function to calculate sparsity given number of items, number of users, and number of ratings
def sparsity(num_items, num_users, num_ratings):
  return (num_items * num_users - num_ratings) / (num_users * num_items)

In [162]:
print(animes.shape)
print(profiles.shape)
print(reviews.shape)

(19311, 12)
(81727, 5)
(192112, 7)


### Dropped duplicates values wrt uid in animes
### Dropped duplicates values wrt profile in profiles
### Dropped duplicates wrt profile and uid in reviews

In [163]:
# drop all duplicates by primary key
animes.drop_duplicates('uid', keep = 'first', inplace=True, ignore_index = True)
animes.reset_index(drop = True, inplace=True)

profiles.drop_duplicates('profile', keep = 'first', inplace=True, ignore_index = True)
profiles.reset_index(drop = True, inplace=True)

reviews.drop_duplicates(subset = ['profile', 'uid'], keep = 'first', inplace=True, ignore_index = True)
reviews.reset_index(drop = True, inplace=True)

In [164]:
# shapes after dropping duplicates match exactly with what is given on the Kaggle dataset description
print(animes.shape)
print(profiles.shape)
print(reviews.shape)

(16216, 12)
(47885, 5)
(130519, 7)


### Dropped the birthday column -> We are not using age
### Removing genres as only Hentai

In [165]:
# drop age column initially for base model setup
profiles.drop("birthday", axis='columns', inplace=True)
profiles.reset_index(drop = True, inplace=True)

# drop animes which are only Hentai, and (optionally) drop animes with only one episode (feel free to change and not have it dropped)
animes = animes[animes.genre != "['Hentai']"]
print(animes.shape)

# drop all review entries that do not have the animes or profiles after the above preprocessing steps
reviews = reviews[reviews.anime_uid.isin(animes.uid)]
reviews = reviews[reviews.profile.isin(profiles.profile)]
print(profiles.shape)
print(reviews.shape)

(15293, 12)
(47885, 4)
(128771, 7)


### Removed all users with less than 3 ratings and corresponding reviews and any unique anime to them

In [166]:
#remove all reviews of users who have given lesser than three reviews
freq = reviews['profile'].value_counts()
frequent_values = freq[freq >= 3].index
reviews = reviews[reviews['profile'].isin(frequent_values)]
print(reviews.shape)

# Remove all users from profiles dataset who have given lesser than three reviews
profiles = profiles[profiles.profile.isin(reviews.profile)]
print(profiles.shape)

# remove all animes after the above two preprocessing steps
animes = animes[animes.uid.isin(reviews.anime_uid)]
print(animes.shape)

(84150, 7)
(10692, 4)
(6787, 12)


In [167]:
animes[animes['episodes'] == 1].shape[0]/animes.shape[0] ### The percentage distribution of movies gets skewed after removing the animes which have freq reviews

0.38205392662442905

In [168]:
# current sparsity of the dataset after preprocessing
print(sparsity(animes.shape[0], profiles.shape[0], reviews.shape[0]))

0.9988403756637144


## Cleaned 'genre' in animes table

In [169]:
animes.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [170]:
def clean_genres(genres):
  """
  Coinverts the str(list) -> list format
  """
  genres = genres[1:-1].strip().split(',')
  genres = list(set(genre.strip()[1:-1] for genre in genres))
  return genres

In [176]:
animes['genre'] = animes.apply(lambda x: clean_genres(x['genre']), axis = 1)
animes.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"[Shounen, Sports, School, Comedy, Drama]","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"[Shounen, Romance, School, Music, Drama]","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"[Sci-Fi, Mystery, Adventure, Fantasy, Drama]","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","[Shounen, Action, Military, Adventure, Magic, ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"[Mystery, Action, Vampire, Supernatural]","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


## Cleaned 'favorites_anime' in profiles table

In [177]:
profiles.head()

Unnamed: 0,profile,gender,favorites_anime,link
0,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans
2,skrn,,"['918', '2904', '11741', '17074', '23273', '32...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,"['5680', '849', '2904', '3588', '37349']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [178]:
def clean_favorites_anime(favorites_anime):
  """
  Coinverts the str(list) -> list format
  """
  favorites_anime = favorites_anime[1:-1].strip().split(',')
  favorites_anime = list(set(anime.strip()[1:-1] for anime in favorites_anime))
  return favorites_anime

In [179]:
profiles['favorites_anime'] = profiles.apply(lambda x: clean_favorites_anime(x['favorites_anime']), axis = 1)
profiles.head()

Unnamed: 0,profile,gender,favorites_anime,link
0,DesolatePsyche,Male,"[25013, 33352, 33674, 269, 8525, 457, 17074, 5...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"[918, 6956, 853, 34599, 11061, 3588, 31964, 13...",https://myanimelist.net/profile/baekbeans
2,skrn,,"[918, 17074, 32281, 2904, 9989, 1943, 512, 117...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,"[5680, 37349, 2904, 849, 3588]",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"[9617, 5680, 4181, 235, 31646, 4382, 7791, 216...",https://myanimelist.net/profile/aManOfCulture99


## Cleaned 'scores' in reviews table
## Removed unnecessary review texts and stored them in 'text' in reviews table

In [180]:
reviews.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '7', 'Animation': '9...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Animation': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Overall': '10', 'Story': '10', 'Animation': ...",https://myanimelist.net/reviews.php?id=291149


In [181]:
def format_scores(scores):
  """
  The scores are poresent in form of str(dictionary) which is converted to a dictionary
  """
  scores = scores[1:-1].strip().split(',')
  scores = list(set(score.strip()[1:-1] for score in scores))
  scores = [score.split(':') for score in scores]
  scores = [[score[0].strip()[:-1], score[1].strip()[1:]] for score in scores]
  res_dict = dict()
  res_dict = {score[0]: score[1] for score in scores}
  # scores = [score[0] score[1] for score in scores)
  # print(type(res_dict))
  return res_dict

In [182]:
reviews['scores'] = reviews.apply(lambda x: format_scores(x['scores']), axis = 1)
reviews.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,\n \n \n \n ...,8,"{'Sound': '10', 'Overall': '8', 'Animation': '...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,\n \n \n \n ...,10,"{'Story': '10', 'Animation': '10', 'Sound': '1...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,\n \n \n \n ...,7,"{'Story': '7', 'Sound': '8', 'Overall': '7', '...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,\n \n \n \n ...,9,"{'Overall': '9', 'Story': '9', 'Enjoyment': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,\n \n \n \n ...,10,"{'Story': '10', 'Sound': '9', 'Overall': '10',...",https://myanimelist.net/reviews.php?id=291149


In [183]:
def clean_reviews(reviews):
  """
  The reviews test present are very sparsely put, lot of new lines and redundant information as scores column.
  Hence, cleaned all texts in every line which has less than 30 charcaters => This removes all unuseful information.
  Stores the orginal review by the user
  """
  # print(reviews)
  reviews = reviews.strip().split('\n')
  # print(reviews)
  reviews = [review.strip().lower() for review in reviews if len(review)>=30]
  reviews = ' '.join(review for review in reviews)
  return reviews

In [184]:
reviews['text'] = reviews.apply(lambda x: clean_reviews(x['text']), axis = 1)
reviews.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,"first things first. my ""reviews"" system is exp...",8,"{'Sound': '10', 'Overall': '8', 'Animation': '...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,let me start off by saying that made in abyss ...,10,"{'Story': '10', 'Animation': '10', 'Sound': '1...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,"art 9/10: it is great, especially the actions ...",7,"{'Story': '7', 'Sound': '8', 'Overall': '7', '...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,taking place 1 yr from where season 1 trailed ...,9,"{'Overall': '9', 'Story': '9', 'Enjoyment': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,kyoto animations greatest strength is being ab...,10,"{'Story': '10', 'Sound': '9', 'Overall': '10',...",https://myanimelist.net/reviews.php?id=291149


## Storing the preprocessed tables

In [187]:
animes.to_csv('./preprocessed_data/animes.csv', index = False)

In [192]:
animes_data = pd.read_csv('./preprocessed_data/animes.csv')
animes_data.head()

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Shounen', 'Sports', 'School', 'Comedy', 'Dra...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,https://cdn.myanimelist.net/images/anime/9/766...,https://myanimelist.net/anime/28891/Haikyuu_Se...
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Shounen', 'Romance', 'School', 'Music', 'Dra...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,https://cdn.myanimelist.net/images/anime/3/671...,https://myanimelist.net/anime/23273/Shigatsu_w...
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Mystery', 'Adventure', 'Fantasy', ...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,https://cdn.myanimelist.net/images/anime/6/867...,https://myanimelist.net/anime/34599/Made_in_Abyss
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Shounen', 'Action', 'Military', 'Adventure',...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,https://cdn.myanimelist.net/images/anime/1223/...,https://myanimelist.net/anime/5114/Fullmetal_A...
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Mystery', 'Action', 'Vampire', 'Supernatural']","Jan 6, 2017",1.0,214621,502,22.0,8.83,https://cdn.myanimelist.net/images/anime/3/815...,https://myanimelist.net/anime/31758/Kizumonoga...


In [195]:
profiles.to_csv('./preprocessed_data/profiles.csv', index = False)

In [196]:
profiles_data = pd.read_csv('./preprocessed_data/profiles.csv')
profiles_data.head()

Unnamed: 0,profile,gender,favorites_anime,link
0,DesolatePsyche,Male,"['25013', '33352', '33674', '269', '8525', '45...",https://myanimelist.net/profile/DesolatePsyche
1,baekbeans,Female,"['918', '6956', '853', '34599', '11061', '3588...",https://myanimelist.net/profile/baekbeans
2,skrn,,"['918', '17074', '32281', '2904', '9989', '194...",https://myanimelist.net/profile/skrn
3,edgewalker00,Male,"['5680', '37349', '2904', '849', '3588']",https://myanimelist.net/profile/edgewalker00
4,aManOfCulture99,Male,"['9617', '5680', '4181', '235', '31646', '4382...",https://myanimelist.net/profile/aManOfCulture99


In [199]:
type(profiles_data['favorites_anime'][0])

str

In [200]:
reviews.to_csv('./preprocessed_data/reviews.csv', index = False)

In [201]:
reviews_data = pd.read_csv('./preprocessed_data/reviews.csv')
reviews_data.head()

Unnamed: 0,uid,profile,anime_uid,text,score,scores,link
0,255938,DesolatePsyche,34096,"first things first. my ""reviews"" system is exp...",8,"{'Sound': '10', 'Overall': '8', 'Animation': '...",https://myanimelist.net/reviews.php?id=255938
1,259117,baekbeans,34599,let me start off by saying that made in abyss ...,10,"{'Story': '10', 'Animation': '10', 'Sound': '1...",https://myanimelist.net/reviews.php?id=259117
2,253664,skrn,28891,"art 9/10: it is great, especially the actions ...",7,"{'Story': '7', 'Sound': '8', 'Overall': '7', '...",https://myanimelist.net/reviews.php?id=253664
3,8254,edgewalker00,2904,taking place 1 yr from where season 1 trailed ...,9,"{'Overall': '9', 'Story': '9', 'Enjoyment': '9...",https://myanimelist.net/reviews.php?id=8254
4,291149,aManOfCulture99,4181,kyoto animations greatest strength is being ab...,10,"{'Story': '10', 'Sound': '9', 'Overall': '10',...",https://myanimelist.net/reviews.php?id=291149


In [204]:
type(reviews_data['scores'][0])

str