In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Importing Datasets

In [2]:
#importing "anime_marlesson_jan20.csv"
anime_data = pd.read_csv("animes_marlesson_jan20.csv")
print("Number of Columns in anime_data: " + str(len(anime_data.columns)))
print("Number of Observations in anime_data: " + str(len(anime_data)))
#importing "profiles_marlesson_jan20.csv"
users = pd.read_csv("profiles_marlesson_jan20.csv")
print("\nNumber of Columns in users: " + str(len(users.columns)))
print("Number of Observations in users: " + str(len(users)))


Number of Columns in anime_data: 12
Number of Observations in anime_data: 19311

Number of Columns in users: 5
Number of Observations in users: 81727


# Cleaning Data: anime_data

In [3]:
anime_data = anime_data.dropna()
anime_data = anime_data.drop(columns = ['img_url', 'link'])
anime_data = anime_data.rename(columns = {'score':'rating'}) 
print("Number of Columns After Cleaning Data: " + str(len(anime_data.columns)))
print("Number of Observations After Cleaning Data: " + str(len(anime_data)))
anime_data.head()

Number of Columns After Cleaning Data: 10
Number of Observations After Cleaning Data: 15187


Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,rating
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83


### Creating "aired_year" column

In [4]:
# Adding a new column to represent the aired year 
# Cases: month, year ; Not available ; just 1 year ; 20xx to 20xx 
aired_years = [] 
for dates in anime_data['aired']: 
    if dates == "Not available":
        aired_years.append(np.nan)
    elif len(dates) > 4 and dates[0].isalpha():
        start = dates.index(",") + 2 
        year = dates[start:start+4] #gets the first year  
        aired_years.append(int(year))
    else:
        aired_years.append(dates[0:3])
anime_data['aired_year'] = aired_years
anime_data = anime_data.dropna()
print("Number of observations after dropping anime without aired_year: " + str(len(anime_data)))
print("Below are the first 5 rows of the dataset with the new column 'aired_year' ")
anime_data.head()

Number of observations after dropping anime without aired_year: 15072
Below are the first 5 rows of the dataset with the new column 'aired_year' 


Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,rating,aired_year
0,28891,Haikyuu!! Second Season,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun...","Oct 4, 2015 to Mar 27, 2016",25.0,489888,141,25.0,8.82,2015
1,23273,Shigatsu wa Kimi no Uso,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun...","Oct 10, 2014 to Mar 20, 2015",22.0,995473,28,24.0,8.83,2014
2,34599,Made in Abyss,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F...","Jul 7, 2017 to Sep 29, 2017",13.0,581663,98,23.0,8.83,2017
3,5114,Fullmetal Alchemist: Brotherhood,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ...","Apr 5, 2009 to Jul 4, 2010",64.0,1615084,4,1.0,9.23,2009
4,31758,Kizumonogatari III: Reiketsu-hen,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']","Jan 6, 2017",1.0,214621,502,22.0,8.83,2017


# Cleaning Data: users

In [5]:
users = users.dropna()
users = users.rename(columns = {'favorites_anime':'favorites'}) 
print("Number of Columns After Cleaning Data: " + str(len(users.columns)))
print("Number of Observations After Cleaning Data: " + str(len(users)))

Number of Columns After Cleaning Data: 5
Number of Observations After Cleaning Data: 43636


In [6]:
birth_years = []
for birthday in users['birthday']: #str 
    if "," in birthday:
        comma_index = birthday.index(',') 
        year = birthday[comma_index + 2:]
        birth_years.append(year)
    else: 
        birth_years.append(np.nan)
users['birth_year']  = birth_years
#dropping users without a birth year
users = users.dropna()
print("Number of users after dropping users without birth year: " + str(len(users)))
users = users.drop(columns = ['birthday'])
users.head()

Number of users after dropping users without birth year: 34550


Unnamed: 0,profile,gender,favorites,link,birth_year
0,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",https://myanimelist.net/profile/DesolatePsyche,1994
1,baekbeans,Female,"['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,2000
4,aManOfCulture99,Male,"['4181', '7791', '9617', '5680', '2167', '4382...",https://myanimelist.net/profile/aManOfCulture99,1999
7,NIGGER_BONER,Male,"['11061', '30', '6594', '28701', '10087', '674...",https://myanimelist.net/profile/NIGGER_BONER,1985
8,jchang,Male,"['846', '2904', '5114', '2924', '72']",https://myanimelist.net/profile/jchang,1992


# Summary of Data Sets After Cleaning

## anime_data

In [7]:
print("Number of columns: " + str(len(anime_data.columns)))
print("Column names:") 
for column in anime_data.columns: 
    print(column)
print("Number of observations (anime): " + str(len(anime_data)))

Number of columns: 11
Column names:
uid
title
synopsis
genre
aired
episodes
members
popularity
ranked
rating
aired_year
Number of observations (anime): 15072


## users

In [8]:
print("Number of columns: " + str(len(users.columns)))
print("Column names:")
for column in users.columns:
    print(column)
print("Number of observations (users): " + str(len(users)))

Number of columns: 5
Column names:
profile
gender
favorites
link
birth_year
Number of observations (users): 34550


# Merging user and user reviews

In [13]:
reviews = pd.read_csv("reviews.csv")
user_reviews = pd.merge(users, reviews, on = "profile")
user_reviews = user_reviews.drop(columns = ['link_x', 'link_y'])
user_reviews.head()

Unnamed: 0,profile,gender,favorites,birth_year,uid,anime_uid,text,score,scores
0,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",1994,255938,34096,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '8', 'Animation': '8..."
1,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",1994,255793,12403,\n \n \n \n ...,6,"{'Overall': '6', 'Story': '6', 'Animation': '9..."
2,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",1994,298207,35073,\n \n \n \n ...,6,"{'Overall': '6', 'Story': '6', 'Animation': '8..."
3,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",1994,255795,23225,\n \n \n \n ...,7,"{'Overall': '7', 'Story': '8', 'Animation': '9..."
4,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2...",1994,291256,2035,\n \n \n \n ...,8,"{'Overall': '8', 'Story': '9', 'Animation': '7..."


# Data Analysis: Gender Preferences

### Creating Separate datasets for male and female users

In [14]:
male_users = users[users.gender == "Male"]
female_users = users[users.gender == "Female"]
#favorites are stored as strings: '['10', '3', '4']'

In [15]:
female_users

Unnamed: 0,profile,gender,favorites,link,birth_year
1,baekbeans,Female,"['11061', '31964', '853', '20583', '918', '925...",https://myanimelist.net/profile/baekbeans,2000
10,angelsreview,Female,"['534', '71', '7724', '861', '5060', '853']",https://myanimelist.net/profile/angelsreview,1989
27,GinKagu,Female,"['918', '2904', '4181', '9989']",https://myanimelist.net/profile/GinKagu,2012
35,greatyu,Female,"['20', '21', '918', '11061', '934', '13125']",https://myanimelist.net/profile/greatyu,1993
37,Slushpuppy282,Female,"['407', '3588', '177', '1604', '269']",https://myanimelist.net/profile/Slushpuppy282,1990
...,...,...,...,...,...
81715,sango868,Female,"['269', '355', '5114', '4898', '934', '32995']",https://myanimelist.net/profile/sango868,1996
81719,samdineen20,Female,"['120', '150', '3457', '3655', '4898']",https://myanimelist.net/profile/samdineen20,1992
81721,anjel,Female,[],https://myanimelist.net/profile/anjel,1988
81722,lovelessxd,Female,"['853', '5114']",https://myanimelist.net/profile/lovelessxd,1992


In [109]:
combined = []
for fav_list in female_users['favorites']:
    if '[]' not in fav_list: #ignoring empty lists 
        fav_list = fav_list.strip('[]') 
        fav_list = fav_list.split("'")
        for anime_id in fav_list:
            if anime_id.isdigit():
                combined.append(int(anime_id))
#converting to pandas series for easier processing 
combined = np.array(combined) 
female_favs = pd.Series(combined) 
print("First five observations for female favorites:")
female_favs.head() #List of all the favorite animes, contains repeats 

First five observations for female favorites:


0    11061
1    31964
2      853
3    20583
4      918
dtype: int64

In [110]:
combined = []
for fav_list in male_users['favorites']:
    if '[]' not in fav_list: #ignoring empty lists 
        fav_list = fav_list.strip('[]') 
        fav_list = fav_list.split("'")
        for anime_id in fav_list:
            if anime_id.isdigit():
                combined.append(int(anime_id))
#converting to pandas series for easier processing 
combined = np.array(combined) 
male_favs = pd.Series(combined) 
print("First five observations for male favorites:")
male_favs.head() #List of all the favorite animes, contains repeats 

First five observations for male favorites:


0    33352
1    25013
2     5530
3    33674
4     1482
dtype: int64

### Counting Frequency of Favorites

In [124]:
ff_count = female_favs.value_counts()
print("Anime ID of Top 10 female favorites: ")
print(ff_count[0:9])

Anime ID of Top 10 female favorites: 
5114     1274
1535      903
853       772
11061     712
16498     659
9756      556
9253      552
21        549
1575      501
dtype: int64


In [125]:
mf_count = male_favs.value_counts()
print("Anime ID of Top 10 male favorites: ")
print(mf_count[0:9])

Anime ID of Top 10 male favorites: 
9253     3723
5114     3417
4181     2311
1535     2272
2001     2263
11061    2187
1575     2009
1        1867
30       1675
dtype: int64


### Genre of Favorites

Problem: There seems to be duplicates in the dataset for anime ID

In [132]:
anime_data[anime_data.uid == 1575].drop_duplicates() #change the uid number to get the anime data

Unnamed: 0,uid,title,synopsis,genre,aired,episodes,members,popularity,ranked,rating,aired_year
759,1575,Code Geass: Hangyaku no Lelouch,"In the year 2010, the Holy Empire of Britannia...","['Action', 'Military', 'Sci-Fi', 'Super Power'...","Oct 6, 2006 to Jul 29, 2007",25.0,1231546,11,31.0,8.76,2006
