# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# First exploration

In [3]:
# Load the full user/anime interaction table (large: roughly 109M rows, 5 columns).
users_anime_df = pd.read_csv('../data/raw_data/animelist.csv')


In [4]:
# Load the per-title anime table.
anime_df = pd.read_csv('../data/raw_data/anime.csv')
# Other raw files in the same folder, currently not loaded
# (the first path previously read './.data/...', presumably a typo):
#watching_status_df = pd.read_csv('../data/raw_data/watching_status.csv')
#rating_complete_df = pd.read_csv('../data/raw_data/rating_complete.csv')
#anime_df_relevant_PG = pd.read_csv('../data/raw_data/anime_df_relevant_PG.csv')

In [5]:
# (rows, columns) of the anime table.
anime_df.shape

(17562, 35)

In [6]:
# Column labels available in the anime table.
anime_df.columns


Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [7]:
# (rows, columns) of the user/anime interaction table.
users_anime_df.shape

(109224747, 5)

In [8]:
# Preview the first rows of the interaction table.
users_anime_df.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4
3,0,4898,0,1,1
4,0,21,10,1,0


# Prepare full data set for visualization

In [9]:
# Use the same key name ('anime_id') as the interaction table so the two can be merged on it.
anime_df = anime_df.rename(columns={'MAL_ID': 'anime_id'})

In [10]:
# Quick look at two interaction rows.
users_anime_df.head(2)

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4


In [11]:
# Work on a reproducible 5% random subset of the ~100M interaction rows
# (fixed random_state so the same rows are drawn on every run).
users_anime_sample = users_anime_df.sample(random_state=42, frac=0.05)

In [12]:
# (rows, columns) of the sampled interaction table.
users_anime_sample.shape

(5461237, 5)

In [13]:
# Merge anime_df with users_anime_sample: inner join on 'anime_id', so each
# sampled interaction row is paired with the columns of its anime.
full_df_sample = pd.merge(anime_df, users_anime_sample, how='inner', on='anime_id')


In [14]:
# (rows, columns) of the merged sample.
full_df_sample.shape

(5461237, 39)

# Visualization for audience

## N Most popular anime (ranking and completed and members)  bar chart on y axis

### Most popular anime by user score

In [15]:
# Column labels of the merged sample.
full_df_sample.columns

Index(['anime_id', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1', 'user_id',
       'rating', 'watching_status', 'watched_episodes'],
      dtype='object')

In [16]:
# Encode unranked titles ('Unknown') as -1 so the column can later be cast to float.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# the Series returned by attribute access is chained assignment, which warns in
# pandas 2.x and leaves the frame untouched when copy-on-write is enabled.
full_df_sample['Ranked'] = full_df_sample['Ranked'].replace(['Unknown'], -1)

In [17]:
# Convert every 'Ranked' value to a float.
full_df_sample['Ranked'] = full_df_sample['Ranked'].astype(float)

In [18]:
# Inspect the 'Ranked' column.
full_df_sample.Ranked

0          28.0
1          28.0
2          28.0
3          28.0
4          28.0
           ... 
5461232    -1.0
5461233    -1.0
5461234    -1.0
5461235    -1.0
5461236    -1.0
Name: Ranked, Length: 5461237, dtype: float64

In [19]:
# Order the merged sample by rank, smallest rank value first.
ranked = full_df_sample.sort_values('Ranked')

In [None]:
# Keep only the two columns the chart needs.
ranked = ranked.loc[:, ['Ranked', 'Name']]

In [None]:
# Discard rows whose rank is the -1 placeholder.
# Filtering the frame directly replaces the previous attribute assignment, which
# relied on index alignment to turn those rows into NaN (to be dropped later) and
# could trigger SettingWithCopyWarning on the derived frame.
ranked = ranked[ranked['Ranked'] != -1]

In [None]:
# Remove rows that contain a missing value.
ranked = ranked.dropna()

In [None]:
# Collapse repeated (Ranked, Name) rows to a single row each.
ranked = ranked.drop_duplicates()

In [None]:
# Horizontal bar chart of the first 20 rows of `ranked`: one bar per title,
# bar length = rank value.
fig, ax = plt.subplots(figsize=(12, 15))
sns.barplot(x='Ranked', y='Name', data=ranked.head(20), ax=ax)

### Most popular fully watched anime


In [None]:
# Order the merged sample by the 'Completed' column, largest value first.
completed = full_df_sample.sort_values('Completed', ascending=False)

In [None]:
# Keep only the completion count and the title.
completed = completed.loc[:, ['Completed', 'Name']]

In [None]:
# Remove rows that contain a missing value.
completed = completed.dropna()

In [None]:
# Collapse repeated (Completed, Name) rows to a single row each.
completed = completed.drop_duplicates()

### Most popular anime by number of community members

In [None]:
# Build the (Members, Name) table used by the chart: sort by member count
# (largest first), keep the two columns, then drop incomplete and repeated rows.
members = (
    full_df_sample
    .sort_values('Members', ascending=False)
    .loc[:, ['Members', 'Name']]
    .dropna()
    .drop_duplicates()
)

In [None]:
# Horizontal bar chart of the first 20 rows of `members`.
fig, ax = plt.subplots(figsize=(9, 12))
sns.set(font_scale=1.1)
p2 = sns.barplot(x='Members', y='Name', data=members.head(20), ax=ax)
p2.set_xlabel("Community members")

## N most popular anime (user count), bar chart on y axis (already made by Tanguy)

In [None]:
# This chart can be found in Dimitri's notebook.

## Most popular anime genres

In [None]:
# Column labels of the merged sample (reference for the genre extraction below).
full_df_sample.columns

In [None]:
# One-hot encode the comma-separated 'Genres' string of every row: the result has
# one 0/1 indicator column per genre and the same index as full_df_sample.
# This yields the same frame as the earlier "drop every column, then concat the
# dummies" construction, without passing `axis` to pd.concat positionally
# (pandas >= 2.0 only accepts it as a keyword and raises TypeError otherwise).
genres = full_df_sample['Genres'].str.get_dummies(sep=", ")

In [None]:
# Preview two rows of the genre indicator table.
genres.head(2)

In [None]:
# Total of each genre column, ordered from the largest total to the smallest.
popular_genres = genres.sum().sort_values(ascending=False)

In [None]:
# Turn the Series of per-genre totals into a single-column DataFrame.
popular_genres = popular_genres.to_frame()

In [None]:
# Move the genre names out of the index into a regular column (labelled 'index').
popular_genres = popular_genres.reset_index()

In [None]:
# Genre names as a plain list, in the current row order of popular_genres.
popular_genres['index'].to_list()

In [None]:
# Map each genre name to its total so every word is sized by how often the genre
# occurs. Passing str(list_of_names) to WordCloud.generate() discarded the totals:
# the text was re-tokenised, each genre appeared once, and multi-word genre names
# were broken into separate words.
# After reset_index() the names are in column 'index' and the totals are the only
# remaining column, hence iloc[:, 0].
genre_frequencies = popular_genres.set_index('index').iloc[:, 0].to_dict()

# Create the word cloud image from the explicit genre -> total mapping:
wordcloud = WordCloud().generate_from_frequencies(genre_frequencies)

# Display the generated image:
plt.subplots(figsize=(16, 14))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Most popular Studios 

In [None]:
# Number of rows per distinct 'Studios' value, as a two-column frame.
studios = full_df_sample['Studios'].value_counts().to_frame().reset_index()

In [None]:
# Label the two value_counts() columns by position: (studio name, row count).
# Renaming by the old labels is pandas-version dependent: pandas < 2 produces
# ['index', 'Studios'], while pandas >= 2 already produces ['Studios', 'count'],
# where the mapping {'Studios': 'count'} would create two columns named 'count'.
studios = studios.set_axis(['Studios', 'count'], axis=1)

In [None]:
# Show the row(s) whose studio is the 'Unknown' placeholder.
studios.query("Studios == 'Unknown'")

In [None]:
# Remove the 'Unknown' placeholder studio by value.
# The previous hard-coded row label (7) is only right for one particular sample:
# with different data it deletes a real studio, and re-running the cell raises
# KeyError. Filtering by value is correct for any data and safe to re-run.
studios = studios[studios['Studios'] != 'Unknown']

In [None]:
# First 50 and first 10 rows of the studio table (it is ordered by count).
top_50_studios = studios.head(50)
top_10_studios = studios.head(10)

In [None]:
# Random selection of 8 studios, ordered by count (largest first).
# random_state is fixed so the same studios are drawn on every run, matching the
# seeded sampling used for users_anime_sample.
sample_studios = studios.sample(8, random_state=42).sort_values(by='count', ascending=False)

In [None]:
# Pie chart of the ten most frequent studios: wedge size = count, label = studio.
# (sample_studios or the full studios table have the same two columns and can be
# plotted the same way.)
fig = plt.figure(figsize=(14, 9))
plt.pie(x=top_10_studios['count'], labels=top_10_studios['Studios'])

# Render the figure.
plt.show()

## Sources of anime: movie, OTA, TV - pie

In [None]:
# Number of rows per distinct 'Source' value, as a two-column frame.
source = full_df_sample['Source'].value_counts().to_frame().reset_index()

In [None]:
# Label the two value_counts() columns by position: (source name, row count).
# Renaming by the old labels is pandas-version dependent: pandas < 2 produces
# ['index', 'Source'], while pandas >= 2 already produces ['Source', 'count'],
# where the mapping {'Source': 'count'} would create two columns named 'count'.
source = source.set_axis(['Source', 'count'], axis=1)

In [None]:
# Show the row(s) whose source is the 'Unknown' placeholder.
source.query("Source == 'Unknown'")

In [None]:
# Row label of the first 'Unknown' entry in the source table.
source.index[source['Source'].eq('Unknown')].tolist()[0]

In [None]:
# Remove the 'Unknown' placeholder source by value.
# The previous lookup-then-drop took element [0] of the matching labels, which
# raises IndexError when there is no 'Unknown' row (including when this cell is
# run a second time). A boolean filter handles zero, one or many matches.
source = source[source['Source'] != 'Unknown']

In [None]:
# First 10 rows of the source table (it is ordered by count).
top_10_sources = source.head(10)

In [None]:
# Display the table that feeds the pie chart.
top_10_sources

In [None]:
# Pie chart of the ten most frequent sources: wedge size = count, label = source.
fig = plt.figure(figsize=(14, 9))
plt.pie(x=top_10_sources['count'], labels=top_10_sources['Source'])

# Render the figure.
plt.show()

## Community per anime 

## Count of anime according to PG rating

## How many anime each Studio produces

## What are the most popular anime according to watching or score or rank

## How many anime made per year 

## How many anime watched through years

## Anime duration

## Popularity  with duration (rank(?) vs duration?)

## Dropped anime 

## Top anime by number of episodes