In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from PIL import Image
from wordcloud import WordCloud

In [None]:
# Read the semicolon-delimited anime table and preview five random rows.
anime = pd.read_csv(
    "./Datasets/anime-info-main-clean.csv",
    sep=';',
)
anime.sample(n=5)

## Data Analysis

### Famous Genre of Anime

In [None]:
# Rows without a genre get the most frequent genre string in the column.
anime['Genres'] = anime['Genres'].fillna(anime['Genres'].mode().iloc[0])

# Flatten the comma-separated genre strings into a single list of genre names
# (whitespace trimmed, empty fragments skipped). `new_list` is counted again
# in the "Top 3 Famous Genres" cell.
new_list = [
    genre
    for genres in anime['Genres']
    for genre in map(str.strip, genres.split(','))
    if genre
]

# Build the cloud from per-genre counts rather than from one space-joined
# string: WordCloud.generate() tokenizes on words, which would break
# multi-word genres such as "Slice of Life" into separate words.
genre_counts = Counter(new_list)
wordcloud = WordCloud(background_color="white").generate_from_frequencies(genre_counts)
plt.figure(figsize=(8,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Top 3 Famous Genres

In [None]:
# Tally every genre occurrence and show the three most frequent ones.
c = Counter(new_list)
c.most_common(n=3)

Comedy, Slice of Life and Action are top 3 Genres 

### Highest Scored Anime

In [None]:
# Ten rows with the highest 'Score', keeping only title and score.
top10_animescore = (
    anime[['Title', 'Score']]
    .sort_values('Score', ascending=False)
    .iloc[:10]
)
top10_animescore

In [None]:
# Bar chart of the ten highest scores; labels are set on the returned axes.
ax = top10_animescore.plot.bar(x="Title", y="Score", rot=70, title="Top Anime based on Score")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('Score', fontsize=20)
plt.show(block=True)

## Top ranked Anime

In [None]:
# Ten best-ranked titles. A rank is better the smaller it is (rank 1 = best),
# so sort ascending; sorting descending would list the worst-ranked titles.
# sort_values places missing ranks last, so they never reach the top ten
# unless fewer than ten titles are ranked.
top10_animerank = (
    anime[['Title', 'Ranked']]
    .sort_values(by='Ranked', ascending=True)
    .head(10)
)
top10_animerank

In [None]:
# Bar chart of the 'Ranked' column for the selected titles. The title now
# names the plotted metric (it previously said "Score").
top10_animerank.plot.bar(x="Title", y="Ranked", rot=70, title="Top Anime based on Rank");
plt.xlabel('Anime',fontsize = 20)
plt.ylabel('Ranked', fontsize = 20)
plt.show(block=True)

### Top Anime Based on Members

In [None]:
# Ten rows with the largest 'Members' value, keeping only title and members.
top10_animemembers = (
    anime[['Title', 'Members']]
    .sort_values('Members', ascending=False)
    .iloc[:10]
)
top10_animemembers

In [None]:
# Display the members table again (same frame as shown by the previous cell).
top10_animemembers

In [None]:
# Bar chart of member counts for the ten largest communities.
ax = top10_animemembers.plot.bar(x="Title", y="Members", rot=70, title="Top 10 Anime based on members")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('Community Size', fontsize=20)
plt.show(block=True)

### Top Anime based on Number of Episodes

In [None]:
# Five rows with the most episodes (the variable name says "top10", but only
# five rows are kept).
top10_animeepisodes = (
    anime[['Title', 'Episodes']]
    .sort_values('Episodes', ascending=False)
    .iloc[:5]
)
top10_animeepisodes

In [None]:
# Bar chart of episode counts for the five longest-running titles.
ax = top10_animeepisodes.plot.bar(x="Title", y="Episodes", rot=70, title="Top 5 Anime with maximum episodes ")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('No. of Episodes', fontsize=20)
plt.show(block=True);

In [None]:
# Bar chart of episode counts for the five longest-running titles.
# NOTE(review): this cell draws the same chart as the cell before it —
# consider removing one of the two.
ax = top10_animeepisodes.plot.bar(x="Title", y="Episodes", rot=70, title="Top 5 Anime with maximum episodes ")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('No. of Episodes', fontsize=20)
plt.show(block=True);

## Anime content Rating

In [None]:
# Number of titles per content rating, as a bar chart.
anime['Rating'].value_counts().plot(kind='bar')

### Anime Sources

In [None]:
# Number of titles per source material, as a bar chart.
anime['Source'].value_counts().plot(kind='bar')

## Type of Anime Content Available

In [None]:
# 20-bin histogram of the 'Type' column.
anime.Type.hist(bins=20)

In [None]:
# Share of each content type as a pie chart; the counts are computed once and
# reused for both the wedge sizes and the labels.
type_counts = anime['Type'].value_counts()
fig = plt.figure(figsize=(10, 7))
plt.pie(type_counts.values, labels=type_counts.index)

# Render the figure
plt.show()

### Anime Most Often Marked as Favorite by Users

In [None]:
# Ten rows with the largest 'Favorites' value, keeping only title and favorites.
top10_animefavorites = (
    anime[['Title', 'Favorites']]
    .sort_values('Favorites', ascending=False)
    .iloc[:10]
)
top10_animefavorites

In [None]:
# Bar chart of favorite counts for the ten most-favorited titles.
ax = top10_animefavorites.plot.bar(x="Title", y="Favorites", rot=70, title="Top 10 Anime based on favorites")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('Loving Anime by Community Size', fontsize=20)
plt.show(block=True);

### Popular Anime

In [None]:
# Ten rows with the largest 'Popularity' values, drawn as a bar chart.
# NOTE(review): if 'Popularity' is a rank (1 = most popular), a descending
# sort selects the LEAST popular titles — confirm against the dataset.
top_popular_anime = (
    anime[['Title', 'Popularity']]
    .sort_values('Popularity', ascending=False)
    .iloc[:10]
)
ax = top_popular_anime.plot.bar(x="Title", y="Popularity", rot=70, title="Top 10 popular Anime ")
ax.set_xlabel('Anime', fontsize=20)
ax.set_ylabel('Popularity', fontsize=20)
plt.show(block=True);

### The popularity feature is irrelevant as it is not giving much information about the Anime. We can drop this feature also.

In [None]:
# Remove the 'Popularity' column from the working frame.
anime = anime.drop(columns=['Popularity'])

## Data Distribution

In [None]:
# One figure per selected column, each holding a 70-bin histogram.
check_behavior = ["Score", "Episodes", "Members", "Favorites", "Status"]
for feature in check_behavior:
    plt.figure(figsize=(15, 7))
    # Left cell of a 1x2 grid; the right cell is left empty.
    plt.subplot(1, 2, 1)
    anime[feature].hist(bins=70)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel("count")
    plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric (and boolean) columns.
# The frame also holds text columns such as 'Title'; DataFrame.corr() raises
# on those in pandas >= 2.0, so restrict the input explicitly. This works on
# older pandas versions as well.
corr = anime.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Heatmap of the Pearson correlation matrix. Only numeric/boolean columns are
# correlated: DataFrame.corr() raises on text columns in pandas >= 2.0.
# `cor` is reused by the next cell to inspect correlation with 'Score'.
plt.figure(figsize=(8,8))
cor = anime.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Absolute correlation of every column in `cor` with 'Score'.
cor_target = cor["Score"].abs()
# Keep the columns whose absolute correlation exceeds 0.05.
relevant_features = cor_target.loc[cor_target > 0.05]
relevant_features

In [None]:
# One boxplot per continuous column to inspect spread and outliers.
# boxplot() only reads the frame, so it is called on `anime` directly instead
# of copying the whole DataFrame on every loop iteration.
continuous_features=['Episodes','Members','Favorites','Score','Ranked']
for feature in continuous_features:
    anime.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

### Status of Anime content

In [None]:
# Number of titles per airing status, as a bar chart.
anime['Status'].value_counts().plot(kind='bar')

## Score vs Rank

In [None]:
# Column aliases used by the scatter plots in the following cells.
rank, score, favorite = anime['Ranked'], anime['Score'], anime['Favorites']

In [None]:
# Scatter of score (x) against rank (y). x and y are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally and raises TypeError.
plt.figure(7)
plt.title("Score Vs Rank")
sns.scatterplot(x=score, y=rank);

### Score vs Favorite

In [None]:
# Scatter of score (x) against favorites (y). x and y are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally and raises TypeError.
plt.figure(7)
plt.title("Score Vs Favorite")
sns.scatterplot(x=score, y=favorite);

# Analysis of User Reviews

In [None]:
# Read the semicolon-delimited reviews table and preview five random rows.
reviews = pd.read_csv(
    './Datasets/anime-reviews-main.csv',
    sep=';',
)
reviews.sample(n=5)

In [None]:
# Discard the identifier columns and the fifth review column, then preview.
unused_columns = ['Title', 'anime_id', 'review_5']
reviews = reviews.drop(columns=unused_columns)
reviews.sample(n=5)

In [None]:
# Merge every remaining column into a single comma-separated 'Reviews' string
# per row, skipping missing entries.
reviews['Reviews'] = reviews[reviews.columns[0:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

reviews=reviews.drop(columns=['review_1','review_2','review_3','review_4'])
# Rows with no review text produce an empty string; turn those into NaN so
# dropna() removes them. The result is assigned back instead of using
# `inplace=True` on the column selection, which is chained in-place mutation:
# it warns in pandas 2.x and leaves the frame unchanged under copy-on-write.
reviews['Reviews'] = reviews['Reviews'].replace('', np.nan)
reviews=reviews.dropna()
reviews.sample(5)

In [None]:
# Fetch the NLTK resources, in the same order as before.
for resource in ('vader_lexicon', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

In [None]:
# Add sentiment analysis columns.
# SentimentIntensityAnalyzer is imported explicitly at the top of the
# notebook: `import nltk` alone is not guaranteed to load the
# `nltk.sentiment.vader` submodule.
sia = SentimentIntensityAnalyzer()

# Score every review once and fan the result out into four columns, instead
# of running the analyzer four separate times per review.
# `columns=` keeps the frame well-formed even when there are no reviews.
polarity = pd.DataFrame(
    [sia.polarity_scores(text) for text in reviews['Reviews']],
    index=reviews.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
reviews['positive'] = polarity['pos']
reviews['neutral'] = polarity['neu']
reviews['negative'] = polarity['neg']
reviews['compound'] = polarity['compound']

In [None]:
# Preview five random rows of the reviews frame.
reviews.sample(n=5)

## Top 10 positive reviews

In [None]:
# Ten rows with the highest 'positive' value.
reviews.sort_values('positive', ascending=False).iloc[:10]

## Top 10 negative reviews

In [None]:
# Ten rows with the highest 'negative' value.
reviews.sort_values('negative', ascending=False).iloc[:10]

In [None]:
# wordcloud function
def show_wordcloud(data, title = None):
    """Draw a word cloud of the text in `data`.

    Parameters
    ----------
    data : str or iterable
        A single string, or an iterable of items (for example a pandas
        Series of reviews). Items are converted with ``str`` and joined with
        spaces. Any other object falls back to ``str(data)``.
    title : str, optional
        Figure title; nothing is drawn when omitted or empty.
    """
    # str() on a pandas Series yields its truncated repr (ellipses, index
    # labels, "Name:", "dtype:"), not the underlying text, so the items are
    # joined explicitly.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the previous str() behaviour.
            text = str(data)

    wordcloud = WordCloud(
        background_color = 'black',
        max_words = 200,
        max_font_size = 40,
        scale = 3,
        random_state = 42  # fixed seed so the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize = (10, 10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud,interpolation="bilinear" )
    plt.show()
    
# Draw the word cloud for the 'Reviews' column (no title is passed).
show_wordcloud(reviews["Reviews"])

Some of the positive words that appear in the reviews are:

1. Good
2. beautiful
3. first
4. pretty
5. Quick
6. better