# Dataset Summary

# Shape: 
The original dataset contains 6,668 rows and 33 columns.
The new set contains 23,750 rows and 34 columns.

# Columns:
Includes details like anime_id, title, title_english, title_japanese, genre, type, episodes, rating, score, studio, aired, and more.
Features information about anime attributes (e.g., titles, genres, producers) and performance metrics (e.g., score, popularity, favorites).

# Data Types:
23 columns are of type object (e.g., textual data like titles, genres, and studios).
6 columns are int64 (e.g., numerical data like members, favorites).
4 columns are float64 (e.g., scores, duration in minutes).
1 column is bool (e.g., airing status).

# Missing Values:
Columns with significant missing values:

background (5,855 missing values),

premiered (3,702 missing values),

licensor (3,881 missing values).

Other columns like title_english, title_synonyms, producer, and studio also contain some missing data.

This dataset is rich in information and provides numerous opportunities for analysis. With attributes ranging from production details to user ratings, it can be utilized for exploring trends in the anime industry, identifying audience preferences, and uncovering factors influencing anime success.


# Set up the notebook and connect the dataset

The command “pip install pandas matplotlib seaborn openpyxl” is used in Python to install several libraries for data analysis and visualization, along with Excel file handling. This command equips Python with tools for handling a wide range of data processing, visualization, and file management tasks. Here’s what each library does:
1) pandas: Used for data manipulation and analysis. 2) matplotlib: Helps in creating charts and graphs. 3) seaborn: Enhances matplotlib’s capabilities with more visually appealing and informative statistical graphics. 4) openpyxl: Allows Python to read and write Excel files.

In [1]:
pip install pandas matplotlib seaborn openpyxl 

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Render matplotlib figures inline, directly below the cell that creates them.
# This keeps every plot next to the code that produced it, which gives a more integrated workflow for data analysis.
%matplotlib inline

In [None]:
# Importing the necessary libraries for data handling and visualization
import pandas as pd              # Pandas library for data manipulation
import matplotlib.pyplot as plt  # Matplotlib's pyplot for creating static plots
import seaborn as sns            # Seaborn library for making statistical graphics

# URL for the raw CSV data
# This URL points to the raw (un-rendered) CSV file hosted on GitHub
url = "https://raw.githubusercontent.com/oneaboveall168/DH-Anime-list/refs/heads/main/animelistnew.csv"

# Load the dataset from the provided URL into a DataFrame
# pd.read_csv() accepts a URL directly and returns the parsed table as a pandas DataFrame
df = pd.read_csv(url)

# 1. A look inside the data

In [None]:
# Get the dimensions of the DataFrame
# 'df.shape' is a (number_of_rows, number_of_columns) tuple; as the last expression of the cell it is displayed
df.shape

☝️The original dataset contains information on 6,668 anime, with 33 different features recorded for each anime. Compare these figures with the `df.shape` output above for the file that is actually loaded.

In [None]:
# Get the number of records (rows) in the DataFrame; this is the first element of df.shape
len(df)

In [None]:
# What do the first few entries in the dataset look like?
# Display the first 5 rows of the DataFrame to get a quick overview of the data
df.head(5)

In [None]:
# What do the last few entries in the dataset look like?
# Display the last 5 rows of the DataFrame
df.tail(5)

In [None]:
# Display only one column: selecting by column name returns that column as a pandas Series
df['type']

# 2. Describe the data with the .describe() function

In [None]:
# Use Pandas to summarise the "type" column
# (for a text column, .describe() reports count, unique, top and freq)
df['type'].describe()

In [None]:
# Use Pandas to summarise the "source" column
# (for a text column, .describe() reports count, unique, top and freq)
df['source'].describe()

In [None]:
# Use Pandas to summarise the "rating" column
# (for a text column, .describe() reports count, unique, top and freq)
df['rating'].describe()

In [None]:
# Show the data type pandas assigned to each column (one entry per column)
df.dtypes

# Descriptive statistics

In [None]:
# 1. Top 10 Popular Anime by Favorites
# Keep the ten rows with the largest 'favorites' value and only the two columns that are plotted
popular_anime = df.nlargest(10, 'favorites')[['title', 'favorites']]

# Horizontal bars: one bar per title, bar length = number of favorites
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=popular_anime, x='favorites', y='title', palette='viridis', ax=ax)
ax.set_title('Top 10 Anime by Favorites')
ax.set_xlabel('Favorites')
ax.set_ylabel('Anime Title')
plt.show()

In [None]:
# 2. Score Distribution: Analyze how scores are distributed across the dataset
# Histogram with 20 bins plus a kernel-density curve drawn on top
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['score'], bins=20, kde=True, color='blue', ax=ax)
ax.set_title('Score Distribution')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
#3. Genres Word Cloud: Visualize the frequency of genres in an engaging way
# Import the necessary module
from wordcloud import WordCloud

# Count every genre label as one unit. Passing the labels to WordCloud.generate()
# as free text would make the library re-tokenize them, so a label made of several
# words would be broken up and sized by its fragments instead of by the genre itself.
genre_frequencies = (
    df['genre']
    .dropna()            # rows without a genre contribute nothing
    .str.split(', ')     # one list of labels per anime
    .explode()           # one row per (anime, label)
    .value_counts()
    .to_dict()
)

# Word size is driven directly by how many anime carry each genre label
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_frequencies)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Genres')
plt.show()


In [None]:
# 4. Anime Over Time: Show the number of anime titles released over the years
# Number of rows per value of 'aired_from_year'
yearly_counts = df.groupby('aired_from_year').size()

# One bar per year
fig, ax = plt.subplots(figsize=(10, 6))
yearly_counts.plot.bar(color='teal', ax=ax)
ax.set_title('Anime Released Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Anime')
plt.show()


In [None]:
# 5. Top 10 Producers' Contribution
# value_counts() sorts by frequency, so the first ten entries are the most frequent 'producer' values
producer_counts = df['producer'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
producer_counts.plot.bar(color='orange', ax=ax)
ax.set_title('Top 10 Producers by Anime Count')
ax.set_xlabel('Producer')
ax.set_ylabel('Number of Anime')
# Tilt the tick labels so long names stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# 6. Top 10 Studios by Popularity
# Average 'popularity' per studio; the ten smallest averages are kept
mean_rank_by_studio = df.groupby('studio')['popularity'].mean()
studio_popularity = mean_rank_by_studio.nsmallest(10)

fig, ax = plt.subplots(figsize=(10, 6))
studio_popularity.plot.barh(color='purple', ax=ax)
ax.set_title('Top 10 Studios by Popularity')
ax.set_xlabel('Average Popularity Rank')
ax.set_ylabel('Studio')
# Flip the x-axis so that smaller values are drawn further to the right
ax.invert_xaxis()
plt.show()


In [None]:
# 7. Episodes vs. Score
# One semi-transparent point per anime: episode count on x, score on y
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='episodes', y='score', alpha=0.6, color='green', ax=ax)
ax.set_title('Episodes vs. Score')
ax.set_xlabel('Episodes')
ax.set_ylabel('Score')
plt.show()


In [None]:
# 8. Rating Analysis
# How many anime carry each 'rating' value
rating_counts = df['rating'].value_counts()

# Pie chart with the percentage printed on every slice
fig, ax = plt.subplots(figsize=(10, 6))
rating_counts.plot.pie(autopct='%1.1f%%', startangle=100, colors=sns.color_palette('pastel'), ax=ax)
ax.set_title('Rating Distribution')
ax.set_ylabel('')  # hide the automatic y-label that pandas adds to pie charts
plt.show()


In [None]:
# 9. Airing Status Distribution
# Number of anime per 'status' value
status_counts = df['status'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=status_counts.index, y=status_counts.values, palette='magma', ax=ax)
ax.set_title('Airing Status Distribution')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
plt.show()

In [None]:
# 10. Source Popularity:  Compare how different sources (e.g., manga, novel) influence popularity.
# One box per 'source' value, summarising the spread of 'popularity' within it
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='source', y='popularity', palette='cool', ax=ax)
ax.set_title('Source vs. Popularity')
ax.set_xlabel('Source')
ax.set_ylabel('Popularity')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
#11. Average Score by Genre
# Turn the comma-separated 'genre' string into a list stored in a new column
df['genre_split'] = df['genre'].str.split(', ')

# One row per (anime, genre), then the mean score of each genre, best first
one_row_per_genre = df.explode('genre_split')
mean_score_by_genre = one_row_per_genre.groupby('genre_split')['score'].mean()
genre_scores = mean_score_by_genre.sort_values(ascending=False)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
genre_scores.plot.bar(color='skyblue', ax=ax)
ax.set_title('Average Score by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Average Score')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()


In [None]:
#12. Top 20 Anime by Number of Members
# The twenty rows with the largest 'members' value, reduced to the plotted columns
top_members = df.nlargest(20, 'members')[['title', 'members']]

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y='title', x='members', data=top_members, palette='viridis', ax=ax)
ax.set_title('Top 20 Anime by Members')
ax.set_xlabel('Number of Members')
ax.set_ylabel('Anime Title')
plt.show()

In [None]:
#13. Correlation Heatmap
# Pairwise correlation between the selected numerical columns
numeric_columns = ['score', 'rank', 'popularity', 'members', 'favorites']
corr_data = df[numeric_columns].corr()

# Plot heatmap with each coefficient printed to two decimals
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_data, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
#14. Anime Release Trend by Decade
# Floor each year to the start of its decade (e.g. 1997 -> 1990) and store it as a new column
df['decade'] = df['aired_from_year'].floordiv(10).mul(10)
release_trend = df.groupby('decade').size()

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
release_trend.plot.line(marker='o', color='green', ax=ax)
ax.set_title('Anime Release Trend by Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Number of Anime')
ax.grid(True)
plt.show()


In [None]:
#15. Top Genres by Popularity
# 'popularity' is handled as a rank in this notebook (the studio chart keeps the
# smallest values and labels them 'Average Popularity Rank'), so adding ranks up
# would mostly measure how many titles a genre has. The mean rank per genre is
# comparable across genres of different size.
# The genre list is derived here so the cell does not depend on an earlier cell
# having created 'genre_split'.
genre_popularity = (
    df.assign(genre_split=df['genre'].str.split(', '))
    .explode('genre_split')
    .groupby('genre_split')['popularity']
    .mean()
    .sort_values(ascending=False)   # barh draws the last entry at the top -> best (lowest) rank on top
)

# Plot
plt.figure(figsize=(10, 8))
genre_popularity.plot(kind='barh', color='purple')
plt.title('Top Genres by Popularity')
plt.xlabel('Average Popularity Rank (lower = more popular)')
plt.ylabel('Genre')
plt.show()


In [None]:
#16. Studio Contribution Over Time
# The five most frequent 'studio' values
top_studios = df['studio'].value_counts().head(5).index

# Rows of those studios with a known year; the decade is derived here so the cell
# does not depend on an earlier cell having created a 'decade' column.
studio_rows = (
    df.loc[df['studio'].isin(top_studios), ['studio', 'aired_from_year']]
    .dropna(subset=['aired_from_year'])
    .assign(decade=lambda rows: (rows['aired_from_year'] // 10 * 10).astype(int))
)

# Decade must be the index (x-axis) and studio the columns (stacked segments / legend);
# the reverse orientation would contradict the axis label and the legend title below.
studio_trend = studio_rows.groupby(['decade', 'studio']).size().unstack(fill_value=0)

# Plot
studio_trend.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title('Studio Contribution Over Time')
plt.xlabel('Decade')
plt.ylabel('Number of Anime')
plt.legend(title='Studio', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
#17. Anime Duration vs. Score
# Scatter of duration against score with a fitted regression line drawn in red
fig, ax = plt.subplots(figsize=(10, 6))
point_style = {'alpha': 0.6}
trend_style = {'color': 'red'}
sns.regplot(x='duration_min', y='score', data=df, scatter_kws=point_style, line_kws=trend_style, ax=ax)
ax.set_title('Anime Duration vs. Score')
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Score')
plt.show()


In [None]:
#18. Favorite Anime Over Time
# Total 'favorites' of all anime that share the same 'aired_from_year'
favorites_trend = df.groupby('aired_from_year')['favorites'].sum()

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
favorites_trend.plot.line(marker='o', color='orange', ax=ax)
ax.set_title('Favorites Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Favorites')
ax.grid(True)
plt.show()


In [None]:
#19. Most Common Airing Days
# The first space-separated token of 'broadcast' is stored as the airing day
# NOTE(review): values that do not begin with a weekday name would appear as their own bar — confirm against the data
broadcast_tokens = df['broadcast'].str.split(' ')
df['broadcast_day'] = broadcast_tokens.str[0]
airing_days = df['broadcast_day'].value_counts()

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
airing_days.plot.bar(color='teal', ax=ax)
ax.set_title('Most Common Airing Days')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Anime')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
#20. Anime Score Distribution by Status
# One box per 'status' value, summarising the spread of 'score' within it
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='status', y='score', data=df, palette='pastel', ax=ax)
ax.set_title('Score Distribution by Airing Status')
ax.set_xlabel('Airing Status')
ax.set_ylabel('Score')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
#21. Anime production trends over time

import matplotlib.pyplot as plt

# Grouping by year and counting the number of anime produced
# (rows without a year are dropped; years are cast to int so the axis shows whole years)
production_trends = df['aired_from_year'].dropna().astype(int).value_counts().sort_index()

# Plotting the production trends over time
plt.figure(figsize=(12, 6))
plt.plot(production_trends.index, production_trends.values, marker='o', linestyle='-', color='blue')
plt.title('Anime Production Trends Over Time', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Anime Produced', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Mark the peak year computed from the data instead of a fixed year, so the
# marker stays correct when the underlying dataset changes.
if not production_trends.empty:
    peak_year = production_trends.idxmax()
    plt.axvline(peak_year, color='red', linestyle='--', label=f'Peak Year: {peak_year}')
    plt.legend()
plt.show()


In [None]:
#22.Studios and Producers Collaboration Analysis

import networkx as nx
import matplotlib.pyplot as plt

# Collect every (producer, studio) pair, keeping track of which side is which.
# Both columns hold comma-separated names, so each row can yield several pairs.
credited = df.dropna(subset=['producer', 'studio'])
collab_pairs = []
for producer_field, studio_field in zip(credited['producer'], credited['studio']):
    for producer in producer_field.split(', '):
        for studio in studio_field.split(', '):
            collab_pairs.append((producer, studio))

# Create a bipartite graph for studios and producers
G = nx.Graph()
G.add_edges_from(collab_pairs)

# Filter for the selected studios (e.g., Sunrise and Toei Animation).
# The filter runs on the explicit (producer, studio) pairs rather than on G.edges:
# an undirected graph reports each edge's endpoints in node-insertion order, so the
# studio is not guaranteed to be the second element and edges would be lost.
top_studios = ['Sunrise', 'Toei Animation']
selected_studios = set(top_studios)
filtered_edges = [(producer, studio) for producer, studio in collab_pairs if studio in selected_studios]
filtered_graph = nx.Graph(filtered_edges)

# Plot the bipartite graph (fixed seed -> the same layout on every run)
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(filtered_graph, seed=42)
nx.draw_networkx_nodes(filtered_graph, pos, node_size=300, node_color='skyblue')
nx.draw_networkx_edges(filtered_graph, pos, width=1, alpha=0.6, edge_color='gray')
nx.draw_networkx_labels(filtered_graph, pos, font_size=10, font_family="sans-serif")

plt.title('Producer-Studio Collaboration Network')
plt.show()


# Linguistic Features Analysis

In [None]:
#1. Average Length of Titles in Different Languages
import matplotlib.pyplot as plt

# Store the character count of each title variant in a '<column>_length' column
for title_column in ('title', 'title_english', 'title_japanese'):
    df[f'{title_column}_length'] = df[title_column].str.len()

# Mean length per variant (missing titles are ignored by mean())
length_columns = ['title_length', 'title_english_length', 'title_japanese_length']
avg_lengths = df[length_columns].mean()

fig, ax = plt.subplots()
ax.bar(avg_lengths.index, avg_lengths.values, color=['blue', 'orange', 'green'])
ax.set_title('Average Length of Titles in Different Languages')
ax.set_ylabel('Average Length (characters)')
plt.show()



In [None]:
#2. Frequency of Common Words in Titles
from collections import Counter
from wordcloud import STOPWORDS
import pandas as pd

# Join all titles into one string and split it on whitespace
all_titles = ' '.join(df['title'].dropna())

# Drop stop words (compared in lower case); the kept tokens retain their original casing
tokens = []
for word in all_titles.split():
    if word.lower() in STOPWORDS:
        continue
    tokens.append(word)

# The 20 most frequent tokens as (word, count) pairs
word_counts = Counter(tokens).most_common(20)

# Separate the pairs into two parallel tuples for plotting
words, counts = zip(*word_counts)
fig, ax = plt.subplots()
ax.bar(words, counts, color='purple')
ax.set_title('Most Common Words in Titles')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


In [None]:
#3. Text Length Distribution
# Overlay one semi-transparent histogram per title variant on the same axes
length_series = [
    ('title_length', 'Original Titles'),
    ('title_english_length', 'English Titles'),
    ('title_japanese_length', 'Japanese Titles'),
]
fig, ax = plt.subplots()
for column_name, legend_label in length_series:
    ax.hist(df[column_name].dropna(), bins=20, alpha=0.7, label=legend_label)

ax.set_title('Text Length Distribution Across Languages')
ax.set_xlabel('Length (characters)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


In [None]:
#4. Top Keywords by Genre
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns


def _split_genres(genre_field):
    """Split one comma-separated genre string into whole, stripped genre labels."""
    return [label.strip() for label in genre_field.split(',') if label.strip()]


# Tokenize on the comma separator so each genre label stays intact. The default
# word-level tokenizer (together with the English stop-word list) would break
# multi-word labels into fragments and drop some of them.
# token_pattern=None because the pattern is unused once a tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=_split_genres, lowercase=False, token_pattern=None)
genre_matrix = vectorizer.fit_transform(df['genre'].dropna())

# One column per genre label, one row per anime that has a genre
genre_keywords = pd.DataFrame(genre_matrix.toarray(), columns=vectorizer.get_feature_names_out())
top_keywords = genre_keywords.sum().sort_values(ascending=False).head(10)

sns.barplot(x=top_keywords.values, y=top_keywords.index, palette='cool')
plt.title('Top Keywords by Genre')
plt.xlabel('Frequency')
plt.ylabel('Keywords')
plt.show()


In [None]:
#5. Topic Modeling
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the genre count matrix built by the vectorizer
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda.fit_transform(genre_matrix)

# One label per fitted topic: 'Topic 0', 'Topic 1', ...
topics = ['Topic ' + str(i) for i in range(lda.components_.shape[0])]

# Look the vocabulary up once instead of once per word
feature_names = vectorizer.get_feature_names_out()

# For every topic print the 10 highest-weighted terms (ascending by weight)
for topic, weights in zip(topics, lda.components_):
    strongest = weights.argsort()[-10:]
    words = [feature_names[index] for index in strongest]
    print(f'{topic}: {", ".join(words)}')


In [None]:
#6. Count genres in English and Japanese titles
# Genre frequencies among anime that have an English title
has_english_title = df['title_english'].notna()
english_genres = df[has_english_title]['genre'].str.split(', ').explode().value_counts()

# Genre frequencies among anime that have a Japanese title
has_japanese_title = df['title_japanese'].notna()
japanese_genres = df[has_japanese_title]['genre'].str.split(', ').explode().value_counts()

# Create comparison bar chart: align both counts on genre, missing combinations become 0
genre_comparison = pd.DataFrame({'English': english_genres, 'Japanese': japanese_genres}).fillna(0)

ax = genre_comparison.plot(kind='bar', figsize=(12, 6), color=['blue', 'orange'])
ax.set_title('Comparison of Genre Frequency: English vs Japanese Titles')
ax.set_ylabel('Count')
ax.set_xlabel('Genre')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title='Language')
plt.show()


# Social Network Analysis

In [None]:
import networkx as nx
from itertools import combinations

# Group genres by anime title or ID
# Each 'genre' cell is one comma-separated string, so it must be split into
# individual labels. Without the split every anime contributes a one-item list,
# no genre pair is ever formed and the graph stays empty.
# Rows without a genre are skipped; labels are de-duplicated and sorted per anime.
grouped_genres = (
    df.dropna(subset=['genre'])
    .groupby('anime_id')['genre']
    .apply(lambda entries: sorted({label for entry in entries for label in entry.split(', ')}))
)

# Create the graph: nodes are genres, edge weight = number of anime sharing both genres
G = nx.Graph()

for genre_list in grouped_genres:
    for genre1, genre2 in combinations(genre_list, 2):
        if G.has_edge(genre1, genre2):
            G[genre1][genre2]['weight'] += 1
        else:
            G.add_edge(genre1, genre2, weight=1)

# Check the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
#1.Producer-Studio Collaboration Network
import networkx as nx
import matplotlib.pyplot as plt
from itertools import product

# One (producer, studio) record per anime ID.
# Rows missing either field are dropped first; otherwise the missing value itself
# becomes a node that links unrelated companies together.
credited = df.dropna(subset=['producer', 'studio']).drop_duplicates(subset='anime_id')
grouped_producers_studios = list(zip(credited['producer'], credited['studio']))

# Create the graph
# Both fields are comma-separated lists (the other collaboration cells split them
# the same way), so every individual producer is linked to every individual studio.
G = nx.Graph()
for producer_field, studio_field in grouped_producers_studios:
    G.add_edges_from(product(producer_field.split(', '), studio_field.split(', ')))

# Visualize the graph
plt.figure(figsize=(10, 8))
nx.draw(G, with_labels=True, node_size=30, font_size=8)
plt.title('Producer-Studio Collaboration Network')
plt.show()


In [None]:
#2.Producer-Studio Collaboration

import networkx as nx
import matplotlib.pyplot as plt
from itertools import product

# Create a bipartite graph for producer-studio collaboration
G = nx.Graph()

# Link every producer of an anime to every studio of the same anime.
# Rows missing either field are skipped; both fields are comma-separated lists.
# Iterating the two columns directly avoids building a Series per row (iterrows).
credited = df.dropna(subset=['producer', 'studio'])
for producer_field, studio_field in zip(credited['producer'], credited['studio']):
    G.add_edges_from(product(producer_field.split(', '), studio_field.split(', ')))

# Generate positions for nodes
# A fixed seed keeps the layout identical between runs, as in the filtered
# Sunrise / Toei Animation network plotted earlier in this notebook.
pos = nx.spring_layout(G, k=0.3, seed=42)

# Plot the graph
plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G, pos, node_color='lightgreen', node_size=100)
nx.draw_networkx_edges(G, pos, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=8)

plt.title('Producer-Studio Collaboration Network')
plt.show()


In [None]:
#3.Producer-Genre Associations
import pandas as pd
import matplotlib.pyplot as plt

# Explode genres for many-to-one mapping
df['genre_split'] = df['genre'].str.split(', ')

# 'producer' is a comma-separated list as well (the collaboration cells split it
# the same way), so it is exploded too; otherwise each distinct combination of
# producers would be counted as a separate "producer".
credits = (
    df.dropna(subset=['producer', 'genre'])
    .assign(producer=lambda rows: rows['producer'].str.split(', '))
    .explode('producer')
    .reset_index(drop=True)      # unique index before the second explode
    .explode('genre_split')
)
producer_genre = credits.groupby(['producer', 'genre_split']).size().reset_index(name='count')

# Find top 5 producers by genre count
top_producers = producer_genre.groupby('producer')['count'].sum().sort_values(ascending=False).head(5).index
filtered_data = producer_genre[producer_genre['producer'].isin(top_producers)]

# Plot results
# Reshape to genre x producer and draw grouped bars. Calling plt.bar once per
# producer would place every producer's bars at the same x positions, so later
# bars would hide earlier ones.
genre_by_producer = (
    filtered_data.pivot(index='genre_split', columns='producer', values='count')
    .fillna(0)
)
genre_by_producer = genre_by_producer[top_producers]   # legend ordered by total count
genre_by_producer.plot(kind='bar', figsize=(10, 6), width=0.8)

plt.title('Top Producers and Their Genre Associations')
plt.ylabel('Count')
plt.xlabel('Genre')
plt.xticks(rotation=45)
plt.legend(title='Producers')
plt.tight_layout()
plt.show()
