In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline

## IMDB Data

In [None]:
# Open the IMDb SQLite database that the queries in this notebook read from.
# A plain sqlite3.connect("im.db") silently creates an empty database file when
# im.db is missing (e.g. wrong working directory), and the mistake only shows
# up later as "no such table". Opening through a URI with mode=rw keeps normal
# read/write access but raises sqlite3.OperationalError right here if the file
# does not exist.
conn = sqlite3.connect("file:im.db?mode=rw", uri=True)

In [None]:
# Pull the complete movie_basics table into a DataFrame for inspection.
movie_basics_sql = """SELECT * FROM movie_basics;"""
imdb_movie_info = pd.read_sql(sql=movie_basics_sql, con=conn)

In [None]:
# Preview the first five rows (five is head()'s default).
imdb_movie_info.head(n=5)

### Top Movies by # Votes

In [None]:
# Movies that have a row in movie_ratings, with genres, rating and vote count,
# ordered from most to fewest votes.
votes_ranking_sql = """SELECT genres, primary_title, averagerating, start_year, numvotes
                                FROM movie_basics
                                JOIN movie_ratings
                                    USING(movie_id)
                                ORDER BY numvotes DESC
;"""
top_movies_by_votes = pd.read_sql(sql=votes_ranking_sql, con=conn)

In [None]:
# Show only the ten most-voted titles. The frame is already sorted by
# numvotes DESC, so head(10) is the top of the ranking; echoing the whole
# frame just produces a truncated dump.
top_movies_by_votes.head(10)

### Top Genres by Count

In [None]:
# Count the movies for every distinct genres string, largest group first.
genre_count_sql = """SELECT genres, COUNT(*) as amount
                                        FROM movie_basics
                                        GROUP BY GENRES
                                        ORDER BY amount DESC
;"""
top_genres_by_count = pd.read_sql(sql=genre_count_sql, con=conn)

In [None]:
# Ten most frequent genres strings; rows with a NULL genre are labelled 'Unknown'.
top10_genre_sql = """
    SELECT COALESCE(genres, 'Unknown') as genres, COUNT(*) as amount
    FROM movie_basics
    GROUP BY genres
    ORDER BY amount DESC
    LIMIT 10;
"""
top_genres_by_count = pd.read_sql(top10_genre_sql, conn)

# Vertical bar chart drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_genres_by_count['genres'], top_genres_by_count['amount'])
ax.set_xlabel('Genres')
ax.set_ylabel('Number of Movies')
ax.set_title('Top Genres by Number of Movies')

# Tilt the genre labels so the longer combinations do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Ten most frequent genres strings; rows with a NULL genre are labelled 'Unknown'.
top10_genre_sql = """
    SELECT COALESCE(genres, 'Unknown') as genres, COUNT(*) as amount
    FROM movie_basics
    GROUP BY genres
    ORDER BY amount DESC
    LIMIT 10;  -- Limit to top 10 genres
"""
top_genres_by_count = pd.read_sql(top10_genre_sql, conn)

# Seaborn theme for this and later figures (global setting).
sns.set_style("whitegrid")

# Horizontal bars: movie count on x, genres string on y.
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(
    data=top_genres_by_count,
    x='amount',
    y='genres',
    palette="viridis",
    ax=ax,
)
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genres')
ax.set_title('Top Genres by Number of Movies')

# Drop the left spine and add dashed vertical guide lines.
sns.despine(ax=ax, left=True)
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

# Write each bar's count just past its right-hand end, vertically centred.
for bar in bar_plot.patches:
    count = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    bar_plot.annotate(
        f"{count:.0f}",
        (count, mid_y),
        xytext=(5, 0),  # 5 points to the right of the bar end
        textcoords='offset points',
        ha='left', va='center', color='black', weight='bold',
    )

plt.show()

In [None]:
# Ten most frequent genres strings; rows with a NULL genre are labelled 'Unknown'.
top10_genre_sql = """
    SELECT COALESCE(genres, 'Unknown') as genres, COUNT(*) as amount
    FROM movie_basics
    GROUP BY genres
    ORDER BY amount DESC
    LIMIT 10;  -- Limit to top 10 genres
"""
top_genres_by_count = pd.read_sql(top10_genre_sql, conn)

# Insert a space after every comma so multi-genre labels read "A, B" not "A,B".
spaced_labels = top_genres_by_count['genres'].str.replace(',', ', ')
top_genres_by_count['genres'] = spaced_labels

# Seaborn theme for this and later figures (global setting).
sns.set_style("whitegrid")

# Horizontal bars: movie count on x, genres string on y.
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(
    data=top_genres_by_count,
    x='amount',
    y='genres',
    palette="viridis",
    ax=ax,
)
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genres')
ax.set_title('Top Genres by Number of Movies')

# Drop the left spine and add dashed vertical guide lines.
sns.despine(ax=ax, left=True)
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()

# Write each bar's count just past its right-hand end, vertically centred.
for bar in bar_plot.patches:
    count = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    bar_plot.annotate(
        f"{count:.0f}",
        (count, mid_y),
        xytext=(5, 0),  # 5 points to the right of the bar end
        textcoords='offset points',
        ha='left', va='center', color='black', weight='bold',
    )

plt.show()

In [None]:
# Ten most frequent genres strings; rows with a NULL genre are labelled 'Unknown'.
top10_genre_sql = """
    SELECT COALESCE(genres, 'Unknown') as genres, COUNT(*) as amount
    FROM movie_basics
    GROUP BY genres
    ORDER BY amount DESC
    LIMIT 10;  -- Limit to top 10 genres
"""
top_genres_by_count = pd.read_sql(top10_genre_sql, conn)

# Seaborn theme for this and later figures (global setting).
sns.set_style("whitegrid")

# Horizontal bars: movie count on x, genres string on y.
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(
    data=top_genres_by_count,
    x='amount',
    y='genres',
    palette="viridis",
    ax=ax,
)
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genres')
ax.set_title('Top Genres by Number of Movies')

# Default despine (top and right spines) and dashed grid lines along the y axis.
sns.despine(ax=ax)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Write each bar's count just past its right-hand end, vertically centred.
for bar in bar_plot.patches:
    count = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    bar_plot.annotate(
        f"{count:.0f}",
        (count, mid_y),
        xytext=(5, 0),  # 5 points to the right of the bar end
        textcoords='offset points',
        ha='left', va='center', color='black', weight='bold',
    )

plt.show()

### Top Genres by Rating

In [None]:
# Mean of averagerating for each distinct genres string (movies that have a
# movie_ratings row), highest mean first.
genre_rating_sql = """SELECT genres, AVG(averagerating) as average_rating
                                        FROM movie_basics
                                        JOIN movie_ratings
                                            USING(movie_id)
                                        GROUP BY GENRES
                                        ORDER BY average_rating DESC
;"""
top_genres_by_rating = pd.read_sql(sql=genre_rating_sql, con=conn)

In [None]:
# Show only the ten best-rated genres strings. The query already orders by
# average_rating DESC, so head(10) is the top of the ranking; echoing the
# whole frame just produces a truncated dump.
top_genres_by_rating.head(10)


In [None]:
# Remove rows with a missing genres string or rating. This mutates
# top_genres_by_rating in place, and later cells read the same frame.
top_genres_by_rating.dropna(subset=['genres', 'average_rating'], inplace=True)

# One bar per remaining genres string, on the default-sized figure and
# without title or axis labels.
genre_labels = top_genres_by_rating['genres']
rating_values = top_genres_by_rating['average_rating']
plt.bar(genre_labels, rating_values)
plt.show()

In [None]:
# First ten rows of the rating table (it was queried in descending rating order).
subset_genres = top_genres_by_rating.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(subset_genres['genres'], subset_genres['average_rating'])
ax.set_xlabel('Genres')
ax.set_ylabel('Average Rating')
ax.set_title('Top Genres by Average Rating')

# Rotate the labels and anchor them at their right edge so they end under their bars.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Remove rows lacking a genres string or a rating (in place) so the
# .split(',') below never receives a missing value.
top_genres_by_rating.dropna(subset=['genres', 'average_rating'], inplace=True)

# One Series per genres string: the index holds the individual genres in that
# string, every entry holds that string's average rating.
per_combo_ratings = [
    pd.Series(rating, genre_string.split(','))
    for genre_string, rating in zip(
        top_genres_by_rating['genres'], top_genres_by_rating['average_rating']
    )
]

# Align the Series side by side, then flip so each individual genre becomes a
# column and each genres string a row (NaN where the genre is not in the string).
genre_df = pd.concat(per_combo_ratings, axis=1).transpose()

# Wide-form bar plot: one bar per genre column, no error bars.
# NOTE(review): each bar aggregates per-combination averages, not per-movie
# ratings, so combinations are not weighted by how many movies they contain.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=genre_df, ci=None, ax=ax)
ax.set_xlabel('Top Genres')
ax.set_ylabel('Average Rating')
ax.set_title('Average Ratings for Top Genres')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# `colors` was never assigned in the cells above, so this cell raised NameError
# on a fresh "Restart & Run All". Build the palette here: one viridis colour
# per genre column, the same palette family used for the genre-count plots.
colors = sns.color_palette("viridis", n_colors=genre_df.shape[1])

# Wide-form bar plot: one bar per genre column of genre_df, no error bars.
plt.figure(figsize=(12, 6))
sns.barplot(data=genre_df, ci=None, palette=colors)
plt.xlabel('Top Genres')
plt.ylabel('Average Rating')
plt.title('Average Ratings for Top Genres')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the figure before plt.show(), at 300 DPI
# for a print-quality image.
output_filename = 'average_ratings_by_genre_2.png'
plt.savefig(output_filename, dpi=300)

plt.show()

In [None]:
# pandas, matplotlib and seaborn are already imported in the notebook's first
# cell, so the duplicate imports that used to open this cell are gone.

# Number of genres strings to plot. The title is derived from this value so it
# cannot drift from the data again: it used to read "Top 25" while only 10
# rows were plotted.
top_n = 10

# Drop rows with missing values in 'genres' and 'average_rating' (in place).
top_genres_by_rating.dropna(subset=['genres', 'average_rating'], inplace=True)

# Highest-rated genres strings first, keeping the first top_n rows.
# The variable keeps its historical name in case other cells read it.
top_25_genres = top_genres_by_rating.sort_values(by='average_rating', ascending=False).head(top_n)

# `colors` was never assigned in the cells above (NameError on a fresh run);
# build one viridis colour per plotted bar here.
colors = sns.color_palette("viridis", n_colors=len(top_25_genres))

plt.figure(figsize=(12, 6))
sns.barplot(x='genres', y='average_rating', data=top_25_genres, ci=None, palette=colors)
plt.xlabel('Genres')
plt.ylabel('Average Rating')
plt.title(f'Top {top_n} Genres by Average Rating')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save before plt.show(). The file name is left unchanged so anything that
# already points at this image keeps working.
output_filename = 'top_25_genres_ratings.png'
plt.savefig(output_filename, dpi=300)

plt.show()