In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the source CSV into a DataFrame (path is relative to the working directory)
csv_path = "imdb_top_1000.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows, then summarise columns, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so emits a stray "None" line after the report.
print(data.head())
data.info()

In [None]:
# Remove the columns that are not wanted for the analysis
unwanted_columns = ['Poster_Link', 'Star1', 'Star2', 'Star3', 'Star4', 'Meta_score']
data = data.drop(columns=unwanted_columns)


In [None]:
# Show the first five rows of the frame
data.head(n=5)

In [None]:
# Count the missing (NaN) entries in every column of the DataFrame
missing_counts = data.isna().sum()
missing_counts

In [None]:
# Keep only the rows whose 'Gross' value is present
gross_missing = data['Gross'].isna()
data = data.loc[~gross_missing]


In [None]:
# Give the columns human-readable names.
# rename() returns a new frame, so the result is reassigned instead of using
# inplace=True (re-runnable, chainable, no hidden mutation).
# 'Meta_score' is intentionally not listed: that column was already dropped
# earlier in the notebook, so a mapping for it could never match.
data = data.rename(columns={
    'Released_Year': 'Release Year',
    'Certificate': 'Age Rating',
    'IMDB_Rating': 'IMDB Rating',
    'No_of_Votes': 'Votes',
    'Gross': 'Gross Revenue',
})

In [None]:
# Collapse the individual certificate labels into three groups: 'U', 'UA' and 'A'.
# Any label that is absent from the lookup becomes NaN under .map() and the
# corresponding rows are removed right after.
rating_groups = {
    'U': 'U', 'G': 'U', 'PG': 'U', 'GP': 'U', 'TV-PG': 'U',
    'UA': 'UA', 'PG-13': 'UA', 'U/A': 'UA', 'Passed': 'UA', 'Approved': 'UA',
    'A': 'A', 'R': 'A',
}
data['Age Rating'] = data['Age Rating'].map(rating_groups)
unmapped_rating = data['Age Rating'].isna()
data = data.loc[~unmapped_rating]
data['Age Rating'].value_counts()

In [None]:
# Keep only rows whose Release Year is exactly four digits.
# str.match anchors at the start only, so the pattern carries its own end
# anchor to reject values that merely begin with four digits; na=False turns a
# missing year into "no match" instead of a NaN mask that cannot index rows.
year_format = r'\d{4}$'
valid_year = data['Release Year'].str.match(year_format, na=False)
# .copy() makes the filtered frame independent, so the column assignments
# below are plain writes rather than writes into a slice of another frame.
data = data[valid_year].copy()

# Release Year: string -> int
data['Release Year'] = data['Release Year'].astype(int)

# Runtime: remove the literal ' min' suffix, then parse as int. Removing the
# suffix by name (rather than slicing off the last four characters) cannot
# silently cut into the digits if a value is formatted differently.
data['Runtime'] = data['Runtime'].str.replace(' min', '', regex=False).astype(int)

# Gross Revenue: strip the thousands separators and parse as int, then express
# the amount in millions (the column ends up as float). Dividing by 1_000_000
# avoids the extra rounding error of multiplying by the inexact float 10**-6.
gross_raw = data['Gross Revenue'].str.replace(',', '', regex=False).astype(int)
data['Gross Revenue'] = gross_raw / 1_000_000

In [None]:
# Count how often each individual genre occurs: split the ', '-separated
# 'Genre' strings into indicator columns and sum each column.
Genre = data['Genre'].str.get_dummies(sep=', ')
sum_of_generes = Genre.sum().sort_values(ascending=True)

# Mean IMDB Rating for each distinct 'Genre' value (the full combination
# string, not the individual genres), highest ten, rounded to 2 decimals.
# reset_index() is chained instead of applied in place, and the table is the
# LAST expression of the cell so the notebook actually renders it -- placed
# mid-cell it was evaluated but never displayed.
top_Genres = (
    data.groupby('Genre')[['IMDB Rating']]
    .mean()
    .sort_values('IMDB Rating', ascending=False)
    .head(10)
    .round(2)
    .reset_index()
)
top_Genres

In [None]:
# Print the per-column missing-value counts followed by the column dtypes
missing_after_cleaning = data.isna().sum()
column_types = data.dtypes
print(missing_after_cleaning)
print(column_types)

In [None]:
# Horizontal bar chart of the total count per genre (matplotlib),
# with one viridis colour per bar.
n_genres = len(sum_of_generes)
colors = plt.cm.viridis(np.linspace(0, 1, n_genres))
plt.barh(y=sum_of_generes.index, width=sum_of_generes.values, color=colors)
plt.xlabel('Total Movies')
plt.ylabel('Genres')
plt.title('Total Counts of Genres', fontsize=12, weight=600)
plt.show()


In [None]:
# Histogram of how the movies are distributed across release years
release_years = data['Release Year']
plt.hist(x=release_years, color='skyblue', edgecolor='white')
plt.title('Total Released Movie by Date')
plt.xlabel('Release Year')
plt.ylabel('Release Movie Count')
plt.show()

In [None]:
# Line plot of 'IMDB Rating' against 'Release Year'.
# Rows that share a release year are aggregated by seaborn's estimator; the
# mean is spelled out explicitly here, and the title now says "Average ...
# by Release Year" because the line shows a per-year mean, not a total.
sns.lineplot(x='Release Year', y='IMDB Rating', estimator='mean', errorbar=None, data=data)
plt.title('Average IMDB Rating by Release Year', fontsize=12, weight=600)
plt.show()

In [None]:
# Bar chart of the 10 movies with the highest gross revenue.
top_10_movies = data.nlargest(10, 'Gross Revenue')
plt.bar(top_10_movies['Series_Title'], top_10_movies['Gross Revenue'], color='skyblue')
plt.xlabel('Movie Name')
# 'Gross Revenue' was rescaled by 10**-6 during cleaning, so the axis is in
# millions; state the unit so the figure can be read on its own.
plt.ylabel('Gross Revenue (millions)')
plt.title('Top 10 Movies by Gross Revenue')

# Rotate x-axis labels for better visibility
plt.xticks(rotation=45, ha='right')
plt.show()