In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Ingestion 

In [None]:
# Load the IMDB top-1000 CSV from the Kaggle input directory into a DataFrame
imdb_csv_path = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
imdb_df = pd.read_csv(imdb_csv_path)

#### Run a basic analysis for a high-level assessment of the ingested data

In [None]:
# Preview the first five rows of the IMDB dataset
imdb_df.head(n=5)

In [None]:
# Print the concise summary of the frame (columns, non-null counts, dtypes, memory usage).
# `info` is a method: without the call parentheses the cell only echoes the bound-method repr.
imdb_df.info()

In [None]:
# Dataset dimensions as (rows, columns); expected: 1000 rows and 16 columns
n_rows, n_cols = imdb_df.shape
(n_rows, n_cols)

In [None]:
# Inspect the dtype pandas inferred for each column of Kaggle's IMDB dataset
column_dtypes = imdb_df.dtypes
column_dtypes

#### Run checks for missing values

In [None]:
# Flag, per column, whether it contains at least one missing value
columns_with_missing = imdb_df.isna().any()
columns_with_missing

In [None]:
# Number of missing values in each column
missing_per_column = imdb_df.isnull().sum()
missing_per_column

In [None]:
# Grand total of missing values across the whole frame
missing_mask = imdb_df.isna()
missing_mask.sum().sum()

# Data Cleansing 

In [None]:
# Replace missing Meta_score and Gross entries with 0
# NOTE(review): a 0 fill cannot be told apart from a genuine zero in later statistics - confirm this is intended
zero_fill_cols = ["Meta_score", "Gross"]
imdb_df[zero_fill_cols] = imdb_df[zero_fill_cols].fillna(0)

In [None]:
# Display the two filled columns to inspect the zero-filled entries
imdb_df.loc[:, ["Meta_score", "Gross"]]

In [None]:
# Count fully duplicated rows (rows whose every column equals an earlier row).
# dropna()/drop_duplicates() return new frames; calling them without using the result
# neither changes imdb_df nor reports whether duplicates exist, so count them directly.
imdb_df.duplicated().sum()

In [None]:
# Row count before vs. after dropping fully duplicated rows.
# The shape of the untouched frame alone cannot reveal duplicates; equal numbers here
# mean there are none. imdb_df itself is not modified by this cell.
len(imdb_df), len(imdb_df.drop_duplicates())

In [None]:
# Order the rows alphabetically by Series_Title (in place) ahead of the duplicate-title check
imdb_df.sort_values(by="Series_Title", inplace=True)

In [None]:
# Drop every row whose Series_Title occurs more than once;
# keep=False removes all copies of a repeated title, not just the extras
imdb_df.drop_duplicates(subset=["Series_Title"], keep=False, inplace=True)

# Verify that Series_Title is now unique and could therefore serve as an index
imdb_df["Series_Title"].is_unique

In [None]:
# Series_Title is deliberately left as a regular column: DataFrame.set_index returns a new
# frame, so the former un-assigned call never changed imdb_df, and the analysis cells below
# select imdb_df['Series_Title'] by column name.
imdb_df.shape
# Dataset size after removing rows with duplicate titles (998 rows expected)

# Data Analysis

#### Question 1 - Distribution of movies on IMDB by genre

In [None]:
# Genre-based analysis: total number of movies per genre.
# Genre is split on commas into lists, then exploded to one row per (movie, genre) pair.
imdb_dfQ1 = imdb_df.assign(Genre=imdb_df["Genre"].str.split(","))
imdb_dfQ1a = imdb_dfQ1.explode("Genre")
# Strip surrounding whitespace from each label before counting
genre_counts = imdb_dfQ1a["Genre"].str.strip().value_counts()
genre_counts

#### Question 2 - Finding Top 3 directors for movies uploaded on IMDB

In [None]:
# Top directors: horizontal count plot restricted to the three most frequent directors
director_order = imdb_df['Director'].value_counts().index[:3]
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Top 3 Director Based on Movies in IMDB')
sns.countplot(y=imdb_df['Director'], order=director_order, palette='Greens', ax=ax)
plt.show()

#### Question 3 - Finding top 3 movies based on IMDB rating and no. of votes

In [None]:
# Rank movies by IMDB rating, using number of votes as the tie-breaker (both descending),
# then take the titles of the first three
ranking_keys = ["IMDB_Rating", "No_of_Votes"]
imdb_df3 = imdb_df.sort_values(by=ranking_keys, ascending=False).reset_index()
Top3 = imdb_df3['Series_Title'].iloc[:3]
Top3

In [None]:
# Plot the three highest-rated movies.
# imdb_df is ordered alphabetically by title at this point, so slicing its first three
# rows would chart arbitrary titles; rank explicitly instead (votes break rating ties).
top_rated = imdb_df.sort_values(["IMDB_Rating", "No_of_Votes"], ascending=False).head(3)
fig, axs = plt.subplots(figsize=(5, 5))
g = sns.barplot(x=top_rated['Series_Title'], y=top_rated['IMDB_Rating'], palette="crest", ax=axs)
g.set_title("Top Rated Movies")
plt.show()

In [None]:
# Plot the three movies with the most votes.
# imdb_df is ordered alphabetically by title at this point, so slicing its first three
# rows would chart arbitrary titles; rank by No_of_Votes explicitly instead.
top_voted = imdb_df.sort_values("No_of_Votes", ascending=False).head(3)
fig, axs = plt.subplots(figsize=(5, 5))
g = sns.barplot(x=top_voted['Series_Title'], y=top_voted['No_of_Votes'], palette="rocket", ax=axs)
g.set_title("Top Voted Movies")
plt.show()

#### Question 4 - Distribution of movies on IMDB across ratings

In [None]:
# Count the movies at each distinct IMDB rating value
rating_group = imdb_df.groupby('IMDB_Rating')[['Series_Title']].count()
# Bar chart of the movie count per rating
ax = rating_group.plot.bar(title='Rating Distribution for Movies', color="g")
ax.set_xlabel('Rating given')
ax.set_ylabel('Total no.of movies')
plt.show()

#### Question 5 - Running time/length of movie analysis

In [None]:
# Runtime is text (presumably like "142 min" - verify against the data); keep the token
# before the first space and convert it to a number so Runtime_num is truly numeric
# instead of a column of strings.
imdb_df['Runtime_num'] = pd.to_numeric(imdb_df['Runtime'].str.split(' ').str[0])
fig, ax = plt.subplots(figsize=(15, 7))
# histplot replaces the deprecated distplot; stat="density" with kde=True keeps the
# density-scaled histogram plus KDE curve that distplot drew.
g = sns.histplot(imdb_df['Runtime_num'], kde=True, stat="density", ax=ax)
g.set_title('Movie runtime')
g.set_xlabel('Movie Duration (mins)')
plt.show()

In [1]:
!pip freeze

absl-py==0.12.0
adal @ file:///home/conda/feedstock_root/build_artifacts/adal_1611297630154/work
affine==2.3.0
aiobotocore==1.2.2
aiohttp @ file:///home/conda/feedstock_root/build_artifacts/aiohttp_1610358547752/work
aiohttp-cors==0.7.0
aioitertools==0.7.1
aioredis==1.3.1
albumentations==0.5.2
alembic==1.5.8
allennlp==2.2.0
altair==4.1.0
annoy==1.17.0
ansiwrap==0.8.4
appdirs @ file:///home/conda/feedstock_root/build_artifacts/appdirs_1603108395799/work
argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1610522577486/work
arrow @ file:///home/conda/feedstock_root/build_artifacts/arrow_1610483719724/work
arviz==0.11.2
asn1crypto @ file:///home/conda/feedstock_root/build_artifacts/asn1crypto_1595949944546/work
astropy @ file:///home/conda/feedstock_root/build_artifacts/astropy_1612734311738/work
astunparse==1.6.3
async-generator==1.10
async-timeout==3.0.1
attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1605083924122/work
audioread==2.1.9
autocfg==0