**AC Project: Working with Netflix Dataset**

*By:* 

- Peeya Thacker



---



**(1) Data Understanding/Exploration**

In [None]:
import pandas as pd
import numpy as np

In [None]:
#loading the Netflix titles CSV, with its first column used as the row index
netflix_data = pd.read_csv(
    "NetflixTitles.csv",
    index_col=0,
)

In [None]:
#listing the column labels of the dataset
netflix_data.columns

In [None]:
#previewing the first 5 rows (the default row count of head())
netflix_data.head(n=5)

In [None]:
#pulling out one column by its label (all rows, 'release_year' only)
netflix_data.loc[:, 'release_year']

In [None]:
#earliest year of release present in the dataset
netflix_data.loc[:, 'release_year'].min()

In [None]:
#latest year of release present in the dataset
netflix_data.loc[:, 'release_year'].max()

In [None]:
#pulling out several columns at once by passing a list of labels
netflix_data.loc[:, ['date_added', 'release_year']]

In [None]:
#taking the first 10 rows by position
netflix_data.head(10)

In [None]:
#selecting columns by their position: every row, first 5 columns
netflix_data.iloc[:,:5]

In [None]:
#keeping only the rows whose 'type' equals 'Movie'
netflix_data.loc[netflix_data['type'].eq('Movie')]

In [None]:
#dtypes gives a Series that maps each column label to its data type
netflix_data.dtypes

In [None]:
#checking data type of each column (same call as the previous cell)
netflix_data.dtypes

In [None]:
#counting how many columns there are of each data type
#value_counts() must be applied to the dtypes Series; calling it on the
#DataFrame itself counts unique rows instead of data types
netflix_data.dtypes.value_counts()

In [None]:
#keeping only the columns stored as 64-bit integers
netflix_data.select_dtypes(include=['int64'])

In [None]:
#dropping every object (string) column and keeping the remaining dtypes
netflix_data.select_dtypes(exclude=['object'])

In [None]:
#summary of dataframe: index, columns, non-null counts, dtypes and memory usage
netflix_data.info() 

In [None]:
#number of distinct values in each column (axis 0 is the default for nunique)
netflix_data.nunique()

In [None]:
#sorted array of the distinct release years
np.unique(netflix_data.loc[:, 'release_year'])

In [None]:
#previewing the last 20 rows of the dataset
netflix_data.tail(n=20)

In [None]:
#checking number of rows and columns in dataset; shape is a (rows, columns) tuple
netflix_data.shape

**(2) Data Cleaning/Manipulation**

In [None]:
#converting release year to object datatype
#NOTE(review): presumably so the year is treated as a label rather than a quantity - confirm
#after this, numeric-only selections such as select_dtypes(include='int64') no longer include release_year
netflix_data["release_year"]=netflix_data["release_year"].astype('object')
#info() lists the dtype of every column, so the conversion can be checked in its output
netflix_data.info()

In [None]:
#making an independent working copy so later edits leave netflix_data untouched
#(deep=True is the default of copy(); spelled out here for clarity)
netflix_data1 = netflix_data.copy(deep=True)
#both shapes should match: the copy has the same rows and columns as the source
print(netflix_data1.shape)
netflix_data.shape

In [None]:
#keeping the rows where the year of release is present
not_missing = netflix_data.loc[netflix_data["release_year"].notna()]
#row count here versus the full dataset shows how many years are missing
not_missing.shape

In [None]:
#check number of missing values in each column (isnull marks missing cells, sum counts them per column)
netflix_data1.isnull().sum()

In [None]:
#check number of missing values in each column using isna(), which pandas documents as an alias of isnull()
netflix_data1.isna().sum()

In [None]:
#collecting every row that has at least one missing value in any column
missing = netflix_data1.loc[netflix_data1.isna().any(axis="columns")]
missing.shape

In [None]:
#describing dataset: descriptive statistics for the working copy
netflix_data1.describe()

In [None]:
#filling missing values of release_year with the mean release year
release_year_mean = netflix_data1['release_year'].mean()
#assign the filled column back to the DataFrame: fillna(inplace=True) on
#netflix_data1['release_year'] works on an intermediate Series (chained
#assignment), which newer pandas warns about and which leaves the DataFrame
#unchanged under Copy-on-Write
netflix_data1['release_year'] = netflix_data1['release_year'].fillna(release_year_mean)
netflix_data1.head(5)

In [None]:
#filling missing values of director with a placeholder label
#the result is assigned back because fillna(inplace=True) on a selected column
#is chained assignment and does not reliably update the DataFrame
netflix_data1['director'] = netflix_data1['director'].fillna('Not known')
#re-count the missing values to confirm the director column is now complete
netflix_data1.isna().sum()

In [None]:
#filling missing values of cast with a placeholder label
#the result is assigned back because fillna(inplace=True) on a selected column
#is chained assignment and does not reliably update the DataFrame
netflix_data1['cast'] = netflix_data1['cast'].fillna('Watch the show and find out!')
#re-count the missing values to confirm the cast column is now complete
netflix_data1.isna().sum()

In [None]:
#filling missing values of country with a placeholder label
#the result is assigned back because fillna(inplace=True) on a selected column
#is chained assignment and does not reliably update the DataFrame
netflix_data1['country'] = netflix_data1['country'].fillna('Somewhere on earth')
#re-count the missing values to confirm the country column is now complete
netflix_data1.isna().sum()

In [None]:
#filling missing values of date_added with the text 'None'
#the result is assigned back because fillna(inplace=True) on a selected column
#is chained assignment and does not reliably update the DataFrame
netflix_data1['date_added'] = netflix_data1['date_added'].fillna('None')
#re-count the missing values to confirm the date_added column is now complete
netflix_data1.isna().sum()

In [None]:
#filling missing values of rating with a placeholder label
#the result is assigned back because fillna(inplace=True) on a selected column
#is chained assignment and does not reliably update the DataFrame
netflix_data1['rating'] = netflix_data1['rating'].fillna('Some star')
#re-count the missing values to confirm the rating column is now complete
netflix_data1.isna().sum()

**(3) Data Analysis/Visualisation**

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
#reporting which matplotlib release is installed
print('Matplotlib version: ', mpl.__version__, sep=' ')

In [None]:
#making an independent copy of netflix_data to use for the plots
#(deep=True is the default of copy(); spelled out here for clarity)
netflix_data2 = netflix_data.copy(deep=True)
#both shapes should match: the copy has the same rows and columns as the source
print(netflix_data2.shape)
netflix_data.shape

In [None]:
#plotting bar graph of number of movies/TV shows
#counts and labels are read from the data instead of being typed in by hand,
#so the chart stays correct if the dataset changes
type_counts = netflix_data2['type'].value_counts()
counts = type_counts.tolist()
Type = type_counts.index.tolist()
index = np.arange(len(Type))
plt.bar(index,counts,color=['blue','cyan'])
#name each bar after its category; bare positions 0/1 tell the reader nothing
plt.xticks(index,Type)
plt.title("Bar plot of movie/show")
plt.xlabel("Type")
plt.ylabel("Frequency")
plt.show()
print("Conclusion: Number of movies>Tv shows")

In [None]:
#histogram of the release years of all movies/TV shows, grouped into 10 bins
plt.hist(netflix_data2['release_year'], bins=10, color='maroon', edgecolor='white')
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.title("Histogram of release year")
plt.show()
print("Conclusion: Number of movies/Tv shows have drastically increased over the years")

In [None]:
#plotting pie chart of rating
#counts and labels are read from the data instead of being typed in by hand;
#value_counts() orders them from most to least frequent and skips missing ratings
rating_counts = netflix_data2['rating'].value_counts()
counts = rating_counts.tolist()
Type = rating_counts.index.tolist()
plt.pie(counts, labels=Type, autopct='%.1f%%',shadow=True, startangle=60)
#equal axis scaling keeps the pie circular
plt.axis('equal')
plt.show()
print("Conclusion: Maximum movies/TV shows have TV-MA rating and minimum have NC-17 rating")

In [None]:
#plotting multiple diagrams
#the figure size is set once here; the plot calls below used to pass
#figsize=(10,8) as well, which resized this figure away from the (9,9)
#requested at creation
plt.figure(figsize=(10,8))
plt.subplots_adjust(hspace=.5,wspace=.5)

plt.subplot(221)
netflix_data2['type'].value_counts(normalize=True).plot(kind='bar',color='pink')
plt.title("Type frequency diagram")
#normalize=True yields each type's share of the total, not a raw count
plt.ylabel('Proportion')

plt.subplot(222)
#value_counts() orders by frequency; sort by year so the line runs chronologically
netflix_data2['release_year'].value_counts(normalize= True).sort_index().plot(kind='line',color='violet')
plt.title("Year frequency diagram")
plt.ylabel('Proportion')
plt.xlabel('Year')