<a href="https://colab.research.google.com/github/nandu26m/data-analytics-projects/blob/main/netflix-data-analytics-project/netflix-data-analytics-project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies and print the pandas version so the run environment is recorded.
import pandas as pd
print(pd.__version__)

2.2.2


In [2]:
# Load the Netflix catalogue; the relative path is resolved against the working directory.
csv_path = "netflix.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first 2 rows to see the column layout and typical values.
data.head(2)

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."


In [4]:
# Preview the last 2 rows of the frame.
data.tail(2)

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
8788,s8784,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV
8789,s8786,TV Show,YOM,Not Given,Pakistan,6/7/2018,2016,TV-Y7,1 Season,Kids' TV


In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(8790, 10)

In [6]:
# Summary statistics; by default describe() only covers the numeric columns.
data.describe()

Unnamed: 0,release_year
count,8790.0
mean,2014.183163
std,8.825466
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [7]:
# Column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


## Step 1: Deleting redundant columns.

In [8]:
# List the column names before deciding which ones to drop.
data.columns

Index(['show_id', 'type', 'title', 'director', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in'],
      dtype='object')

In [9]:
# Drop columns that are not needed for this analysis (currently only "rating").
# Reassigning instead of using inplace=True, together with errors="ignore",
# lets this cell be re-run without raising KeyError once the column is gone.
columns_to_remove = ["rating"]
data = data.drop(columns=columns_to_remove, errors="ignore")

## Step 2: Renaming the columns.

In [10]:
# Current column names, as the starting point for renaming.
data.columns

Index(['show_id', 'type', 'title', 'director', 'country', 'date_added',
       'release_year', 'duration', 'listed_in'],
      dtype='object')

In [11]:
# Rename specific columns by passing an {old_name: new_name} mapping.
# rename() returns a new frame, so reassign rather than mutating with inplace=True.
data = data.rename(columns={
    "date_added": "Date_added",
    "country": "Country",
})

data.columns

Index(['show_id', 'type', 'title', 'director', 'Country', 'Date_added',
       'release_year', 'duration', 'listed_in'],
      dtype='object')

In [12]:
# Rename every column in one go: str.capitalize upper-cases the first character
# of each name and lower-cases the rest.
updated_column_names = list(map(str.capitalize, data.columns))
data.columns = updated_column_names

data.columns

Index(['Show_id', 'Type', 'Title', 'Director', 'Country', 'Date_added',
       'Release_year', 'Duration', 'Listed_in'],
      dtype='object')

## Step 3: Dropping duplicates.

In [13]:
# Boolean Series: True marks a row that is an exact repeat of an earlier row.
data.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
8785,False
8786,False
8787,False
8788,False


In [14]:
data.duplicated().sum() # The output is 0, which means there are no duplicate rows.

np.int64(0)

In [15]:
# Remove duplicate rows, if there are any.
# Reassign instead of using inplace=True, and rebuild a contiguous 0..n-1 index
# so the row labels have no gaps when rows are actually removed.
data = data.drop_duplicates().reset_index(drop=True)

## Step 4: Remove the NaN values from the dataset.

In [16]:
# Element-wise missing-value mask: True wherever a cell is NaN/None.
data.isna()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
8785,False,False,False,False,False,False,False,False,False
8786,False,False,False,False,False,False,False,False,False
8787,False,False,False,False,False,False,False,False,False
8788,False,False,False,False,False,False,False,False,False


In [17]:
data.isna().sum() # Every count is 0, so there are no NaN values. Placeholder strings such as "Not Given" (seen in the director column) are not counted as NaN.

Unnamed: 0,0
Show_id,0
Type,0
Title,0
Director,0
Country,0
Date_added,0
Release_year,0
Duration,0
Listed_in,0


In [18]:
# Remove rows containing NaN values, if there are any.
# Reassign instead of using inplace=True, and rebuild a contiguous 0..n-1 index
# so that label-based lookups such as data["Show_id"][0] still work if row 0 is dropped.
data = data.dropna().reset_index(drop=True)

## Step 5: Apply further transformations.

In [19]:
# Look at the current state of the data before transforming "Show_id": the following
# cells remove its non-numeric characters and keep only the integer value.
# A bare expression (instead of print) gives the notebook's rich table display.
data.head(2)

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."


In [24]:
# Inspect the structure of the "Show_id" column.
# Print the unique count next to the row count: when the two match, there are no
# duplicate ids. A bare expression here would be discarded, because a cell only
# displays the value of its last expression.
print(data["Show_id"].nunique(), "unique Show_id values in", len(data), "rows")

# The unique values reveal the pattern: a leading "s" followed by digits.
data["Show_id"].unique()

array(['s1', 's3', 's6', ..., 's8801', 's8784', 's8786'], dtype=object)

In [26]:
# Remove the leading "s" from the "Show_id" column; the values stay strings here.
# str.removeprefix strips only the prefix itself, whereas split("s")[1] would
# truncate at a second "s" and raise IndexError for a value without one.
# astype(str) keeps the cell re-runnable even after Show_id has been cast to int.
data["Show_id"] = data["Show_id"].astype(str).str.removeprefix("s")

In [27]:
# Confirm that the leading "s" is gone from Show_id.
data.head(2)

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."


In [29]:
# Check the Python type of the first Show_id value.
# .iloc[0] selects by position; plain [0] is a label lookup and raises KeyError
# when no row carries the index label 0.
type(data["Show_id"].iloc[0])

str

In [33]:
# Cast the digit strings to integers so Show_id holds numeric values.
show_id_numbers = data["Show_id"].astype(int)
data["Show_id"] = show_id_numbers

In [34]:
# Check the type of the first Show_id value again, after the cast.
# .iloc[0] selects by position; plain [0] is a label lookup and raises KeyError
# when no row carries the index label 0.
type(data["Show_id"].iloc[0])

numpy.int64

In [37]:
# Swap the "/" separators for "-" (e.g. 9/25/2021 -> 9-25-2021).
# This is a literal text replacement, so the values remain strings.
# NOTE(review): consider pd.to_datetime if date arithmetic is needed later.
dashed_dates = data["Date_added"].str.replace("/", "-", regex=False)
data["Date_added"] = dashed_dates

In [38]:
# Preview the first 5 rows to verify the Show_id and Date_added changes.
data.head()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9-25-2021,2020,90 min,Documentaries
1,3,TV Show,Ganglands,Julien Leclercq,France,9-24-2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,6,TV Show,Midnight Mass,Mike Flanagan,United States,9-24-2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9-22-2021,2021,91 min,"Children & Family Movies, Comedies"
4,8,Movie,Sankofa,Haile Gerima,United States,9-24-2021,1993,125 min,"Dramas, Independent Movies, International Movies"
