# Reading the dataSet

In [1]:
import pandas as pd 
import numpy as np
df = pd.read_csv('VideosUS.csv') 

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40949 entries, 0 to 41414
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   video_id                40949 non-null  object 
 1   trending_date           40949 non-null  object 
 2   title                   40949 non-null  object 
 3   channel_title           40949 non-null  object 
 4   category_id             40949 non-null  float64
 5   publish_time            40949 non-null  object 
 6   tags                    40949 non-null  object 
 7   views                   40949 non-null  float64
 8   likes                   40949 non-null  float64
 9   dislikes                40949 non-null  float64
 10  comment_count           40949 non-null  float64
 11  thumbnail_link          40949 non-null  object 
 12  comments_disabled       40949 non-null  object 
 13  ratings_disabled        40949 non-null  object 
 14  video_error_or_removed  40949 non-null

# Checking if there are some missing values 

In [3]:
df.isna().any().any()

True

# Check number of missing values 

In [4]:
df.isna().sum()

video_id                     0
trending_date              466
title                      466
channel_title              466
category_id                466
publish_time               466
tags                       466
views                      466
likes                      466
dislikes                   466
comment_count              466
thumbnail_link             466
comments_disabled          466
ratings_disabled           466
video_error_or_removed     466
description               1036
dtype: int64

We Notice that the number of missing values is the same in each column we can say that we might be dealing with empty rows its a MAR case some videos have less informations provided, but for the discribtion its a MNAR since not all videos have discribtions

In [5]:
df[df['title'].isna()]

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
53,\nSubscribe to WWE on YouTube: http://bit.ly/1...,,,,,,,,,,,,,,,
54,\nVisit WWE.com: http://goo.gl/akf0J4,,,,,,,,,,,,,,,
55,\nMust-See WWE videos on YouTube: https://goo....,,,,,,,,,,,,,,,
319,\nSubscribe to WWE on YouTube: http://bit.ly/1...,,,,,,,,,,,,,,,
320,\nVisit WWE.com: http://goo.gl/akf0J4,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36570,Follow me on Twitter ► https://twitter.com/mar...,,,,,,,,,,,,,,,
36571,Like me on Facebook ► https://www.facebook.com...,,,,,,,,,,,,,,,
36572,Join us on Reddit! ► https://www.reddit.com/r/...,,,,,,,,,,,,,,,
36573,Horror Outro ► https://soundcloud.com/shurkoff...,,,,,,,,,,,,,,,


# Cleaning the data

In [6]:
df['description'].fillna('',inplace = True)

In [8]:
df.dropna(inplace = True)

# Calcuations

In [11]:
numFrame = df[['category_id','views','likes','dislikes','comment_count']]
numFrame

Unnamed: 0,category_id,views,likes,dislikes,comment_count
0,22.0,748374.0,57527.0,2966.0,15954.0
1,24.0,2418783.0,97185.0,6146.0,12703.0
2,23.0,3191434.0,146033.0,5339.0,8181.0
3,24.0,343168.0,10172.0,666.0,2146.0
4,24.0,2095731.0,132235.0,1989.0,17518.0
...,...,...,...,...,...
41410,15.0,1685609.0,38160.0,1385.0,2657.0
41411,22.0,1064798.0,60008.0,382.0,3936.0
41412,24.0,1066451.0,48068.0,1032.0,3992.0
41413,1.0,5660813.0,192957.0,2846.0,13088.0


In [12]:
numFrame.mean()

category_id      1.997243e+01
views            2.360785e+06
likes            7.426670e+04
dislikes         3.711401e+03
comment_count    8.446804e+03
dtype: float64

In [13]:
numFrame.median()

category_id          24.0
views            681861.0
likes             18091.0
dislikes            631.0
comment_count      1856.0
dtype: float64

In [14]:
numFrame.quantile([0.25,0.5,0.75])

Unnamed: 0,category_id,views,likes,dislikes,comment_count
0.25,17.0,242329.0,5424.0,202.0,614.0
0.5,24.0,681861.0,18091.0,631.0,1856.0
0.75,25.0,1823157.0,55417.0,1938.0,5755.0


# Removing outliers

In [15]:
from scipy import stats
z = np.abs(stats.zscore(numFrame))
z

Unnamed: 0,category_id,views,likes,dislikes,comment_count
0,0.267905,0.218069,0.073137,0.025677,0.200566
1,0.532168,0.007844,0.100131,0.083867,0.113711
2,0.400037,0.112341,0.313551,0.056067,0.007101
3,0.532168,0.272871,0.280033,0.104908,0.168336
4,0.532168,0.035847,0.253267,0.059333,0.242351
...,...,...,...,...,...
41410,0.657013,0.091314,0.157752,0.080140,0.154683
41411,0.267905,0.175275,0.062297,0.114691,0.120513
41412,0.532168,0.175051,0.114464,0.092300,0.119017
41413,2.506850,0.446310,0.518564,0.029811,0.123997


In [18]:
numFrame_or = numFrame[(z < 3).all(axis=1)]
numFrame_or.shape , numFrame.shape

((40029, 5), (40949, 5))

In [19]:
df.nunique()

video_id                   6351
trending_date               205
title                      6455
channel_title              2207
category_id                  16
publish_time               6269
tags                       6055
views                     40478
likes                     29850
dislikes                   8516
comment_count             13773
thumbnail_link             6352
comments_disabled             2
ratings_disabled              2
video_error_or_removed        2
description                6902
dtype: int64

# change type

yes there are some types that can be changed such as comments disabled,ratings_disabled and video error the other categorical data can't be changed 

In [20]:
df['tags'].nunique()

6055

In [25]:
df['tags'].value_counts().idxmax()

'[none]'