In [1]:
import pandas as pd
import numpy as np

In [2]:
db_video = pd.read_csv('video_dataset_channel_criteria.csv', sep=';')
db_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18699177 entries, 0 to 18699176
Data columns (total 10 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   Unnamed: 0    int64 
 1   category      object
 2   channel       object
 3   date_crawled  object
 4   description   object
 5   id            object
 6   duration      int64 
 7   tags          object
 8   title         object
 9   upload        object
dtypes: int64(2), object(8)
memory usage: 1.4+ GB


In [3]:
db_video['channel'].nunique()

90688

## Creating upload video period variables

#### Selecting videos published between 15th August 2016 and 29th October 2017

In [4]:
db_video['upload']=pd.to_datetime(db_video['upload'], yearfirst=True)

In [5]:
# Dropping rows for videos published before 15th August 2016
db_video = db_video[db_video['upload']>='2016-08-15']

# Dropping rows for videos published after 29th October 2017
db_video = db_video[db_video['upload']<'2017-10-30']

#### Creating time event variables

1st content moderation update (1st adpocalypse) = 12th week (March 2017)

In [6]:
db_video['week']=db_video['upload'].apply(lambda x: x.strftime(format='%Y-%W')) 

In [7]:
# Videos published before any YouTube"s update
condition=[(db_video["upload"]>"2017-03-26"),
           (db_video["upload"].between("2017-03-20", "2017-03-26")),
          (db_video["upload"]<"2017-03-20")]
value=[0, None, 1]
db_video["before_updates"]=np.select(condition, value)


## Processing video categories

In [8]:
# Checking if there is missing video category name
db_video['category'].unique()

array(['Sports', 'Gaming', 'Science & Technology', 'People & Blogs',
       'Film & Animation', 'Howto & Style', 'Comedy', 'Entertainment',
       'News & Politics', 'Education', 'Music', 'Pets & Animals', nan,
       'Travel & Events', 'Nonprofits & Activism', 'Autos & Vehicles'],
      dtype=object)

In [9]:
# Rename the missing category into unknown
db_video['category']=db_video['category'].fillna('Unknown')

In [10]:
db_video

Unnamed: 0.1,Unnamed: 0,category,channel,date_crawled,description,id,duration,tags,title,upload,week,before_updates
27,127,Sports,UCzWn_gTaXyH5Idyo8Raf7_A,2019-11-03 16:39:12.116589,Fishing for flathead catfish and channel catfi...,UsuYXdyieFU,1080,"flathead,flathead catfish,channel catfish,chan...",How to catch flathead catfish with live bait -...,2017-10-28,2017-43,0
28,128,Sports,UCzWn_gTaXyH5Idyo8Raf7_A,2019-11-03 16:39:12.784544,Gar fishing with lures we made ourselves from ...,P7kBIPXegWM,921,"Gar,longnose gar,spotted gar,catch gar,fishing...",Catching gar with rope?!! Fishing for gar with...,2017-10-21,2017-42,0
29,129,Sports,UCzWn_gTaXyH5Idyo8Raf7_A,2019-11-03 16:39:13.428325,1600 dad jokes were submitted but only one cou...,phF8Vtg_iH8,526,"Dad jokes,cheesey jokes,jokes,puns,cheesy puns...",Best of the Worst Dad Jokes!!! Catfish rod win...,2017-10-20,2017-42,0
30,130,Sports,UCzWn_gTaXyH5Idyo8Raf7_A,2019-11-03 16:39:14.142225,Epic dad jokes battle between me and my wife w...,zV1r_QT2S8w,740,"Dad Joke,Dad jokes,Dad jokes battle,Dad joke b...",Dad Jokes Battle While Fishing for Catfish!!! ...,2017-10-14,2017-41,0
31,131,Sports,UCzWn_gTaXyH5Idyo8Raf7_A,2019-11-03 16:39:14.875664,Fall is here are the big catfish are everywher...,HYR-7tdy1SE,569,"catfish,catfishing,fishing for catfish,big cat...",Catching BIG catfish with little kids - catfis...,2017-10-07,2017-40,0
...,...,...,...,...,...,...,...,...,...,...,...,...
18699171,2373542,People & Blogs,UCrwMvZOgSvS2srsIjY-iIGg,2019-11-19 08:52:36.122483,HEAVY K SWEETIE SWEETIE,Kw6-qUsQFmc,84,,SWEETIE SWEETIE DANCE,2017-01-20,2017-03,1
18699172,2373543,People & Blogs,UCrwMvZOgSvS2srsIjY-iIGg,2019-11-19 08:52:36.729797,CHEEMA AND KHUSA IN AFRICA,ONr0TugpkmY,131,,SOUTH AFRICAN BLACK CHEEMA SPEAKS PUNJABI,2017-01-18,2017-03,1
18699173,2373544,People & Blogs,UCrwMvZOgSvS2srsIjY-iIGg,2019-11-19 08:52:37.239115,very funny,XdJFbBO3J7Q,252,,ARAB BOY AND USA LIVE CHAT,2017-01-17,2017-03,1
18699174,2373545,People & Blogs,UCrwMvZOgSvS2srsIjY-iIGg,2019-11-19 08:52:37.819109,PAKISTAN VS AUSTRALIA 2ND ODI 15 JAN 2017 HIGH...,syfKxRYAEKI,947,,PAKISTAN VS AUSTRALIA 2ND ODI 15 JAN 2017 HIGH...,2017-01-16,2017-03,1


In [11]:
db_video.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11845978 entries, 27 to 18699175
Data columns (total 12 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Unnamed: 0      int64         
 1   category        object        
 2   channel         object        
 3   date_crawled    object        
 4   description     object        
 5   id              object        
 6   duration        int64         
 7   tags            object        
 8   title           object        
 9   upload          datetime64[ns]
 10  week            object        
 11  before_updates  object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 1.1+ GB


In [12]:
db_video.to_csv('video_database_per_period_selected.csv', sep=';')

In [13]:
db_video['channel'].nunique()

88493