### Information regarding the code

This script creates the channel dataset based on the selection criteria for the study. 

In [1]:
import pandas as pd
import numpy as np
import csv

## Loading channels database

In [2]:
# The file is decoded as latin-1 because reading it with the default codec failed with:
# "'utf-8' codec can't decode byte 0xe9 in position 10158: invalid continuation byte"
db_cha = pd.read_csv(
    'df_channels_en.csv',
    sep=';',
    encoding='latin-1',
)

# Print the column overview (dtypes and non-null counts) of the raw channel table.
db_cha.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136470 entries, 0 to 136469
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   category_cc         136342 non-null  object 
 1   join_date           136469 non-null  object 
 2   channel             136470 non-null  object 
 3   name_cc             136460 non-null  object 
 4   subscribers_cc      136470 non-null  int64  
 5   videos_cc           136470 non-null  int64  
 6   subscriber_rank_sb  136470 non-null  float64
 7   weights             136470 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 8.3+ MB


In [3]:
# Count how many rows each channel id has, as a one-column frame indexed by channel.
# If this frame has as many entries as db_cha has rows, every channel id occurs exactly once.
rows_per_channel = db_cha.groupby(['channel']).size()
db_nb_cha = rows_per_channel.to_frame()
db_nb_cha.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136470 entries, UC--24Q3_ZQeFmgJE-Um5QZQ to UCzzzZ3-icktxbC3j7hkWqRw
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   0       136470 non-null  int64
dtypes: int64(1)
memory usage: 2.1+ MB


## Sample criteria

### Creation date

In [4]:
# Creation-date criterion: flag channels whose join_date is on or before the cutoff.
#
# NOTE(review): the previous comment quoted the proposal as "prior to 1st January 2015",
# but the code compares against '2016-01-01' and includes that day itself (<=).
# The cutoff is left unchanged so the sample stays the same; confirm which date is
# intended and update CREATION_CUTOFF (and the operator) if the proposal is authoritative.
#
# join_date is stored as strings, so the comparison is lexicographic; this matches
# chronological order only if every date is zero-padded ISO (YYYY-MM-DD) — TODO confirm.
CREATION_CUTOFF = '2016-01-01'

condition=[(db_cha['join_date']<=CREATION_CUTOFF),
          (db_cha['join_date']>CREATION_CUTOFF)]
values=[1,0]

# 1 = joined on/before the cutoff, 0 = joined after it.
# Rows matching neither condition (e.g. a missing join_date) get np.select's default of 0.
db_cha['prior_crea_date']=np.select(condition, values)

In [5]:
# Retain only the rows flagged 1 in prior_crea_date, i.e. those meeting the creation-date criterion.
passes_creation_date = db_cha['prior_crea_date'].eq(1)
db_cha = db_cha[passes_creation_date]

In [6]:
# Column overview (dtypes and non-null counts) of db_cha at this point in the notebook.
db_cha.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100451 entries, 0 to 136469
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   category_cc         100329 non-null  object 
 1   join_date           100451 non-null  object 
 2   channel             100451 non-null  object 
 3   name_cc             100443 non-null  object 
 4   subscribers_cc      100451 non-null  int64  
 5   videos_cc           100451 non-null  int64  
 6   subscriber_rank_sb  100451 non-null  float64
 7   weights             100451 non-null  float64
 8   prior_crea_date     100451 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(4)
memory usage: 7.3+ MB


### Channels with complete consecutive time-series data

In [7]:
# Load the cleaned time-series table and collect the distinct channel ids it contains;
# this list is what the channel sample is restricted to.
# NOTE(review): only presence in timeseries_cleaned.csv is checked here — presumably the
# restriction to the studied period happened when that file was produced; confirm.
db_time = pd.read_csv('timeseries_cleaned.csv', sep=';')

distinct_channels = db_time['channel'].unique()
channel_time = distinct_channels.tolist()

# Number of distinct channels present in the time-series table.
len(channel_time)

126297

In [8]:
# Keep only the channels whose id appears in the time-series channel list.
in_timeseries = db_cha['channel'].isin(channel_time)
db_cha = db_cha[in_timeseries]
db_cha.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95859 entries, 0 to 136469
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   category_cc         95738 non-null  object 
 1   join_date           95859 non-null  object 
 2   channel             95859 non-null  object 
 3   name_cc             95853 non-null  object 
 4   subscribers_cc      95859 non-null  int64  
 5   videos_cc           95859 non-null  int64  
 6   subscriber_rank_sb  95859 non-null  float64
 7   weights             95859 non-null  float64
 8   prior_crea_date     95859 non-null  int32  
dtypes: float64(2), int32(1), int64(2), object(4)
memory usage: 6.9+ MB


In [10]:
# Number of rows left in the channel column of db_cha.
db_cha['channel'].size

95859

In [12]:
# Write the filtered channel table to disk as a semicolon-separated file.
# The DataFrame index is written as well (to_csv default), as an unnamed first column.
db_cha.to_csv(
    path_or_buf='channels_criteria.csv',
    sep=';',
)

In [11]:
# Preview the first five rows of db_cha.
db_cha.head(n=5)

Unnamed: 0,category_cc,join_date,channel,name_cc,subscribers_cc,videos_cc,subscriber_rank_sb,weights,prior_crea_date
0,Gaming,2010-04-29,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,101000000,3956,3.0,2.087,1
1,Education,2006-09-01,UCbCmjCuTUZos6Inko4u57UQ,Cocomelon - Nursery ...,60100000,458,7.0,2.087,1
2,Entertainment,2006-09-20,UCpEhnqL0y41EpW2TvWAHD7Q,SET India,56018869,32661,8.0,2.087,1
4,Sports,2007-05-11,UCJ5v_MCY6GNUBTO8-D3XoAg,WWE,48400000,43421,11.0,2.087,1
5,Entertainment,2007-01-15,UCIwFjwMjI0y7PDBVEO9-bkQ,Justin Bieber,46574085,134,12.0,2.087,1
