In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.5.3'

In [3]:
!ls youtube-data/youtube-csv-data/

CAvideos.csv FRvideos.csv INvideos.csv KRvideos.csv RUvideos.csv
DEvideos.csv GBvideos.csv JPvideos.csv MXvideos.csv USvideos.csv


First, I'm going to quickly explore the first few rows to get a feeling for the kind of data that I'm going to be working with.

In [4]:
#Let's take the Germany data as an example, since it is relevant for task_1 of the assignment

In [5]:
#Load the raw German trending-videos csv; no encoding is passed, so pandas' default decoding is used
videos_df = pd.read_csv('./youtube-data/youtube-csv-data/DEvideos.csv')

In [6]:
videos_df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,LgVi6y5QIjM,17.14.11,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,24,2017-11-13T17:08:49.000Z,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Heute gibt es mal wieder ein neues Format... w...
1,Bayt7uQith4,17.14.11,Kinder ferngesteuert im Kiosk! Erwachsene abzo...,LUKE! Die Woche und ich,23,2017-11-12T22:30:01.000Z,"Kinder|""ferngesteuert""|""Kinder ferngesteuert""|...",797196,53576,302,1278,https://i.ytimg.com/vi/Bayt7uQith4/default.jpg,False,False,False,Kinder ferngesteuert! Kinder lassen sich sooo ...
2,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
3,AHtypnRk7JE,17.14.11,Das Fermi-Paradoxon,100SekundenPhysik,27,2017-11-12T15:00:01.000Z,"Physik|""Wissenschaft""|""Technik""|""Science-Ficti...",380247,31821,458,1955,https://i.ytimg.com/vi/AHtypnRk7JE/default.jpg,False,False,False,►Alle Videos: http://bit.ly/1fa7Tw3\n\n\n✚Snap...
4,ZJ9We4bjcg0,17.14.11,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,24,2017-11-12T13:10:36.000Z,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,18 Song Mashup über den (veränderten) Beat von...


I see that we probably won't need these columns: **video_id, title, publish_time, tags, comment_count, thumbnail_link, comments_disabled, ratings_disabled** and **description**, so I will drop them. They are not needed for any task in this assignment, although they could be useful in other kinds of analyses.

--- 
I will leave the column **video_error_or_removed** because according to metadata:

 **"video_error_or_removed: If this is true, it means that the video was removed by the user, or some unspecified error occurred during collection."**

For simplicity's sake, I will assume that the video has been removed whenever this column has the value **True**.

--- 

Columns like **trending_date** could be used to track how many times a given channel appeared in trending. Apart from the number of views, that could, in my opinion, be a good candidate measure of popularity.

For example, if one **channel_title** has been in trending 3 times, then it is more popular than a **channel_title** that has never been there. Yes, it might be possible to "trick" the YT algorithm into putting a video in Trending. Those kinds of things are done daily in the Balkans, especially when it comes to (trashy) music, but for simplicity's sake we will say that the YT recommendation algorithm is a credible, unbiased measure of popularity.

Since the last task of this assignment requires finding channels that are popular in most countries, I think it would be a good idea to add a column holding the country name. After transforming each table and getting it ready for visualization, I will probably UNION ALL the tables and then add another measure of popularity: counting the number of countries each unique channel_title is popular in (GROUP BY), and trying to extract a column that lists all the countries the channel was popular in (GROUP_CONCAT).

For now, I will first concentrate on how to join this table with the JSON files so that I can obtain the category name for each **category_id**, which can differ from country to country.

In [7]:
#Columns kept for the analysis; video_error_or_removed is retained only so that removed videos can be filtered out below
COLUMNS_USED = ['channel_title', 'category_id', 'views', 'likes', 'dislikes', 'video_error_or_removed']

# Preparing Germany .csv 

In [8]:
#Checking if there are videos which are removed
videos_df["video_error_or_removed"].value_counts()

False    40826
True        14
Name: video_error_or_removed, dtype: int64

In [9]:
#Sanity check: count the rows that the upcoming filter is going to keep
len(videos_df.loc[videos_df['video_error_or_removed'].eq(False), COLUMNS_USED])

40826

We see that the number of **False** rows we are about to select is equal to the count previously shown by
```value_counts()```

In [10]:
#Keep only the rows of videos that were not removed, restricted to the columns of interest
videos_df = videos_df.loc[videos_df['video_error_or_removed'].eq(False), COLUMNS_USED]

In [11]:
#...then discard the unnecessary helper column video_error_or_removed
#(errors='ignore' keeps the cell safe to re-run once the column is already gone)
videos_df = videos_df.drop(columns='video_error_or_removed', errors='ignore')
videos_df.head()

Unnamed: 0,channel_title,category_id,views,likes,dislikes
0,inscope21,24,252786,35885,230
1,LUKE! Die Woche und ich,23,797196,53576,302
2,LastWeekTonight,24,2418783,97190,6146
3,100SekundenPhysik,27,380247,31821,458
4,rezo,24,822213,100684,2467


In [12]:
#Read the German category JSON; each row of the resulting frame carries one category dict
#in its 'items' column (see the preview in the next cells)
videos_json = pd.read_json('./youtube-data/youtube-json-data/DE_category_id.json')

In [13]:
videos_json.head()

Unnamed: 0,kind,etag,items
0,youtube#videoCategoryListResponse,"""ld9biNPKjAjgjV7EZ4EKeEGrhao/1v2mrzYSYG6onNLt2...","{'kind': 'youtube#videoCategory', 'etag': '""ld..."
1,youtube#videoCategoryListResponse,"""ld9biNPKjAjgjV7EZ4EKeEGrhao/1v2mrzYSYG6onNLt2...","{'kind': 'youtube#videoCategory', 'etag': '""ld..."
2,youtube#videoCategoryListResponse,"""ld9biNPKjAjgjV7EZ4EKeEGrhao/1v2mrzYSYG6onNLt2...","{'kind': 'youtube#videoCategory', 'etag': '""ld..."
3,youtube#videoCategoryListResponse,"""ld9biNPKjAjgjV7EZ4EKeEGrhao/1v2mrzYSYG6onNLt2...","{'kind': 'youtube#videoCategory', 'etag': '""ld..."
4,youtube#videoCategoryListResponse,"""ld9biNPKjAjgjV7EZ4EKeEGrhao/1v2mrzYSYG6onNLt2...","{'kind': 'youtube#videoCategory', 'etag': '""ld..."


In [14]:
#Finding ways to access specific values

In [15]:
videos_json['items'][0]

{'kind': 'youtube#videoCategory',
 'etag': '"ld9biNPKjAjgjV7EZ4EKeEGrhao/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
 'id': '1',
 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
  'title': 'Film & Animation',
  'assignable': True}}

In [16]:
videos_json['items'][0]['id'], videos_json['items'][0]['snippet']['title']

('1', 'Film & Animation')

In [17]:
#I'm planning to collect all the tuples of the form (category_id, category) and build a dataframe
#from them, which will be used to JOIN the tables

In [18]:
#Pull the per-category dicts out of the 'items' column into a plain Python list
list_json = videos_json['items'].tolist()

In [19]:
category_list = []

In [20]:
#Let's see what datatype category_id has, so that we can populate the DataFrame properly and avoid
#issues when joining the two DataFrames on the same column
videos_df.dtypes

channel_title    object
category_id       int64
views             int64
likes             int64
dislikes          int64
dtype: object

In [21]:
#Build the (category_id, category) pairs in one pass, casting the id to an integer so that it
#matches the int64 category_id column of videos_df.
#A fresh list is assigned here instead of appending to one created in another cell: this keeps
#the cell idempotent, so re-running it cannot duplicate categories (which would in turn
#duplicate video rows in the later merge).
category_list = [(int(element['id']), element['snippet']['title']) for element in list_json]

In [22]:
#Turn the (category_id, category) pairs into a lookup DataFrame that can be joined
#to videos_df on category_id
category_df = pd.DataFrame(category_list, columns =['category_id', 'category'])

In [23]:
category_df.head()

Unnamed: 0,category_id,category
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [24]:
#A left join keeps every video row even if its category_id is not listed
#(due to an API error?) in the corresponding JSON file; such rows get a missing category
videos_df = pd.merge(videos_df, category_df, how='left', on='category_id')
videos_df.head()

Unnamed: 0,channel_title,category_id,views,likes,dislikes,category
0,inscope21,24,252786,35885,230,Entertainment
1,LUKE! Die Woche und ich,23,797196,53576,302,Comedy
2,LastWeekTonight,24,2418783,97190,6146,Entertainment
3,100SekundenPhysik,27,380247,31821,458,Education
4,rezo,24,822213,100684,2467,Entertainment


In [25]:
#The numeric category_id is unnecessary now that the category name is attached, so drop it
#(errors='ignore' keeps the cell safe to re-run once the column is already gone)
videos_df = videos_df.drop(columns="category_id", errors='ignore')
videos_df.head()

Unnamed: 0,channel_title,views,likes,dislikes,category
0,inscope21,252786,35885,230,Entertainment
1,LUKE! Die Woche und ich,797196,53576,302,Comedy
2,LastWeekTonight,2418783,97190,6146,Entertainment
3,100SekundenPhysik,380247,31821,458,Education
4,rezo,822213,100684,2467,Entertainment


In [26]:
#Finally, tag every row with the country the data belongs to
videos_df = videos_df.assign(country='DE')

In [27]:
videos_df.head()

Unnamed: 0,channel_title,views,likes,dislikes,category,country
0,inscope21,252786,35885,230,Entertainment,DE
1,LUKE! Die Woche und ich,797196,53576,302,Comedy,DE
2,LastWeekTonight,2418783,97190,6146,Entertainment,DE
3,100SekundenPhysik,380247,31821,458,Education,DE
4,rezo,822213,100684,2467,Entertainment,DE


In [28]:
#Export the prepared German data (index = False keeps the row index out of the file).
#Once it is on disk, the name videos_df can be reused for the other countries
videos_df.to_csv('./exported-data/de_videos.csv', index = False)

At this point I think it is safe to make a function and automate the process for the rest of the countries!

In [29]:
!ls youtube-data/youtube-csv-data/

CAvideos.csv FRvideos.csv INvideos.csv KRvideos.csv RUvideos.csv
DEvideos.csv GBvideos.csv JPvideos.csv MXvideos.csv USvideos.csv


In [30]:
#Country abbreviations (file-name prefixes) of the remaining csv files;
#'DE' is left out because it has already been processed above
REST_OF_THE_COUNTRIES = ['CA', 'FR', 'IN', 'KR', 'RU', 'GB', 'JP', 'MX', 'US']

In [31]:
#Let's define the functions to help organize the code better
def csv_reading_and_transformation(country_abb):
    """Load one country's trending-videos csv and keep only non-removed videos.

    Parameters
    ----------
    country_abb : str
        File-name prefix of the country, e.g. 'CA' for 'CAvideos.csv'.

    Returns
    -------
    pandas.DataFrame
        The columns channel_title, category_id, views, likes and dislikes for
        every row whose video_error_or_removed flag is False. The original row
        index is preserved.
    """
    COLUMNS_USED = ['channel_title', 'category_id', 'views', 'likes', 'dislikes', 'video_error_or_removed']

    #Decode as UTF-8, the same decoding the German file gets from pandas' default, so that
    #channel titles are spelled identically across countries. Decoding as latin-1 never
    #fails, but it turns every multi-byte UTF-8 character into mojibake.
    #encoding_errors='replace' substitutes U+FFFD for stray undecodable bytes instead of
    #aborting the whole read.
    #NOTE(review): assumes the other exports are UTF-8 like the German one -- confirm
    videos_df = pd.read_csv(
        f'./youtube-data/youtube-csv-data/{country_abb}videos.csv',
        encoding='utf-8',
        encoding_errors='replace',
    )

    #Select rows and columns in a single .loc call instead of chained indexing
    not_removed = videos_df['video_error_or_removed'] == False
    videos_df = videos_df.loc[not_removed, COLUMNS_USED]

    #The flag was only needed for filtering
    return videos_df.drop(columns='video_error_or_removed')

def json_reading_and_transformation(country_abb):
    """Build the category lookup table for one country.

    Parameters
    ----------
    country_abb : str
        File-name prefix of the country, e.g. 'CA' for 'CA_category_id.json'.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the file's 'items' list, with the columns
        category_id (the entry's 'id' cast to int) and category (the entry's
        snippet title). The frame is empty, with the same two columns, when
        'items' is empty.
    """
    #Reading and transforming .json files
    videos_json = pd.read_json(f'./youtube-data/youtube-json-data/{country_abb}_category_id.json')

    #The id is cast to int so that it can be joined against the integer category_id of the csv data
    category_list = [
        (int(element['id']), element['snippet']['title'])
        for element in videos_json['items']
    ]

    #Build the frame once, after all pairs are collected. Constructing it inside the loop
    #repeated the work on every iteration and left the result undefined for an empty 'items' list.
    return pd.DataFrame(category_list, columns=['category_id', 'category'])

def final_transformations(country_abb, csv_df, json_df):
    """Attach category names and the country tag to a country's videos and export them.

    Parameters
    ----------
    country_abb : str
        Country abbreviation; written as-is into the 'country' column and
        lower-cased for the export file name.
    csv_df : pandas.DataFrame
        Video rows containing a category_id column.
    json_df : pandas.DataFrame
        Lookup table with the columns category_id and category.

    Returns
    -------
    str
        Always "Finished!". The result is written to
        './exported-data/<country_abb lower-cased>_videos.csv' without the index.
    """
    enriched = (
        csv_df
        #left join: videos whose category_id has no match are kept with a missing category
        .merge(json_df, on='category_id', how='left')
        #the numeric id is redundant once the category name is attached
        .drop(columns="category_id")
        .assign(country=f'{country_abb}')
    )

    enriched.to_csv(f'./exported-data/{country_abb.lower()}_videos.csv', index = False)

    return "Finished!"

In [32]:
#Run the whole pipeline (read csv -> read categories -> merge and export) for each remaining country
for each_country in REST_OF_THE_COUNTRIES:
    final_transformations(
        each_country,
        csv_reading_and_transformation(each_country),
        json_reading_and_transformation(each_country),
    )

Let's now combine all the csv files into one such that it will be easier to deal with in Tableau

In [33]:
#Lower-case the abbreviations so that they match the names of the exported files
REST_OF_THE_COUNTRIES = list(map(str.lower, REST_OF_THE_COUNTRIES))

In [34]:
print(REST_OF_THE_COUNTRIES)

['ca', 'fr', 'in', 'kr', 'ru', 'gb', 'jp', 'mx', 'us']


In [35]:
#Let's not forget Germany, which was processed separately above.
#The membership check keeps this cell idempotent: re-running it would otherwise append 'de'
#a second time and the German rows would be loaded twice into the combined file
if 'de' not in REST_OF_THE_COUNTRIES:
    REST_OF_THE_COUNTRIES.append('de')

In [37]:
#Load every per-country export into a list of DataFrames, one per country
li = [
    pd.read_csv(f'./exported-data/{filename}_videos.csv')
    for filename in REST_OF_THE_COUNTRIES
]

In [38]:
#Stack all per-country frames on top of each other with a fresh index
#(row-wise concat() is the pandas counterpart of UNION ALL in SQL)
combined_df = pd.concat(li, ignore_index=True)

In [39]:
#Export the final combined csv file (without the row index) for use in Tableau
combined_df.to_csv('./exported-data/videos_combined.csv', index = False)