I will use `pandasql` to query the data in the DataFrame using standard SQL.

We will define the popularity of a channel based on its total number of views, the number of times the channel appeared in trending, and the number of countries in which it trended.

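We will treat the number of views as the most important popularity factor, followed by the number of times the channel was in trending, and lastly the number of countries it spans. In practice this means channels are sorted by views first, with ties broken by the trending count and then by the country count.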

In [1]:
import pandas as pd
from pandasql import sqldf

In [2]:
#Read the combined per-country trending export into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer='./exported-data/videos_combined.csv')
df.head(n=5)

Unnamed: 0,channel_title,views,likes,dislikes,category,country
0,EminemVEVO,17158579,787425,43420,Music,CA
1,iDubbbzTV,1014651,127794,1688,Comedy,CA
2,Rudy Mancuso,3191434,146035,5339,Comedy,CA
3,nigahiga,2095828,132239,1989,Entertainment,CA
4,Ed Sheeran,33523622,1634130,21082,Music,CA


In [3]:
#Let's see which country abbreviations appear in the data before replacing them
df['country'].unique().tolist()

['CA', 'FR', 'IN', 'KR', 'RU', 'GB', 'JP', 'MX', 'US', 'DE']

In [4]:
#Let's replace the country abbreviations with real country names for better data representation.
#The mapping is called country_names (not dict) so that it does not shadow Python's built-in dict type
#for every cell that runs afterwards.
country_names = {
    "CA": 'Canada',
    "FR": 'France',
    "IN": 'India',
    "KR": 'Korea',
    "RU": 'Russia',
    "GB": 'Great Britain',
    "JP": 'Japan',
    "MX": 'Mexico',
    "US": 'USA',
    "DE": 'Germany',
}
#Only the 'country' column is touched; values without an entry in the mapping are left unchanged.
df = df.replace({"country": country_names})

In [5]:
#Quick check: the country column should now contain full names instead of abbreviations
df['country'].unique().tolist()

['Canada',
 'France',
 'India',
 'Korea',
 'Russia',
 'Great Britain',
 'Japan',
 'Mexico',
 'USA',
 'Germany']

In [6]:
#Aggregate the DataFrame `df` to one row per channel_title:
#  num_views          - sum of `views` over all of the channel's rows
#  num_times_trending - number of rows the channel has in df (used as its trending count)
#  country            - the distinct countries of the channel joined into one string
#                       (comma-separated, e.g. "Korea,Japan" in the result preview)
#NOTE(review): views are summed per row, so if the same video appears in several rows its views
#are counted once per row - confirm this is the intended definition of "number of views".
sql_query = """
    SELECT channel_title, SUM(views) AS num_views, COUNT(*) AS num_times_trending, GROUP_CONCAT(DISTINCT(country)) AS country
    FROM df
    GROUP BY channel_title
"""

In [7]:
#Run the aggregation query (pandasql resolves the `df` table name from the notebook's variables)
#and preview the first rows of the per-channel result
query_execution = sqldf(sql_query)
query_execution.head(n=5)

Unnamed: 0,channel_title,num_views,num_times_trending,country
0,! ì¸ìì ë¬´ì¨ì¼ì´,3942977,7,Korea
1,!!8æã ãé¢ç½ãã¿å¤§éå,50207,1,Japan
2,!BTSã»TWICE ã¾ã¨ã,7310,1,Japan
3,!Los amorosos ViralesÂ¡,6069,2,Mexico
4,!t Live,240038,4,"Korea,Japan"


In [8]:
#Let's get the number of countries per channel: the 'country' column holds a comma-separated
#list of names, so the count is the number of separators plus one
query_execution['num_countries'] = query_execution['country'].str.count(",") + 1

In [14]:
#Preview the first ten channels together with the new num_countries column
query_execution.iloc[:10]

Unnamed: 0,channel_title,num_views,num_times_trending,country,num_countries
0,! ì¸ìì ë¬´ì¨ì¼ì´,3942977,7,Korea,1
1,!!8æã ãé¢ç½ãã¿å¤§éå,50207,1,Japan,1
2,!BTSã»TWICE ã¾ã¨ã,7310,1,Japan,1
3,!Los amorosos ViralesÂ¡,6069,2,Mexico,1
4,!t Live,240038,4,"Korea,Japan",2
5,!åã©ãã®ä¸ãèµ°ã,11528,2,Japan,1
6,!å³¶ã¡ãã,1061020,4,Japan,1
7,!è¡æçãªåç»,9112,2,Japan,1
8,# #,2318,1,Russia,1
9,# Milgame,1448,1,Russia,1


In [12]:
df = query_execution.sort_values(by=['num_views', 'num_times_trending', 'num_countries'], ascending = False)

In [13]:
df.to_csv('./exported-data/final_task.csv', index = False)

In [15]:
#Let's just show top 20 channels based on the logic
df.head(20)

Unnamed: 0,channel_title,num_views,num_times_trending,country,num_countries
4588,ChildishGambinoVEVO,11016766510,93,"Canada,France,Korea,Russia,Great Britain,Japan...",9
15601,Marvel Entertainment,10430605449,482,"Canada,France,India,Korea,Russia,Great Britain...",10
17799,NickyJamTV,9479859505,126,"Canada,France,Russia,Great Britain,Mexico,USA,...",7
18540,Ozuna,8623329509,113,"Canada,France,Great Britain,Mexico,USA,Germany",6
28530,ibighit,8205572221,320,"Canada,France,Korea,Russia,Great Britain,Japan...",9
6723,DrakeVEVO,7637228580,98,"Canada,France,Korea,Russia,Great Britain,Japan...",9
2802,Bad Bunny,7124207494,54,"Canada,France,Great Britain,Mexico,Germany",5
2111,ArianaGrandeVevo,6202230488,104,"Canada,France,Korea,Russia,Great Britain,Japan...",9
28739,jypentertainment,5802822913,297,"Canada,France,Korea,Russia,Great Britain,Japan...",9
7084,Ed Sheeran,5775405574,202,"Canada,France,India,Korea,Russia,Great Britain...",10
