# Youtube URL Comparison (Sampled vs Filtered)

In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


# Plot styling: seaborn's 'muted' palette, desaturated to 75%, applied through sns.set().
color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
# Do not truncate long cell contents (the tables below show complete URLs).
# NOTE(review): -1 is the legacy spelling of "no limit"; pandas >= 1.0 deprecates it
# in favor of None -- confirm against the pandas version this notebook runs on.
pd.set_option('display.max_colwidth', -1)
# CSS rules handed to Styler.set_table_styles() in getPrettyComparisonDataframe().
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): `directory` and `stream` are not referenced in the visible cells --
# presumably leftovers from a sibling notebook; verify before removing.
directory = "url_top_lists/"
stream = "comparison"

  """)
  return f(*args, **kwds)


## Util Methods

In [4]:
def compareRows(row, df_to_compare, column_name):
    """Describe how `row` ranks in `df_to_compare`.

    Looks up the entry of `df_to_compare` whose `column_name` value equals the
    one in `row` and returns a marker string for the comparison column:

    * " - " when there is no matching entry,
    * " = <br>(...)" when the matching entry has the same rank,
    * " v <br>(...)" when the matching entry has a larger rank number,
    * " ^ <br>(...)" when the matching entry has a smaller rank number.

    The parenthesised part is "(other rank. / other percentage% / diff%)",
    where diff is `row['percentage']` minus the other percentage.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    # Only the first matching entry is considered.
    other_rank = matches['rank'].values[0]
    other_percentage = matches['percentage'].values[0]
    details = "(%s. / %.3f%% / %.3f%%)" % (other_rank, other_percentage, row['percentage'] - other_percentage)

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + details

def getOpacity(val):
    """Map the percentage difference inside a comparison string to a CSS opacity.

    `val` is a string as produced by compareRows(), i.e.
    "<marker><br>(rank. / percentage% / diff%)". The third "/"-separated part
    holds the difference; its absolute value selects the opacity: the smaller
    the difference, the more opaque the cell background.

    Returns a number between 0.05 and 1. Differences of 100 or more get the
    minimum opacity (0.05); previously the function fell through and returned
    None for them, which produced an invalid `rgba(..., None)` CSS value.
    """
    # (exclusive upper bound on |diff|, opacity used below that bound)
    opacity_steps = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
        (100, 0.05),
    )

    # First number found in the "diff" part of the string, sign dropped.
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", val.split("/")[2])[0]))
    for upper_bound, opacity in opacity_steps:
        if value < upper_bound:
            return opacity
    return 0.05

def colorComparisonField(val):
    """Return the CSS background rule for one table cell.

    Only strings carrying a comparison marker are colored:
    rank changes (' ^ ' / ' v ') in orange, equal ranks (' = ') in green --
    both with an opacity taken from getOpacity() -- and the bare ' - '
    marker (no counterpart) in red. Everything else gets an empty rule.
    """
    if not isinstance(val, str):
        return ''

    rank_changed = ' ^ ' in val or ' v ' in val
    if rank_changed:
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)
    if ' = ' in val:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)
    # The "missing" marker is exactly ' - '; longer strings containing it are left alone.
    if len(val) == 3 and ' - ' in val:
        return 'background-color: #e55039'
    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a count series into a ranking table.

    Parameters
    ----------
    series : pandas.Series
        Counts indexed by the ranked item, already in ranking order
        (the notebook passes the result of `value_counts()`).
    attribute_name : str
        Name of the column that receives the index labels.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `series` with the columns 'rank' (1-based
        position), `attribute_name`, 'value' (the count) and 'percentage'
        (the count's share of the series total, in percent).
    """
    total = series.sum()
    # Vectorized construction; the former loop relied on Series.iteritems(),
    # which was removed in pandas 2.0.
    data = {
        'rank': list(range(1, len(series) + 1)),
        attribute_name: list(series.index),
        'value': series.values,
        'percentage': (series.values / total) * 100,
    }
    return pd.DataFrame(data=data)

def _buildComparisonDataframe(df, df_other, column_name, size):
    """Annotate the first `size` rows of `df` with their counterpart in `df_other`."""
    # Rankings are stored in rank order (1..n), so the first `size` rows are
    # exactly the rows with rank <= size; slicing avoids iterating the full table.
    top = df[:size]
    differences = [compareRows(row, df_other, column_name) for _, row in top.iterrows()]

    data = {'rank': top['rank'], column_name: top[column_name], 'value': top['value'], 'percentage': top['percentage'],
            'difference (rank / percentage / diff)': differences}
    return pd.DataFrame(data=data).set_index(keys='rank')


def generateComparisonDataframes(df1, df2, column_name, size):
    """Cross-compare the top `size` entries of two ranking tables.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Ranking tables with the columns 'rank', `column_name`, 'value' and
        'percentage', ordered by rank (as built by generateRankingDataframe()).
    column_name : str
        Column holding the ranked item; used to match rows between the tables.
    size : int
        Number of top-ranked rows to keep from each table.

    Returns
    -------
    tuple of pandas.DataFrame
        `(df1_compared, df2_compared)`, each indexed by 'rank' and extended by
        the column 'difference (rank / percentage / diff)', which holds the
        compareRows() result against the other table.
    """
    df1_compared = _buildComparisonDataframe(df1, df2, column_name, size)
    df2_compared = _buildComparisonDataframe(df2, df1, column_name, size)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler for `df`: cells colored by colorComparisonField(),
    `title` as the caption and the notebook-wide `table_styles` applied."""
    styler = df.style.applymap(colorComparisonField)
    return styler.set_caption(title).set_table_styles(table_styles)

In [3]:
# Open one PostgreSQL connection per data set and print each server's version
# as a smoke test. S/F presumably stand for sampled/filtered and 17/03 for the
# calendar week (matching the section titles below) -- TODO confirm in config.
# The conn* objects are not closed here; later cells pass them to pd.read_sql_query.
# NOTE(review): `conn` is never reassigned or read in the visible cells.
conn = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()
    
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)
    
    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()
    
    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')
    
    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')
    
    # display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()
    
    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

    # close the communication with the PostgreSQL
    # (only the cursors are closed; they are not closed if an earlier statement raised)
    curS17.close()
    curS03.close()
    curF17.close()
    curF03.close()

# NOTE(review): any failure is only printed, so execution continues and later
# cells would fail with NameError for whichever conn* was not created.
# `Exception` presumably already covers psycopg2.DatabaseError -- confirm.
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)


## Query URLs

### Sampled

In [6]:
# Load the YouTube URL rows of both sampled calendar weeks.
# NOTE(review): `~` is a PostgreSQL regex match, so the '.' in 'youtube.com' matches
# any character and the pattern is unanchored -- confirm this is the intended filter.
urlsS03 = pd.read_sql_query("SELECT * FROM tweets_urls where top_level_domain ~ 'youtube.com';", connS03 )
print("# of URLs sampled 03: %s" %len(urlsS03))

urlsS17 = pd.read_sql_query("SELECT * FROM tweets_urls where top_level_domain ~ 'youtube.com';", connS17 )
print("# of URLs sampled 17: %s" %len(urlsS17))

# Stack both weeks (original row indices are kept). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
urlsS = pd.concat([urlsS17, urlsS03])
print("# of URLs sampled: %s" %len(urlsS))

urlsS.head()

# of URLs sampled 03: 3262
# of URLs sampled 17: 2961
# of URLs sampled: 6223


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988176579586088960,http://youtu.be/ucIOOEMq-VQ?a,https://www.youtube.com/watch?v=ucIOOEMq-VQ&feature=youtu.be&a,200,https://www.youtube.com/,youtube.com,False,False,True,False
1,988176646686560256,http://youtu.be/8adyeFS7HD8?a,https://www.youtube.com/watch?v=8adyeFS7HD8&feature=youtu.be&a,200,https://www.youtube.com/,youtube.com,False,False,True,False
2,988077828091580427,https://youtu.be/Jv0cdrYUkMI,https://www.youtube.com/watch?v=Jv0cdrYUkMI&feature=youtu.be,200,https://www.youtube.com/,youtube.com,False,False,True,False
3,988178949342572544,http://youtu.be/RffAHV3tcgM?a,https://www.youtube.com/watch?v=RffAHV3tcgM&feature=youtu.be&a,200,https://www.youtube.com/,youtube.com,False,False,True,False
4,988024824336343040,https://youtu.be/tiyBmhd5Yrc,https://www.youtube.com/watch?v=tiyBmhd5Yrc&feature=youtu.be,200,https://www.youtube.com/,youtube.com,False,False,True,False


In [24]:
# NOTE(review): urlsF is first assigned in the "Filtered" cell further down, so on a
# fresh top-to-bottom run this cell raises NameError (its In [24] counter shows it
# was executed out of order).
len(urlsF)

439201

###  Filtered

In [7]:
# Load the YouTube URL rows of both filtered calendar weeks.
# NOTE(review): `~` is a PostgreSQL regex match, so the '.' in 'youtube.com' matches
# any character and the pattern is unanchored -- confirm this is the intended filter.
urlsF03 = pd.read_sql_query("SELECT * FROM tweets_urls where top_level_domain ~ 'youtube.com';", connF03 )
print("# of URLs filtered 03: %s" %len(urlsF03))

urlsF17 = pd.read_sql_query("SELECT * FROM tweets_urls where top_level_domain ~ 'youtube.com';", connF17 )
print("# of URLs filtered 17: %s" %len(urlsF17))

# Stack both weeks (original row indices are kept). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
urlsF = pd.concat([urlsF17, urlsF03])
print("# of URLs filtered: %s" %len(urlsF))

urlsF.head()

# of URLs filtered 03: 233499
# of URLs filtered 17: 205702
# of URLs filtered: 439201


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988437991168335872,http://youtu.be/3xxDlETxKz8?a,https://www.youtube.com/watch?v=3xxDlETxKz8&feature=youtu.be&a,200.0,https://www.youtube.com/,youtube.com,False,False,True,False
1,988437993630420993,http://youtu.be/HscStpberLc?a,https://www.youtube.com/watch?v=HscStpberLc&feature=youtu.be&a,200.0,https://www.youtube.com/,youtube.com,False,False,True,False
2,988438000773271552,https://www.youtube.com/watch?v=TIPbGZu_cpg,https://www.youtube.com/watch?v=TIPbGZu_cpg,200.0,https://www.youtube.com/,youtube.com,False,False,True,False
3,988438008289484801,http://youtu.be/GeWueT1_-oA?a,https://www.youtube.com/watch?v=GeWueT1_-oA&feature=youtu.be&a,200.0,https://www.youtube.com/,youtube.com,False,False,True,False
4,988438008192995331,https://youtu.be/K8jLEgPsXKw,https://www.youtube.com/watch?v=K8jLEgPsXKw&feature=youtu.be,200.0,https://www.youtube.com/,youtube.com,False,False,True,False


## URLs

### Calendar Week 03 - URLs - Sampled x Filtered

In [8]:
# Rank the resolved URLs of calendar week 03 by frequency, separately for the
# sampled and the filtered data.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS03[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF03[attribute].value_counts(), column_name)

# Keep the top 25 of each ranking, annotated with the position in the other one.
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [9]:
# Style the sampled-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Youtube URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 2.836% / -0.752%),2.08461,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68
2,-,0.429185,https://www.youtube.com/results?search_query=sehun+solo,14
3,-,0.429185,https://www.youtube.com/results?search_query=sehun+fancam,14
4,-,0.429185,https://www.youtube.com/results?search_query=sehun+focus,14
5,^ (3. / 0.358% / -0.021%),0.337216,https://www.youtube.com/watch?v=Xeqf_HHEMJk,11
6,-,0.30656,https://www.youtube.com/watch?v=EP7TYxOrXoU&feature=share,10
7,^ (4. / 0.294% / 0.013%),0.30656,https://www.youtube.com/channel/UCXDyAGuwSxI4Y-X-N6RXqew/videos,10
8,^ (5. / 0.251% / 0.025%),0.275904,https://www.youtube.com/watch?v=JEVtiDeHKdc&feature=youtu.be&a,9
9,v (10. / 0.115% / 0.100%),0.214592,https://www.youtube.com/watch?v=dOeURUWyhR8,7
10,-,0.183936,https://www.youtube.com/results?search_query=sehun+solo+dance,6


In [11]:
# Style the filtered-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Youtube URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 2.085% / 0.752%),2.83641,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623
2,v (13. / 0.184% / 0.399%),0.582872,https://www.youtube.com/user/videodeutschland/videos,1361
3,v (5. / 0.337% / 0.021%),0.358032,https://www.youtube.com/watch?v=Xeqf_HHEMJk,836
4,v (7. / 0.307% / -0.013%),0.293791,https://www.youtube.com/channel/UCXDyAGuwSxI4Y-X-N6RXqew/videos,686
5,v (8. / 0.276% / -0.025%),0.250965,https://www.youtube.com/watch?v=JEVtiDeHKdc&feature=youtu.be&a,586
6,v (14. / 0.153% / 0.050%),0.203427,https://www.youtube.com/watch?v=XBcu5D7EI6g&feature=youtu.be,475
7,v (17. / 0.153% / 0.009%),0.161885,https://www.youtube.com/channel/UCq93BR098LNtNk_TsYnl8ZQ?view_as=subscriber,378
8,v (15. / 0.153% / -0.005%),0.148609,https://www.youtube.com/?gl=DE,347
9,v (223. / 0.061% / 0.059%),0.120771,https://www.youtube.com/watch?v=xsarQ8O58YM&feature=youtu.be,282
10,^ (9. / 0.215% / -0.100%),0.114776,https://www.youtube.com/watch?v=dOeURUWyhR8,268


### Calendar Week 17 - URLs - Sampled x Filtered

In [12]:
# Rank the resolved URLs of calendar week 17 by frequency, separately for the
# sampled and the filtered data.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS17[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF17[attribute].value_counts(), column_name)

# Keep the top 25 of each ranking, annotated with the position in the other one.
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [14]:
# Style the sampled-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Youtube URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (44. / 0.039% / 1.211%),1.24958,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,37
2,-,1.21581,https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be,36
3,-,1.11449,https://www.youtube.com/watch?v=tzni9QRNMEU&feature=youtu.be,33
4,^ (2. / 0.586% / -0.181%),0.405268,https://www.youtube.com/user/videodeutschland/videos,12
5,^ (3. / 0.214% / 0.191%),0.405268,https://www.youtube.com/watch?v=zvKjfWSPI7s,12
6,v (157. / 0.017% / 0.321%),0.337724,https://www.youtube.com/channel/UC_dZp8bZipnjntBGLVHm6rw/about?sub_confirmation=1,10
7,^ (1. / 0.717% / -0.379%),0.337724,https://www.youtube.com/watch?v=6bTQkwftlf0&feature=youtu.be&a,10
8,^ (4. / 0.186% / 0.016%),0.202634,https://www.youtube.com/watch?v=fAYjSLtz6wQ&feature=youtu.be,6
9,v (11. / 0.089% / 0.114%),0.202634,https://www.youtube.com/watch?v=zvKjfWSPI7s&feature=youtu.be,6
10,v (3969. / 0.002% / 0.201%),0.202634,https://www.youtube.com/watch?v=T7O7BtBnsG4&feature=youtu.be,6


In [15]:
# Style the filtered-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Youtube URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (7. / 0.338% / 0.379%),0.716571,https://www.youtube.com/watch?v=6bTQkwftlf0&feature=youtu.be&a,1474
2,v (4. / 0.405% / 0.181%),0.585799,https://www.youtube.com/user/videodeutschland/videos,1205
3,v (5. / 0.405% / -0.191%),0.213902,https://www.youtube.com/watch?v=zvKjfWSPI7s,440
4,v (8. / 0.203% / -0.016%),0.186192,https://www.youtube.com/watch?v=fAYjSLtz6wQ&feature=youtu.be,383
5,-,0.130772,https://www.youtube.com/channel/UCXDyAGuwSxI4Y-X-N6RXqew/videos,269
6,v (22. / 0.135% / -0.008%),0.127369,https://www.youtube.com/channel/UCVnvMpMVel0KMCqZM69XMbQ,262
7,v (34. / 0.101% / 0.017%),0.118618,https://www.youtube.com/watch?v=uLmaXJph8a8&feature=youtu.be,244
8,v (28. / 0.101% / 0.017%),0.118132,https://www.youtube.com/watch?v=elsJqFNUlus&feature=youtu.be,243
9,v (30. / 0.101% / -0.003%),0.0982003,https://www.youtube.com/watch?v=wM0FTIMAu8M&feature=youtu.be,202
10,v (32. / 0.101% / -0.011%),0.0904221,https://www.youtube.com/watch?v=bCys42hGcoc&feature=youtu.be&a,186


### Sampled vs Filtered (Calendar Weeks 03 and 17 Combined)

In [16]:
# Rank the resolved URLs of both calendar weeks combined (urlsS / urlsF),
# separately for the sampled and the filtered data.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF[attribute].value_counts(), column_name)

# Keep the top 25 of each ranking, annotated with the position in the other one.
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [17]:
# Style the sampled-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_sampled_comparison, "Youtube URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 1.508% / -0.415%),1.09272,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68
2,v (93. / 0.018% / 0.576%),0.594569,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,37
3,-,0.578499,https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be,36
4,-,0.530291,https://www.youtube.com/watch?v=tzni9QRNMEU&feature=youtu.be,33
5,^ (2. / 0.584% / -0.295%),0.28925,https://www.youtube.com/user/videodeutschland/videos,18
6,-,0.224972,https://www.youtube.com/results?search_query=sehun+fancam,14
7,-,0.224972,https://www.youtube.com/results?search_query=sehun+solo,14
8,-,0.224972,https://www.youtube.com/results?search_query=sehun+focus,14
9,^ (8. / 0.100% / 0.093%),0.192833,https://www.youtube.com/watch?v=zvKjfWSPI7s,12
10,^ (5. / 0.190% / -0.014%),0.176764,https://www.youtube.com/watch?v=Xeqf_HHEMJk,11


In [18]:
# Style the filtered-side comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_filtered_comparison, "Youtube URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 1.093% / 0.415%),1.50797,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623
2,v (5. / 0.289% / 0.295%),0.584243,https://www.youtube.com/user/videodeutschland/videos,2566
3,v (13. / 0.161% / 0.175%),0.335609,https://www.youtube.com/watch?v=6bTQkwftlf0&feature=youtu.be&a,1474
4,v (14. / 0.161% / 0.057%),0.21744,https://www.youtube.com/channel/UCXDyAGuwSxI4Y-X-N6RXqew/videos,955
5,v (10. / 0.177% / 0.014%),0.190346,https://www.youtube.com/watch?v=Xeqf_HHEMJk,836
6,v (15. / 0.145% / -0.011%),0.133424,https://www.youtube.com/watch?v=JEVtiDeHKdc&feature=youtu.be&a,586
7,v (25. / 0.080% / 0.028%),0.108151,https://www.youtube.com/watch?v=XBcu5D7EI6g&feature=youtu.be,475
8,v (9. / 0.193% / -0.093%),0.100182,https://www.youtube.com/watch?v=zvKjfWSPI7s,440
9,v (22. / 0.096% / -0.009%),0.0872038,https://www.youtube.com/watch?v=fAYjSLtz6wQ&feature=youtu.be,383
10,v (26. / 0.080% / 0.006%),0.0860654,https://www.youtube.com/channel/UCq93BR098LNtNk_TsYnl8ZQ?view_as=subscriber,378


## Query Tweets

### Sampled

In [19]:
# Load all tweets of both sampled calendar weeks.
tweetsS03 = pd.read_sql_query("SELECT * FROM tweets_info;", connS03 )
print("# of Tweets sampled 03: %s" %len(tweetsS03))

tweetsS17 = pd.read_sql_query("SELECT * FROM tweets_info;", connS17 )
print("# of tweets sampled 17: %s" %len(tweetsS17))

# Stack both weeks (original row indices are kept). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
tweetsS = pd.concat([tweetsS17, tweetsS03])
# The label used to read "# of URLs sampled" although it reports the tweet count.
print("# of tweets sampled: %s" %len(tweetsS))

tweetsS.head()

# of Tweets sampled 03: 123680
# of tweets sampled 17: 112003
# of URLs sampled: 235683


Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
0,988175614908788736,1647758059,"Johangeorgenstadt 23:59 Nachtzeit/Trocken/Klar ,Temp: 14°C, Hum:66%, Wind:WSW 0-&gt;24kmh, Baro: 1012.80hPa, Regen: 0.00mm #wetter",Sun Apr 22 22:00:00 +0000 2018,"<a href=""http://saratoga-weather.org/scripts-TweetWX.php#TweetWX"" rel=""nofollow"">TweetWX</a>",de,False,False,,False,...,0,False,False,wetter,,0,False,,,
1,988175619098861568,234778734,Beim nächsten Ton ist es 0 Uhr: ♩♩♩,Sun Apr 22 22:00:01 +0000 2018,"<a href=""http://www.rasputin.de/"" rel=""nofollow"">rasputin</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
2,988175627453952002,821834588158853120,[d] DiplIngMarkus Verbrauch gestern: 3.71 kWh Uptime: 41 days 21:01:04 since 23:48:32 13/10/2017,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://arduino-tweet.appspot.com/"" rel=""nofollow"">Arduino</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
3,988175627479146498,859148218915069952,Alien Mutterschiff sozusagen \n(Hab das Gefühl bin schwanger) \n(Das gibt dann rr Stress) https://t.co/z4uHz2dugO,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
4,988175681996681218,3206249890,@MDegen55 🇩🇪🇩🇪 Gute Nacht 🌃🌃🌃 https://t.co/OTnsAsDGrc,Sun Apr 22 22:00:16 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,False,,False,...,0,False,False,,3206249890.0,0,False,,,


In [22]:
# Show the sampled tweets whose resolved URL is this specific video; the URL is
# marked ' - ' (no counterpart in the filtered data) in the sampled rankings above.
tweetsS[tweetsS['id'].isin(urlsS[urlsS['resolved_url'] == "https://www.youtube.com/watch?v=tzni9QRNMEU&feature=youtu.be"]['tweet_id'])]

Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
89443,990199324620472320,1099570976,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:01:30 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89456,990199702112030720,1710032354,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:00 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89469,990199773423636480,73732277,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:17 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89478,990199878268669952,744423667178823680,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:42 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89482,990199907624542211,845826939856216064,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:49 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89495,990200083810467841,1101142141,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:04:31 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89508,990200415156293632,1097049314,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:05:50 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89573,990201727952572418,2399086608,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:11:03 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89605,990202420016840704,1540314288,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:13:48 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89629,990202940102098944,4158510612,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:15:52 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,


## Query Hashtags

In [25]:
def getHashtagsFromTopLevelDomain(tld, urls, hashtags):
    """Return the rows of `hashtags` that belong to tweets linking to `tld`.

    A tweet qualifies when `urls` has a row with its 'tweet_id' and a
    'top_level_domain' equal to `tld`.
    """
    links_to_tld = urls['top_level_domain'] == tld
    tweet_ids = urls.loc[links_to_tld, 'tweet_id']
    return hashtags.loc[hashtags['tweet_id'].isin(tweet_ids)]

### Sampled

In [26]:
# Load all hashtags per sampled calendar week, then keep only those attached
# to tweets that link to youtube.com.
hashtagsS03 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connS03 )
print("# of hashtags sampled 03: %s" %len(hashtagsS03))

hashtagsS03 = getHashtagsFromTopLevelDomain('youtube.com', urlsS03, hashtagsS03)
print("# of youtube hashtags sampled 03: %s" %len(hashtagsS03))

hashtagsS17 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connS17 )
print("# of hashtags sampled 17: %s" %len(hashtagsS17))

hashtagsS17 = getHashtagsFromTopLevelDomain('youtube.com', urlsS17, hashtagsS17)
print("# of youtube hashtags sampled 17: %s" %len(hashtagsS17))

# Stack both weeks (original row indices are kept). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
hashtagsS = pd.concat([hashtagsS17, hashtagsS03])
print("# of youtube hashtags sampled: %s" %len(hashtagsS))

hashtagsS.head()

# of hashtags sampled 03: 84203
# of youtube hashtags sampled 03: 1450
# of hashtags sampled 17: 73935
# of youtube hashtags sampled 17: 1750
# of youtube hashtags sampled: 3200


Unnamed: 0,tweet_id,hashtag
59,988077828091580427,nullsechsTV
60,988077828091580427,SCPHFC
61,988077828091580427,scp06
62,988177561053286401,nullsechsTV
63,988177561053286401,SCPHFC


In [24]:
# Row count of the combined filtered YouTube URL frame.
len(urlsF)

439201

###  Filtered

In [27]:
# Load all hashtags per filtered calendar week, then keep only those attached
# to tweets that link to youtube.com.
hashtagsF03 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connF03 )
print("# of hashtags filtered 03: %s" %len(hashtagsF03))

hashtagsF03 = getHashtagsFromTopLevelDomain('youtube.com', urlsF03, hashtagsF03)
print("# of youtube hashtags filtered 03: %s" %len(hashtagsF03))

hashtagsF17 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connF17 )
print("# of hashtags filtered 17: %s" %len(hashtagsF17))

hashtagsF17 = getHashtagsFromTopLevelDomain('youtube.com', urlsF17, hashtagsF17)
print("# of youtube hashtags filtered 17: %s" %len(hashtagsF17))

# Stack both weeks (original row indices are kept). pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
hashtagsF = pd.concat([hashtagsF17, hashtagsF03])
print("# of youtube hashtags filtered: %s" %len(hashtagsF))

hashtagsF.head()

# of hashtags filtered 03: 4377831
# of youtube hashtags filtered 03: 68218
# of hashtags filtered 17: 3780197
# of youtube hashtags filtered 17: 70375
# of youtube hashtags filtered: 138593


Unnamed: 0,tweet_id,hashtag
119,988175640607195137,egoFM
120,988175640607195137,musicinbetween
239,988175711453302789,PS4live
271,988160198480154624,Bremen
272,988160198480154624,AfD


## Hashtags

### Sampled vs Filtered

In [30]:
# Rank the YouTube-related hashtags of both calendar weeks combined by frequency,
# separately for the sampled and the filtered data.
attribute = 'hashtag'
column_name = 'hashtag'
    
df_sampled = generateRankingDataframe(hashtagsS[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(hashtagsF[attribute].value_counts(), column_name)

# Keep the top 50 of each ranking, annotated with the position in the other one.
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [31]:
# Style the sampled-side hashtag comparison and show it as the cell output.
s = getPrettyComparisonDataframe(df_sampled_comparison, "Youtube Hashtags - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),hashtag,percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (17434. / 0.001% / 2.281%),빅스,2.28125,73
2,-,VIXX_TV,2.28125,73
3,v (4972. / 0.003% / 2.278%),VIXX,2.28125,73
4,v (116. / 0.066% / 1.684%),ForTheLoveOf,1.75,56
5,v (185. / 0.049% / 1.701%),GlobeHarryStyles,1.75,56
6,v (9. / 0.513% / 0.737%),AfD,1.25,40
7,^ (3. / 1.465% / -0.215%),letsplay,1.25,40
8,^ (2. / 1.593% / -0.437%),LetsPlay,1.15625,37
9,^ (4. / 1.152% / -0.058%),youtube,1.09375,35
10,^ (5. / 0.763% / 0.237%),YouTube,1.0,32


In [32]:
# Style the filtered-side hashtag comparison and show it as the cell output.
# The caption used to say "Youtube URLs", copied from the URL section, although
# this table ranks hashtags.
s = getPrettyComparisonDataframe(df_filtered_comparison, "Youtube Hashtags - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),hashtag,percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (12. / 0.719% / 1.027%),PS4live,1.7454,2419
2,v (8. / 1.156% / 0.437%),LetsPlay,1.59315,2208
3,v (7. / 1.250% / 0.215%),letsplay,1.46472,2030
4,v (9. / 1.094% / 0.058%),youtube,1.15157,1596
5,v (10. / 1.000% / -0.237%),YouTube,0.763386,1058
6,v (17. / 0.406% / 0.316%),gaming,0.722259,1001
7,v (15. / 0.469% / 0.151%),GermanMediaRT,0.6198,859
8,v (13. / 0.625% / -0.014%),Nachtschicht,0.611142,847
9,^ (6. / 1.250% / -0.737%),AfD,0.513013,711
10,v (14. / 0.562% / -0.093%),Video,0.469721,651
