# YouTube URL Comparison (Sampled vs Filtered)

In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


# Plot styling: muted seaborn palette, slightly desaturated, applied notebook-wide.
color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
# Column width limit for displayed frames (long URLs are shown in the tables below).
# NOTE(review): pandas >= 1.0 deprecates -1 here in favour of None — confirm the
# pandas version in use before changing it.
pd.set_option('display.max_colwidth', -1)
# CSS rules handed to Styler.set_table_styles in getPrettyComparisonDataframe.
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

# NOTE(review): `directory` and `stream` are not read anywhere in the visible part
# of this notebook — confirm whether they are still needed.
directory = "url_top_lists/"
stream = "comparison"

  """)
  return f(*args, **kwds)


## Util Methods

In [2]:
def compareRows(row, df_to_compare, column_name):
    """Describe how the entry in `row` fares in the other ranking.

    The rows of `df_to_compare` whose `column_name` equals `row[column_name]`
    are looked up and the first match is used.

    Returns:
        " - " when there is no match. Otherwise a marker followed by "<br>" and
        "(other rank. / other percentage% / percentage difference%)", where the
        difference is `row['percentage']` minus the other percentage. The marker
        is " = " for equal ranks, " v " when the other rank number is larger
        than the row's, and " ^ " when it is smaller.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    other_rank = matches['rank'].values[0]
    other_percentage = matches['percentage'].values[0]
    details = "(%s. / %.3f%% / %.3f%%)" % (other_rank, other_percentage, row['percentage'] - other_percentage)

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + details

def getOpacity(val):
    """Map the percentage difference embedded in a comparison string to an opacity.

    `val` is split on "/" and the first number found in the third piece is
    taken as the difference (this is the shape compareRows produces:
    "marker<br>(rank. / percentage% / diff%)"). Its absolute value is then
    bucketed: the smaller the difference, the higher the returned opacity.

    Returns:
        A number between 0.05 and 1. Differences of 100 or more get the
        lowest opacity (0.05); previously they fell through and returned None,
        which produced an invalid "rgba(..., None)" CSS value.

    Raises:
        IndexError: if `val` has fewer than three "/"-separated pieces or the
            third piece contains no number.
    """
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", val.split("/")[2])[0]))

    # (exclusive upper bound of the difference, opacity), checked in ascending order.
    steps = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
        (100, 0.05),
    )
    for upper_bound, opacity in steps:
        if value < upper_bound:
            return opacity

    # Anything at or beyond the last bound is as faint as the faintest bucket.
    return 0.05

def colorComparisonField(val):
    """Return the CSS background rule for one table cell.

    Non-string cells and strings without a comparison marker get no styling
    (empty string). Cells containing " ^ " or " v " are tinted orange, cells
    containing " = " green, both with the opacity computed by getOpacity.
    A cell that is exactly " - " gets a solid red background.
    """
    if not isinstance(val, str):
        return ''

    rank_changed = ' ^ ' in val or ' v ' in val
    if rank_changed:
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)

    rank_unchanged = ' = ' in val
    if rank_unchanged:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)

    # " - " is what compareRows emits when the entry is absent from the other list.
    if val == ' - ':
        return 'background-color: #e55039'

    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a series of counts into a ranking table.

    Args:
        series: pandas Series mapping a label to its count. Rows are ranked in
            the order the series yields them (the notebook passes the result
            of `value_counts()`).
        attribute_name: name of the output column that holds the series labels.

    Returns:
        DataFrame with the columns 'rank' (1-based position), `attribute_name`,
        'value' (the count) and 'percentage' (count / sum of all counts * 100).
        An empty series yields an empty frame.
    """
    size = series.sum()
    rank = []
    parameter = []
    count = []
    percentage = []

    # Series.items() is used instead of Series.iteritems(): the latter was
    # removed in pandas 2.0, while items() is available in old and new releases.
    for position, (label, value) in enumerate(series.items(), start=1):
        rank.append(position)
        parameter.append(label)
        count.append(value)
        percentage.append((value / size) * 100)

    data = {'rank': rank, attribute_name: parameter, 'value': count, 'percentage': percentage}
    return pd.DataFrame(data=data)

def _buildComparisonDataframe(df, df_other, column_name, size):
    """Build the top-`size` table of `df`, annotated against `df_other`."""
    # Select the rows to show once, so the compared rows and the displayed rows
    # are always the same set and iterrows() does not walk the whole ranking.
    top = df.loc[df['rank'] <= size]
    compare_list = [compareRows(row, df_other, column_name) for _, row in top.iterrows()]

    data = {'rank': top['rank'], column_name: top[column_name], 'value': top['value'],
            'percentage': top['percentage'],
            'difference (rank / percentage / diff)': compare_list}
    return pd.DataFrame(data=data).set_index(keys='rank')


def generateComparisonDataframes(df1, df2, column_name, size):
    """Cross-compare the top entries of two ranking tables.

    Args:
        df1, df2: ranking frames with the columns 'rank', `column_name`,
            'value' and 'percentage', as produced by generateRankingDataframe
            (ranks 1..n in row order).
        column_name: column holding the ranked item (e.g. the URL).
        size: highest rank to include from each frame.

    Returns:
        Tuple `(df1_compared, df2_compared)`. Each frame is indexed by 'rank'
        and holds the rows with rank <= `size`, plus a
        'difference (rank / percentage / diff)' column containing the
        compareRows result against the full other frame.
    """
    df1_compared = _buildComparisonDataframe(df1, df2, column_name, size)
    df2_compared = _buildComparisonDataframe(df2, df1, column_name, size)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler for `df` with coloured comparison cells and a caption.

    Every cell is passed through colorComparisonField, `title` becomes the
    table caption, and the notebook-wide `table_styles` CSS rules are applied.
    """
    return (
        df.style
        .applymap(colorComparisonField)
        .set_caption(title)
        .set_table_styles(table_styles)
    )

In [3]:
# Open one PostgreSQL connection per database (S17, S03, F17, F03) and print each
# server's version as a connectivity check.
# NOTE(review): `conn` is never read or reassigned in this cell; the four
# conn* names below hold the actual connections.
conn = None
try:
    # Read the connection parameters for each database from the local config module.
    # NOTE(review): S/F presumably mean sampled/filtered and 17/03 the calendar
    # week (per the notebook headings) — confirm against config.
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()
    
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)
    
    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()
    
    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')
    
    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')
    
    # display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()
    
    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

    # Close only the cursors; the connections stay open because later cells
    # pass connS03/connS17/connF03/connF17 to pd.read_sql_query.
    curS17.close()
    curS03.close()
    curF17.close()
    curF03.close()

except (Exception, psycopg2.DatabaseError) as error:
    # Failures are only printed, not re-raised: if a connection could not be
    # opened, the conn* name stays undefined and later cells fail with NameError.
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)


## Query URLs

### Sampled

In [4]:
# Load every row of tweets_urls from the two sampled databases (03 and 17).
urlsS03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS03 )
print("# of URLs sampled 03: %s" %len(urlsS03))

urlsS17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS17 )
print("# of URLs sampled 17: %s" %len(urlsS17))

# Stack both tables (17 first, then 03), keeping each table's own row index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
urlsS = pd.concat([urlsS17, urlsS03])
print("# of URLs sampled: %s" %len(urlsS))

urlsS.head()

# of URLs sampled 03: 40339
# of URLs sampled 17: 35687
# of URLs sampled: 76026


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,989097603664138240,http://arte.tv/abgedreht,https://www.arte.tv/de/videos/RC-014033/abgedreht/,200.0,https://www.arte.tv/,arte.tv,False,False,True,False
1,988175933659021318,https://twitter.com/piersmorgan/status/987388203593322496,https://twitter.com/piersmorgan/status/987388203593322496,200.0,https://twitter.com/,twitter.com,True,False,True,False
2,988176164358361088,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,200.0,https://www.journal.koeln/,journal.koeln,False,False,True,False
3,988176403412766720,http://www.radionomy.com/erika1,https://www.radionomy.com/en/radio/erika1,200.0,https://www.radionomy.com/,radionomy.com,False,False,True,False
4,988176415995592704,https://twitter.com/JanLatten/status/988161845205913600,https://twitter.com/JanLatten/status/988161845205913600,200.0,https://twitter.com/,twitter.com,True,False,True,False


In [24]:
# NOTE(review): `urlsF` is only assigned in the "Filtered" cell below, so on a
# fresh top-to-bottom run this cell raises NameError; the execution count (24)
# shows it was run out of order. Consider moving or deleting it.
len(urlsF)

439201

### Filtered

In [5]:
# Load every row of tweets_urls from the two filtered databases (03 and 17).
urlsF03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF03 )
print("# of URLs filtered 03: %s" %len(urlsF03))

urlsF17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF17 )
print("# of URLs filtered 17: %s" %len(urlsF17))

# Stack both tables (17 first, then 03), keeping each table's own row index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
urlsF = pd.concat([urlsF17, urlsF03])
print("# of URLs filtered: %s" %len(urlsF))

urlsF.head()

# of URLs filtered 03: 2411523
# of URLs filtered 17: 2107279
# of URLs filtered: 4518802


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988437985363406853,https://www.facebook.com/nadjashah/posts/10215947002527277,https://www.facebook.com/nadjashah/posts/10215947002527277,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
1,988437986797916160,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
2,988437989712957440,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,200.0,http://www.faz.net/,faz.net,False,False,True,False
3,988437989704568832,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,200.0,http://vera-lengsfeld.de/,vera-lengsfeld.de,False,False,True,False
4,988437990732128256,https://ift.tt/2Jm4wnp,https://news.google.com/?sa=t&fd=R&ct2=de&usg=AFQjCNFSz3Yf_fDFIytLtnNe8JNGM2BPZg&clid=c3a7d30bb8a4878e06b80cf16b898331&ei=O_rdWqDIHNCT3QHCyJDQAg&url=https://www.waz.de/kultur/fuer-silke-j-raebiger-ist-es-das-letzte-frauen-filmfestival-id214100661.html&taa=1&hl=en-US&gl=US&ceid=US:en,200.0,https://news.google.com/,google.com,False,False,True,False


## URLs

### Calendar Week 03 - URLs - Sampled x Filtered

In [6]:
# Rank the resolved URLs of the 03 sampled and filtered tables by frequency,
# then build the two top-50 tables, each annotated against the other ranking.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS03[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF03[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [7]:
# Render the CW 03 sampled-side URL table. The caption says "URLs" rather than
# "Youtube URLs" because the ranking covers every resolved URL, in line with the
# CW 17 captions.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 0.351% / 0.102%),0.453655,https://twitter.com/account/suspended,183
2,= (2. / 0.275% / -0.106%),0.168571,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68
3,= (3. / 0.272% / -0.144%),0.128908,https://www.dwd.de/,52
4,= (4. / 0.221% / -0.097%),0.12395,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,50
5,v (35. / 0.044% / 0.080%),0.12395,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,50
6,-,0.118992,http://www.blackpinkyg.com/,48
7,^ (5. / 0.144% / -0.042%),0.101639,http://www.messe.tv/,41
8,^ (6. / 0.123% / -0.039%),0.0842857,http://www.the-sz.com/products/vbbinfo/?f=3,34
9,v (10. / 0.085% / -0.003%),0.0818067,http://der-x-code.com/,33
10,-,0.0768487,http://wx3.sinaimg.cn/large/a157f83bly1fnk017dgu2j20zk0qogoh.jpg,31


In [8]:
# Render the CW 03 filtered-side URL table. The caption says "URLs" rather than
# "Youtube URLs" because the ranking covers every resolved URL, in line with the
# CW 17 captions.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,= (1. / 0.454% / -0.102%),0.35123,https://twitter.com/account/suspended,8470
2,= (2. / 0.169% / 0.106%),0.27464,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623
3,= (3. / 0.129% / 0.144%),0.272483,https://www.dwd.de/,6571
4,= (4. / 0.124% / 0.097%),0.220732,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5323
5,v (7. / 0.102% / 0.042%),0.143934,http://www.messe.tv/,3471
6,v (8. / 0.084% / 0.039%),0.123283,http://www.the-sz.com/products/vbbinfo/?f=3,2973
7,v (14. / 0.074% / 0.027%),0.101761,https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das,2454
8,-,0.0959974,https://sec.help.ch,2315
9,v (21. / 0.052% / 0.040%),0.0924312,https://www.facebook.com/CODECODE1111111111/,2229
10,^ (9. / 0.082% / 0.003%),0.0852988,http://der-x-code.com/,2057


### Calendar Week 17 - URLs - Sampled x Filtered

In [9]:
# Rank the resolved URLs of the 17 sampled and filtered tables by frequency,
# then build the two top-50 tables, each annotated against the other ranking.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS17[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF17[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [12]:
# Style the sampled-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (4. / 0.217% / -0.049%),0.168128,https://www.dwd.de/,60
2,v (3. / 0.227% / -0.068%),0.159722,https://www.ffd365.de,57
3,^ (2. / 0.241% / -0.085%),0.15692,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,56
4,v (19. / 0.077% / 0.071%),0.148513,http://susanne-ulrike-maria-albrecht.over-blog.de/,53
5,v (86. / 0.020% / 0.111%),0.131701,https://twitter.com/buzzfeedfrance/status/988362712513171456,47
6,v (91. / 0.019% / 0.101%),0.120492,https://twitter.com/Markus_Soeder/status/988768341820170240,43
7,v (898. / 0.004% / 0.100%),0.103679,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,37
8,^ (6. / 0.139% / -0.038%),0.100877,https://www.linkedin.com/,36
9,-,0.100877,http://tv.naver.com/v/3097997,36
10,-,0.100877,https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be,36


In [13]:
# Style the filtered-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-,0.266457,https://sec.help.ch,5615
2,v (3. / 0.157% / 0.085%),0.241449,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5088
3,^ (2. / 0.160% / 0.068%),0.227355,https://www.ffd365.de,4791
4,^ (1. / 0.168% / 0.049%),0.216678,https://www.dwd.de/,4566
5,v (12. / 0.095% / 0.070%),0.164857,http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/,3474
6,v (8. / 0.101% / 0.038%),0.139279,https://www.linkedin.com/,2935
7,v (16. / 0.076% / 0.054%),0.129978,http://www.the-sz.com/products/vbbinfo/?f=3,2739
8,v (26. / 0.053% / 0.062%),0.11484,https://www.miet-check.de/,2420
9,-,0.108291,https://mobile.twitter.com/Markus_Soeder/status/988768341820170240,2282
10,v (24. / 0.056% / 0.048%),0.103736,https://www.radionomy.com/en/radio/erika1,2186


## Sampled vs Filtered

In [28]:
# Rank the resolved URLs of the combined (17 + 03) sampled and filtered tables,
# then build the two top-100 tables, each annotated against the other ranking.
attribute = 'resolved_url'
column_name = 'url'
    
df_sampled = generateRankingDataframe(urlsS[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 100)

In [29]:
# Style the sampled-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_sampled_comparison, "URLs - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (3. / 0.207% / 0.067%),0.273591,https://twitter.com/account/suspended,208
2,^ (1. / 0.246% / -0.099%),0.147318,https://www.dwd.de/,112
3,^ (2. / 0.230% / -0.091%),0.139426,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,106
4,v (29. / 0.041% / 0.060%),0.101281,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,77
5,= (5. / 0.147% / -0.057%),0.0894431,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68
6,= (6. / 0.126% / -0.046%),0.0802357,http://www.the-sz.com/products/vbbinfo/?f=3,61
7,= (7. / 0.106% / -0.031%),0.0749744,https://www.ffd365.de,57
8,v (24. / 0.044% / 0.031%),0.0749744,http://susanne-ulrike-maria-albrecht.over-blog.de/,57
9,= (9. / 0.089% / -0.016%),0.073659,https://www.linkedin.com/,56
10,-,0.0631363,http://www.blackpinkyg.com/,48


In [30]:
# Style the filtered-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_filtered_comparison, "URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,difference (rank / percentage / diff),percentage,url,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (2. / 0.147% / 0.099%),0.246459,https://www.dwd.de/,11137
2,v (3. / 0.139% / 0.091%),0.230393,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,10411
3,^ (1. / 0.274% / -0.067%),0.206714,https://twitter.com/account/suspended,9341
4,-,0.175489,https://sec.help.ch,7930
5,= (5. / 0.089% / 0.057%),0.146565,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623
6,= (6. / 0.080% / 0.046%),0.126405,http://www.the-sz.com/products/vbbinfo/?f=3,5712
7,= (7. / 0.075% / 0.031%),0.106024,https://www.ffd365.de,4791
8,v (19. / 0.047% / 0.042%),0.0892936,http://www.flirt.21.com/,4035
9,= (9. / 0.074% / 0.016%),0.0892051,https://www.linkedin.com/,4031
10,v (15. / 0.053% / 0.036%),0.0890502,http://streaming.radionomy.com/Radio-Jodlerwirt1,4024


In [44]:
# Print rank, URL and count of the first 100 rows of the sampled ranking as
# tab-separated text.
for index, row in df_sampled[:100].iterrows():
    
    print(row['rank'],
          row['url'],
          row['value'],
         
         sep="\t")

1	https://twitter.com/account/suspended	208
2	https://www.dwd.de/	112
3	https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US	106
4	http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html	77
5	https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a	68
6	http://www.the-sz.com/products/vbbinfo/?f=3	61
7	https://www.ffd365.de	57
8	http://susanne-ulrike-maria-albrecht.over-blog.de/	57
9	https://www.linkedin.com/	56
10	http://www.blackpinkyg.com/	48
11	https://www.radionomy.com/en/radio/erika1	47
12	https://twitter.com/buzzfeedfrance/status/988362712513171456	47
13	https://twitter.com/Markus_Soeder/status/988768341820170240	43
14	http://www.messe.tv/	41
15	http://streaming.radionomy.com/Radio-Jodlerwirt1	40
16	http://www.radio-jodlerwirt.de/	38
17	https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be	37
18	http://tv.naver.com/v/3097997	36
19	http://www.flirt.21.com/	36
20	https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be	36
21	https://

In [45]:
# Print rank, URL and count of the first 100 rows of the filtered ranking as
# tab-separated text.
for index, row in df_filtered[:100].iterrows():
    
    print(row['rank'],
          row['url'],
          row['value'],
         
         sep="\t")

1	https://www.dwd.de/	11137
2	https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US	10411
3	https://twitter.com/account/suspended	9341
4	https://sec.help.ch	7930
5	https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a	6623
6	http://www.the-sz.com/products/vbbinfo/?f=3	5712
7	https://www.ffd365.de	4791
8	http://www.flirt.21.com/	4035
9	https://www.linkedin.com/	4031
10	http://streaming.radionomy.com/Radio-Jodlerwirt1	4024
11	http://www.radio-jodlerwirt.de/	3815
12	https://www.radionomy.com/en/radio/erika1	3806
13	http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/	3474
14	http://www.messe.tv/	3471
15	http://www.the-sz.com/products/vbbinfo/?f=4	2639
16	https://www.youtube.com/user/videodeutschland/videos	2566
17	https://www.purzel-video.com/	2485
18	https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das	2454
19	https://www.miet-check.de/	2429
20	http://www.chatgate24.com/	2412
21	https://mobile.twitter.com/Markus_Soeder/status/988

## Top Level Domains

### Calendar Week 03 - Top Level Domains - Sampled x Filtered

In [19]:
# Rank the top_level_domain values of the 03 sampled and filtered tables by
# frequency, then build the two top-50 tables, each annotated against the other.
attribute = 'top_level_domain'
column_name = 'Top Level Domain'
    
df_sampled = generateRankingDataframe(urlsS03[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF03[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [20]:
# Style the sampled-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Top Level Domain - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 14.475% / 1.306%),15.7807,6362
2,youtube.com,= (2. / 9.687% / -1.596%),8.09128,3262
3,facebook.com,= (3. / 2.700% / -0.220%),2.48047,1000
4,instagram.com,= (4. / 1.634% / 0.169%),1.8033,727
5,welt.de,v (6. / 1.327% / 0.379%),1.70656,688
6,spiegel.de,v (7. / 1.112% / 0.054%),1.16582,470
7,focus.de,v (9. / 0.934% / 0.001%),0.935136,377
8,amazon.de,= (8. / 0.953% / -0.030%),0.922733,372
9,google.com,^ (5. / 1.522% / -0.684%),0.838398,338
10,bild.de,v (17. / 0.645% / 0.188%),0.833437,336


In [21]:
# Render the filtered-side table. The caption is "CW 03" (it previously read
# "CW 17") because df_filtered_comparison was built from urlsS03/urlsF03 in the
# cell above.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Top Level Domain - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 15.781% / -1.306%),14.4748,348905
2,youtube.com,= (2. / 8.091% / 1.596%),9.68695,233498
3,facebook.com,= (3. / 2.480% / 0.220%),2.70042,65092
4,instagram.com,= (4. / 1.803% / -0.169%),1.63418,39391
5,google.com,v (9. / 0.838% / 0.684%),1.52205,36688
6,welt.de,^ (5. / 1.707% / -0.379%),1.32727,31993
7,spiegel.de,^ (6. / 1.166% / -0.054%),1.112,26804
8,amazon.de,= (8. / 0.923% / 0.030%),0.953022,22972
9,focus.de,^ (7. / 0.935% / -0.001%),0.933731,22507
10,twitch.tv,v (11. / 0.771% / 0.092%),0.862955,20801


### Calendar Week 17 - Top Level Domains - Sampled x Filtered

In [22]:
# Rank the top_level_domain values of the 17 sampled and filtered tables by
# frequency, then build the two top-50 tables, each annotated against the other.
attribute = 'top_level_domain'
column_name = 'Top Level Domain'
    
df_sampled = generateRankingDataframe(urlsS17[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF17[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [23]:
# Style the sampled-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Top Level Domain - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 16.120% / 1.237%),17.3572,6192
2,youtube.com,= (2. / 9.764% / -1.464%),8.30016,2961
3,facebook.com,= (3. / 2.720% / -0.243%),2.478,884
4,instagram.com,= (4. / 1.966% / 0.291%),2.25655,805
5,welt.de,v (7. / 1.086% / 0.411%),1.49689,534
6,spiegel.de,= (6. / 1.151% / 0.055%),1.20536,430
7,amazon.de,^ (5. / 1.248% / -0.312%),0.936256,334
8,twitch.tv,= (8. / 0.999% / -0.107%),0.891406,318
9,focus.de,v (10. / 0.756% / 0.060%),0.81572,291
10,wordpress.com,v (12. / 0.650% / 0.081%),0.731625,261


In [24]:
# Style the filtered-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Top Level Domain - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 17.357% / -1.237%),16.1198,339597
2,youtube.com,= (2. / 8.300% / 1.464%),9.76404,205700
3,facebook.com,= (3. / 2.478% / 0.243%),2.7205,57313
4,instagram.com,= (4. / 2.257% / -0.291%),1.96567,41411
5,amazon.de,v (7. / 0.936% / 0.312%),1.24792,26290
6,spiegel.de,= (6. / 1.205% / -0.055%),1.15051,24238
7,welt.de,^ (5. / 1.497% / -0.411%),1.08596,22878
8,twitch.tv,= (8. / 0.891% / 0.107%),0.998904,21044
9,google.com,v (15. / 0.577% / 0.357%),0.933968,19676
10,focus.de,^ (9. / 0.816% / -0.060%),0.755965,15926


## Sampled vs Filtered

In [25]:
# Rank the top_level_domain values of the combined (17 + 03) sampled and filtered
# tables, then build the two top-50 tables, each annotated against the other.
attribute = 'top_level_domain'
column_name = 'Top Level Domain'
    
df_sampled = generateRankingDataframe(urlsS[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(urlsF[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [26]:
# Style the sampled-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_sampled_comparison, "Top Level Domain - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 15.242% / 1.279%),16.5208,12554
2,youtube.com,= (2. / 9.723% / -1.534%),8.18934,6223
3,facebook.com,= (3. / 2.710% / -0.230%),2.47931,1884
4,instagram.com,= (4. / 1.789% / 0.227%),2.01608,1532
5,welt.de,v (6. / 1.215% / 0.393%),1.60813,1222
6,spiegel.de,v (7. / 1.130% / 0.054%),1.18438,900
7,amazon.de,v (8. / 1.091% / -0.161%),0.929082,706
8,focus.de,v (10. / 0.851% / 0.028%),0.879075,668
9,twitch.tv,= (9. / 0.926% / -0.099%),0.827751,629
10,bild.de,v (12. / 0.645% / 0.136%),0.781692,594


In [27]:
# Style the filtered-side comparison table; the trailing bare `s` displays it.
s = getPrettyComparisonDataframe(df_filtered_comparison, "Top Level Domain - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,Top Level Domain,difference (rank / percentage / diff),percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,= (1. / 16.521% / -1.279%),15.242,688502
2,youtube.com,= (2. / 8.189% / 1.534%),9.7229,439198
3,facebook.com,= (3. / 2.479% / 0.230%),2.70979,122405
4,instagram.com,= (4. / 2.016% / -0.227%),1.78878,80802
5,google.com,v (12. / 0.716% / 0.532%),1.24778,56364
6,welt.de,^ (5. / 1.608% / -0.393%),1.21473,54871
7,spiegel.de,^ (6. / 1.184% / -0.054%),1.12996,51042
8,amazon.de,^ (7. / 0.929% / 0.161%),1.09056,49262
9,twitch.tv,= (9. / 0.828% / 0.099%),0.926359,41845
10,focus.de,^ (8. / 0.879% / -0.028%),0.850824,38433


## Query Tweets

### Sampled

In [19]:
# Load every row of tweets_info from the two sampled databases (03 and 17).
tweetsS03 = pd.read_sql_query("SELECT * FROM tweets_info;", connS03 )
print("# of Tweets sampled 03: %s" %len(tweetsS03))

tweetsS17 = pd.read_sql_query("SELECT * FROM tweets_info;", connS17 )
print("# of tweets sampled 17: %s" %len(tweetsS17))

# Stack both tables (17 first, then 03), keeping each table's own row index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
tweetsS = pd.concat([tweetsS17, tweetsS03])
# The label previously said "URLs" although the count is of tweets.
print("# of tweets sampled: %s" %len(tweetsS))

tweetsS.head()

# of Tweets sampled 03: 123680
# of tweets sampled 17: 112003
# of URLs sampled: 235683


Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
0,988175614908788736,1647758059,"Johangeorgenstadt 23:59 Nachtzeit/Trocken/Klar ,Temp: 14°C, Hum:66%, Wind:WSW 0-&gt;24kmh, Baro: 1012.80hPa, Regen: 0.00mm #wetter",Sun Apr 22 22:00:00 +0000 2018,"<a href=""http://saratoga-weather.org/scripts-TweetWX.php#TweetWX"" rel=""nofollow"">TweetWX</a>",de,False,False,,False,...,0,False,False,wetter,,0,False,,,
1,988175619098861568,234778734,Beim nächsten Ton ist es 0 Uhr: ♩♩♩,Sun Apr 22 22:00:01 +0000 2018,"<a href=""http://www.rasputin.de/"" rel=""nofollow"">rasputin</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
2,988175627453952002,821834588158853120,[d] DiplIngMarkus Verbrauch gestern: 3.71 kWh Uptime: 41 days 21:01:04 since 23:48:32 13/10/2017,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://arduino-tweet.appspot.com/"" rel=""nofollow"">Arduino</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
3,988175627479146498,859148218915069952,Alien Mutterschiff sozusagen \n(Hab das Gefühl bin schwanger) \n(Das gibt dann rr Stress) https://t.co/z4uHz2dugO,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
4,988175681996681218,3206249890,@MDegen55 🇩🇪🇩🇪 Gute Nacht 🌃🌃🌃 https://t.co/OTnsAsDGrc,Sun Apr 22 22:00:16 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,False,,False,...,0,False,False,,3206249890.0,0,False,,,


In [22]:
# Show the sampled tweets whose id appears as tweet_id on a URL row that
# resolved to this one specific YouTube video.
tweetsS[tweetsS['id'].isin(urlsS[urlsS['resolved_url'] == "https://www.youtube.com/watch?v=tzni9QRNMEU&feature=youtu.be"]['tweet_id'])]

Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
89443,990199324620472320,1099570976,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:01:30 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89456,990199702112030720,1710032354,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:00 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89469,990199773423636480,73732277,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:17 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89478,990199878268669952,744423667178823680,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:42 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89482,990199907624542211,845826939856216064,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:03:49 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89495,990200083810467841,1101142141,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:04:31 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89508,990200415156293632,1097049314,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:05:50 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89573,990201727952572418,2399086608,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:11:03 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89605,990202420016840704,1540314288,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:13:48 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,
89629,990202940102098944,4158510612,RT @RealVIXX: [VIXX TV cookie]\n빅스(VIXX) VIXX TV cookie #4\n\n▶️ https://t.co/Sj8xa5T2V7\n▶️ https://t.co/uoM2uZ7WIr\n\n#빅스 #VIXX #VIXX_TV #cooki…,Sat Apr 28 12:15:52 +0000 2018,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",de,False,True,9.901993e+17,False,...,0,False,False,빅스 VIXX VIXX_TV,570842497.0,2,False,,,


## Query Hashtags

In [25]:
def getHashtagsFromTopLevelDomain(tld, urls, hashtags):
    """Select the hashtag rows belonging to tweets that link to a given domain.

    Args:
        tld: value to match exactly against the 'top_level_domain' column of `urls`.
        urls: DataFrame with at least 'tweet_id' and 'top_level_domain' columns.
        hashtags: DataFrame with at least a 'tweet_id' column.

    Returns:
        The rows of `hashtags` (original index and order preserved) whose
        'tweet_id' occurs among the `urls` rows matching `tld`.
    """
    # Tweet ids of every URL row that points at the requested domain.
    domain_rows = urls.loc[urls['top_level_domain'] == tld]
    domain_tweet_ids = domain_rows['tweet_id']

    # Boolean mask over the hashtag rows, then a plain row selection.
    belongs_to_domain = hashtags['tweet_id'].isin(domain_tweet_ids)
    return hashtags.loc[belongs_to_domain]

### Sampled

In [26]:
# Read every row of tweets_hashtags through the connS03 connection, then keep
# only the hashtags attached to tweets whose URL has top-level domain youtube.com.
hashtagsS03 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connS03)
print("# of hashtags sampled 03: %s" % len(hashtagsS03))

hashtagsS03 = getHashtagsFromTopLevelDomain('youtube.com', urlsS03, hashtagsS03)
print("# of youtube hashtags sampled 03: %s" % len(hashtagsS03))

# Same two steps for the connS17 connection.
hashtagsS17 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connS17)
print("# of hashtags sampled 17: %s" % len(hashtagsS17))

hashtagsS17 = getHashtagsFromTopLevelDomain('youtube.com', urlsS17, hashtagsS17)
print("# of youtube hashtags sampled 17: %s" % len(hashtagsS17))

# Stack both parts (17 first, then 03; original row indices are kept).
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0; the resulting frame is the same.
hashtagsS = pd.concat([hashtagsS17, hashtagsS03])
print("# of youtube hashtags sampled: %s" % len(hashtagsS))

hashtagsS.head()

# of hashtags sampled 03: 84203
# of youtube hashtags sampled 03: 1450
# of hashtags sampled 17: 73935
# of youtube hashtags sampled 17: 1750
# of youtube hashtags sampled: 3200


Unnamed: 0,tweet_id,hashtag
59,988077828091580427,nullsechsTV
60,988077828091580427,SCPHFC
61,988077828091580427,scp06
62,988177561053286401,nullsechsTV
63,988177561053286401,SCPHFC


In [24]:
# Row count of urlsF (defined outside this section; presumably the combined filtered-stream URL frame — confirm).
# NOTE(review): this cell carries execution count 24 while its neighbours are 26 and 27,
# i.e. it was run out of order; re-run the notebook top to bottom before trusting the value.
len(urlsF)

439201

### Filtered

In [27]:
# Read every row of tweets_hashtags through the connF03 connection, then keep
# only the hashtags attached to tweets whose URL has top-level domain youtube.com.
hashtagsF03 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connF03)
print("# of hashtags filtered 03: %s" % len(hashtagsF03))

hashtagsF03 = getHashtagsFromTopLevelDomain('youtube.com', urlsF03, hashtagsF03)
print("# of youtube hashtags filtered 03: %s" % len(hashtagsF03))

# Same two steps for the connF17 connection.
hashtagsF17 = pd.read_sql_query("SELECT * FROM tweets_hashtags", connF17)
print("# of hashtags filtered 17: %s" % len(hashtagsF17))

hashtagsF17 = getHashtagsFromTopLevelDomain('youtube.com', urlsF17, hashtagsF17)
print("# of youtube hashtags filtered 17: %s" % len(hashtagsF17))

# Stack both parts (17 first, then 03; original row indices are kept).
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0; the resulting frame is the same.
hashtagsF = pd.concat([hashtagsF17, hashtagsF03])
print("# of youtube hashtags filtered: %s" % len(hashtagsF))

hashtagsF.head()

# of hashtags filtered 03: 4377831
# of youtube hashtags filtered 03: 68218
# of hashtags filtered 17: 3780197
# of youtube hashtags filtered 17: 70375
# of youtube hashtags filtered: 138593


Unnamed: 0,tweet_id,hashtag
119,988175640607195137,egoFM
120,988175640607195137,musicinbetween
239,988175711453302789,PS4live
271,988160198480154624,Bremen
272,988160198480154624,AfD


## Hashtags

### Sampled vs Filtered

In [30]:
# `attribute` selects the column to count in the hashtag frames; `column_name`
# is the column label handed to the ranking/comparison helpers.
attribute = 'hashtag'
column_name = 'hashtag'

# Occurrences of each distinct hashtag in the sampled stream, turned into a ranking frame.
hashtag_counts_sampled = hashtagsS[attribute].value_counts()
df_sampled = generateRankingDataframe(hashtag_counts_sampled, column_name)

# Same for the filtered stream.
hashtag_counts_filtered = hashtagsF[attribute].value_counts()
df_filtered = generateRankingDataframe(hashtag_counts_filtered, column_name)

# NOTE(review): 50 is presumably the number of top-ranked entries to compare —
# confirm against generateComparisonDataframes.
comparison_frames = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)
df_sampled_comparison, df_filtered_comparison = comparison_frames

In [31]:
# Build the display object for the sampled-side hashtag comparison; the string is
# presumably used as the table caption (confirm in getPrettyComparisonDataframe).
s = getPrettyComparisonDataframe(df_sampled_comparison, "Youtube Hashtags - Sampled Data (compared to Filtered Data)")
# Bare last expression so the notebook renders the result.
s

Unnamed: 0_level_0,difference (rank / percentage / diff),hashtag,percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (17434. / 0.001% / 2.281%),빅스,2.28125,73
2,-,VIXX_TV,2.28125,73
3,v (4972. / 0.003% / 2.278%),VIXX,2.28125,73
4,v (116. / 0.066% / 1.684%),ForTheLoveOf,1.75,56
5,v (185. / 0.049% / 1.701%),GlobeHarryStyles,1.75,56
6,v (9. / 0.513% / 0.737%),AfD,1.25,40
7,^ (3. / 1.465% / -0.215%),letsplay,1.25,40
8,^ (2. / 1.593% / -0.437%),LetsPlay,1.15625,37
9,^ (4. / 1.152% / -0.058%),youtube,1.09375,35
10,^ (5. / 0.763% / 0.237%),YouTube,1.0,32


In [32]:
# Build the display object for the filtered-side hashtag comparison. The title
# names hashtags (not URLs) so it matches the data shown and the sampled-side
# table's "Youtube Hashtags - ..." title.
s = getPrettyComparisonDataframe(df_filtered_comparison, "Youtube Hashtags - Filtered Data (compared to Sampled Data)")
# Bare last expression so the notebook renders the result.
s

Unnamed: 0_level_0,difference (rank / percentage / diff),hashtag,percentage,value
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,v (12. / 0.719% / 1.027%),PS4live,1.7454,2419
2,v (8. / 1.156% / 0.437%),LetsPlay,1.59315,2208
3,v (7. / 1.250% / 0.215%),letsplay,1.46472,2030
4,v (9. / 1.094% / 0.058%),youtube,1.15157,1596
5,v (10. / 1.000% / -0.237%),YouTube,0.763386,1058
6,v (17. / 0.406% / 0.316%),gaming,0.722259,1001
7,v (15. / 0.469% / 0.151%),GermanMediaRT,0.6198,859
8,v (13. / 0.625% / -0.014%),Nachtschicht,0.611142,847
9,^ (6. / 1.250% / -0.737%),AfD,0.513013,711
10,v (14. / 0.562% / -0.093%),Video,0.469721,651
