# Shared URL Comparison

In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
pd.set_option('display.max_colwidth', -1)
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

directory = "url_top_lists/"
stream = "comparison"

## Util Methods

In [8]:
def compareRows(row, df_to_compare, column_name):
    """Describe how ``row`` is placed in ``df_to_compare``.

    The entry of ``df_to_compare`` whose ``column_name`` value equals
    ``row[column_name]`` is looked up (first match wins).

    Returns:
        ``" - "`` when there is no matching entry; otherwise a marker
        (``" = "``, ``" v "`` or ``" ^ "``) followed by ``<br>`` and
        ``(rank. / percentage% / diff%)``, where rank and percentage belong to
        the matched entry and diff is ``row['percentage']`` minus the matched
        percentage. ``" v "`` means the matched rank number is larger than
        ``row['rank']``, ``" ^ "`` that it is smaller.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    other_rank = matches['rank'].values[0]
    other_percentage = matches['percentage'].values[0]
    difference = "(%s. / %.3f%% / %.3f%%)" % (
        other_rank,
        other_percentage,
        row['percentage'] - other_percentage,
    )

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + difference

def getOpacity(val):
    """Map the percentage difference embedded in a comparison string to an opacity.

    ``val`` is expected to look like ``"<marker>(rank. / pct% / diff%)"``; the
    number in the third ``/``-separated part is read and its absolute value is
    bucketed: the smaller the difference, the higher the opacity.

    Returns:
        A number between 0.05 and 1. Differences of 100 or more get the
        lowest opacity (the original fell through and returned ``None``,
        which produced an invalid ``rgba(..., None)`` CSS value).

    Raises:
        IndexError: if ``val`` has fewer than three ``/``-separated parts or
            the third part contains no number.
    """
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", val.split("/")[2])[0]))

    # (exclusive upper bound, opacity), checked in ascending order.
    buckets = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
        (100, 0.05),
    )
    for upper_bound, opacity in buckets:
        if value < upper_bound:
            return opacity
    # Anything at or beyond the last bound keeps the faintest colour.
    return 0.05

def colorComparisonField(val):
    """Return the CSS background rule for one table cell.

    Strings containing a rank-change marker (``' ^ '`` / ``' v '``) or the
    equal-rank marker (``' = '``) get a colour whose alpha comes from
    ``getOpacity``; the bare ``' - '`` placeholder gets a solid colour.
    Every other value, including non-strings, gets an empty rule.
    """
    if not isinstance(val, str):
        return ''

    if any(marker in val for marker in (' ^ ', ' v ')):
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)
    if ' = ' in val:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)
    # Only the exact three-character placeholder counts as "not found".
    if val == ' - ':
        return 'background-color: #e55039'
    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a counts Series into a ranking table.

    Args:
        series: Series whose index holds the items and whose values hold
            their counts, already in ranking order (e.g. the result of
            ``value_counts()``).
        attribute_name: Name of the output column that receives the index
            labels.

    Returns:
        DataFrame with the columns ``rank`` (1-based position in ``series``),
        ``attribute_name``, ``value`` (the count) and ``percentage``
        (share of the count in the sum of all counts, times 100).
    """
    size = series.sum()
    counts = series.to_numpy()

    # Vectorised replacement for the former Series.iteritems() loop
    # (iteritems() no longer exists in current pandas).
    data = {
        'rank': list(range(1, len(series) + 1)),
        attribute_name: list(series.index),
        'value': counts,
        'percentage': (counts / size) * 100,
    }
    return pd.DataFrame(data=data)

def _buildComparisonFrame(df, other, column_name, size):
    """Return the first ``size`` rows of ``df`` annotated against ``other``, indexed by rank."""
    # Filter before iterating: iterrows() is slow, and the ranking frames can
    # hold millions of rows of which only `size` are needed.
    top_rows = df[df['rank'] <= size]
    compare_list = [compareRows(row, other, column_name) for _, row in top_rows.iterrows()]

    data = {'rank': df['rank'][:size], column_name: df[column_name][:size], 'value': df['value'][:size], 'percentage': df['percentage'][:size],
            'difference (rank / percentage / diff)': compare_list}

    compared = pd.DataFrame(data=data)
    compared.set_index(keys='rank', inplace=True)
    return compared


def generateComparisonDataframes(df1, df2, column_name, size):
    """Cross-compare the top ``size`` entries of two ranking frames.

    Args:
        df1, df2: Ranking frames with the columns ``rank``, ``column_name``,
            ``value`` and ``percentage``.
        column_name: Column holding the item that identifies a row in both
            frames.
        size: Number of top-ranked rows to keep from each frame.

    Returns:
        ``(df1_compared, df2_compared)``: each is the top ``size`` rows of
        the respective frame, indexed by ``rank``, with an extra column
        ``difference (rank / percentage / diff)`` produced by ``compareRows``
        against the other frame.
    """
    df1_compared = _buildComparisonFrame(df1, df2, column_name, size)
    df2_compared = _buildComparisonFrame(df2, df1, column_name, size)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler for ``df`` with per-cell colouring, ``title`` as caption and the shared table CSS."""
    return (
        df.style
        .applymap(colorComparisonField)
        .set_caption(title)
        .set_table_styles(table_styles)
    )

In [3]:
conn = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()

    # connect to the PostgreSQL server; the four connections stay open
    # because the query cells below read from them
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)

    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # display the PostgreSQL database server version of each connection
    print('PostgreSQL database version:')
    for connection in (connS17, connS03, connF17, connF03):
        # the cursor context manager closes the cursor even when the query
        # raises, which the manual close() calls at the end did not
        with connection.cursor() as cursor:
            cursor.execute('SELECT version()')
            print(cursor.fetchone())

except Exception as error:
    # psycopg2.DatabaseError is already an Exception subclass, so one clause
    # is enough. NOTE: the error is only printed; if connecting failed, the
    # cells below will fail with NameError on the missing connection names.
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)


## Query URLs

### Calendar Week 03 - Sampled

In [4]:
#tweets03 = pd.read_sql_query("SELECT * FROM tweets_info;", conn, parse_dates=['created_at'] )
#tweets03['created_at'] = tweets['created_at'].dt.tz_localize("UTC").dt.tz_convert("Europe/Berlin")
# Load every row of `tweets_urls` through the week-03 sampled connection.
tweetsS03 = pd.read_sql_query(sql="SELECT * FROM tweets_urls;", con=connS03)

print("Number of Tweets: %s" % tweetsS03.shape[0])
tweetsS03.head(n=5)

Number of Tweets: 40339


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,953585808631631872,https://twitter.com/Zockerli/status/953370777017094144,https://twitter.com/Zockerli/status/953370777017094144,200.0,https://twitter.com/,twitter.com,True,False,True,False
1,952676758066728960,http://bit.ly/2D8B0mK,http://dalyaakota.ru/mghnfgnrtgfgnfgbfbrgdfnfgf/6733,,http://dalyaakota.ru/,dalyaakota.ru,False,,True,True
2,952676816820621314,http://youtu.be/xo1eqTeG5DY?a,https://www.youtube.com/watch?v=xo1eqTeG5DY&feature=youtu.be&a,200.0,https://www.youtube.com/,youtube.com,False,False,True,False
3,952676760390336512,http://on.wusa9.com/2D0gSit,https://www.wusa9.com/article/news/nation-world/amazon-ceo-and-his-wife-donate-33m-scholarship-grant-for-daca-students/65-507939971,200.0,https://www.wusa9.com/,wusa9.com,False,False,True,False
4,952677085230915589,https://www.refcrime.info/de/Crime/Show/74471?tkn=EA05559E0B0B10F0,https://www.refcrime.info/de/Crime/Show/74471?tkn=EA05559E0B0B10F0,200.0,https://www.refcrime.info/,refcrime.info,False,False,True,False


### Calendar Week 17 - Sampled

In [5]:
# Load every row of `tweets_urls` through the week-17 sampled connection.
tweetsS17 = pd.read_sql_query(sql="SELECT * FROM tweets_urls;", con=connS17)

print("Number of Tweets: %s" % tweetsS17.shape[0])
tweetsS17.head(n=5)

Number of Tweets: 35687


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,989097603664138240,http://arte.tv/abgedreht,https://www.arte.tv/de/videos/RC-014033/abgedreht/,200.0,https://www.arte.tv/,arte.tv,False,False,True,False
1,988175933659021318,https://twitter.com/piersmorgan/status/987388203593322496,https://twitter.com/piersmorgan/status/987388203593322496,200.0,https://twitter.com/,twitter.com,True,False,True,False
2,988176164358361088,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,200.0,https://www.journal.koeln/,journal.koeln,False,False,True,False
3,988176403412766720,http://www.radionomy.com/erika1,https://www.radionomy.com/en/radio/erika1,200.0,https://www.radionomy.com/,radionomy.com,False,False,True,False
4,988176415995592704,https://twitter.com/JanLatten/status/988161845205913600,https://twitter.com/JanLatten/status/988161845205913600,200.0,https://twitter.com/,twitter.com,True,False,True,False


### Calendar Week 03 - Filtered

In [6]:
# Load every row of `tweets_urls` through the week-03 filtered connection.
tweetsF03 = pd.read_sql_query(sql="SELECT * FROM tweets_urls;", con=connF03)

print("Number of Tweets: %s" % tweetsF03.shape[0])
tweetsF03.head(n=5)

Number of Tweets: 2411523


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,953683515803099138,http://www.hda-koeln.de,http://www.hda-koeln.de/,200.0,http://www.hda-koeln.de/,hda-koeln.de,False,False,True,False
1,954032649353465856,https://www.amazon.de/diesem-hei%C3%9Fen-Sommer-Eva-Maria-Farohi-ebook/dp/B075KHFTJW/ref=la_B00YYC9ZUG_1_10?s=books&ie=UTF8&qid=1505325422&sr=1-10,https://www.amazon.de/diesem-hei%C3%9Fen-Sommer-Eva-Maria-Farohi-ebook/dp/B075KHFTJW/ref=la_B00YYC9ZUG_1_10?s=books&ie=UTF8&qid=1505325422&sr=1-10,503.0,https://www.amazon.de/,amazon.de,False,False,True,False
2,954032649483554816,https://fb.me/P063el3s,https://www.facebook.com/hellwegradio/posts/1824843810872037,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
3,954032649315655680,http://dlvr.it/QBmbks,https://www.amateurseite.com/index.php?fsk=16&mediagallery_id=4113344&utm_source=dlvr.it&utm_medium=twitter&geoip_cc=DE&geoip_lat=51.0500&geoip_lon=13.7500&main=mediagallery&page=view_video&template=page_view_video&wm=6460&ts=rss&prm=rev&pr=Stripbunnies,200.0,https://www.amateurseite.com/,amateurseite.com,False,False,True,False
4,954032649487527938,http://dlvr.it/QBmbkh,https://news.google.com/?sa=t&fd=R&ct2=de&usg=AFQjCNE-JoIbC8dsXduut9q4Oxp_EUQnbg&clid=c3a7d30bb8a4878e06b80cf16b898331&cid=52780764023198&ei=tc9gWtj6E5PthQGj-YaQAQ&url=http://www.abendzeitung-muenchen.de/inhalt.videodreh-in-thailand-unfall-in-thailand-marc-terenzi-zeigt-seine-verletzungen.7b7c38da-15eb-4652-b5fb-6806bde81bae.html&utm_source=dlvr.it&utm_medium=twitter&taa=1&hl=en-US&gl=US&ceid=US:en,200.0,https://news.google.com/,google.com,False,False,True,False


### Calendar Week 17 - Filtered

In [7]:
# Load every row of `tweets_urls` through the week-17 filtered connection.
tweetsF17 = pd.read_sql_query(sql="SELECT * FROM tweets_urls;", con=connF17)

print("Number of Tweets: %s" % tweetsF17.shape[0])
tweetsF17.head(n=5)

Number of Tweets: 2107279


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988437985363406853,https://www.facebook.com/nadjashah/posts/10215947002527277,https://www.facebook.com/nadjashah/posts/10215947002527277,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
1,988437986797916160,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
2,988437989712957440,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,200.0,http://www.faz.net/,faz.net,False,False,True,False
3,988437989704568832,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,200.0,http://vera-lengsfeld.de/,vera-lengsfeld.de,False,False,True,False
4,988437990732128256,https://ift.tt/2Jm4wnp,https://news.google.com/?sa=t&fd=R&ct2=de&usg=AFQjCNFSz3Yf_fDFIytLtnNe8JNGM2BPZg&clid=c3a7d30bb8a4878e06b80cf16b898331&ei=O_rdWqDIHNCT3QHCyJDQAg&url=https://www.waz.de/kultur/fuer-silke-j-raebiger-ist-es-das-letzte-frauen-filmfestival-id214100661.html&taa=1&hl=en-US&gl=US&ceid=US:en,200.0,https://news.google.com/,google.com,False,False,True,False


## Comparing Top Short URLs shared

### Calendar Week 03 - Short URL - Sampled x Filtered

In [10]:
# `attribute` is the source column, `column_name` the label used in the tables.
attribute, column_name = 'short_url', 'short url'

# Rank the short URLs of both week-03 data sets ...
df_sampled = generateRankingDataframe(series=tweetsS03[attribute].value_counts(), attribute_name=column_name)
df_filtered = generateRankingDataframe(series=tweetsF03[attribute].value_counts(), attribute_name=column_name)

# ... and cross-compare their top 50 entries.
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=50)

In [12]:
# Render the sampled-side comparison as a styled table (last expression is displayed).
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 03 - Short URLs - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,short url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,http://youtu.be/Dw-2hh6G_D8?a,68,0.168571,= (1. / 0.275% / -0.106%)
2,http://www.dwd.de,52,0.128908,= (2. / 0.272% / -0.143%)
3,http://tinyurl.com/q3qf2ej,50,0.12395,= (3. / 0.221% / -0.097%)
4,http://www.BLACKPINKYG.COM,48,0.118992,-
5,http://www.messe.tv,41,0.101639,^ (4. / 0.144% / -0.042%)
6,http://www.the-sz.com/products/vbbinfo/?f=3,34,0.0842857,^ (5. / 0.123% / -0.039%)
7,http://bit.ly/1Y1aRZ1,33,0.0818067,v (12. / 0.085% / -0.003%)
8,http://wx3.sinaimg.cn/large/a157f83bly1fnk017dgu2j20zk0qogoh.jpg,31,0.0768487,-
9,https://farm5.staticflickr.com/4658/27991711159_fca7ac03bb_o.jpg,31,0.0768487,v (105906. / 0.000% / 0.077%)
10,https://www.instagram.com/p/BeGyLOHn6YR/?taken-by=exopassion12,31,0.0768487,-


## Comparing Top URLs shared

### Calendar Week 03 - Sampled x Filtered

In [8]:
# Occurrences of each resolved URL in the week-03 sampled / filtered data.
top_urls_S03 = tweetsS03.loc[:, 'resolved_url'].value_counts()
top_urls_F03 = tweetsF03.loc[:, 'resolved_url'].value_counts()

In [9]:
# Build the ranking table for the counts currently held in `top_urls_S03`.
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_S03_sum = top_urls_S03.sum()
top_urls_S03_rank = list(range(1, len(top_urls_S03) + 1))  # position in the counts Series == rank
top_urls_S03_url = top_urls_S03.index.tolist()
top_urls_S03_count = top_urls_S03.tolist()
top_urls_S03_percentage = [(value / top_urls_S03_sum) * 100 for value in top_urls_S03_count]

data = {'rank': top_urls_S03_rank,'url':top_urls_S03_url, 'value': top_urls_S03_count, 'percentage': top_urls_S03_percentage}
df_top_urls_S03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_S03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,https://twitter.com/account/suspended,183,0.453655
1,2,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68,0.168571
2,3,https://www.dwd.de/,52,0.128908
3,4,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,50,0.12395
4,5,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,50,0.12395
5,6,http://www.blackpinkyg.com/,48,0.118992
6,7,http://www.messe.tv/,41,0.101639
7,8,http://www.the-sz.com/products/vbbinfo/?f=3,34,0.084286
8,9,http://der-x-code.com/,33,0.081807
9,10,https://farm5.staticflickr.com/4632/25898289958_0b5d369677_o.jpg,31,0.076849


In [10]:
# Build the ranking table for the counts currently held in `top_urls_F03`.
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_F03_sum = top_urls_F03.sum()
top_urls_F03_rank = list(range(1, len(top_urls_F03) + 1))  # position in the counts Series == rank
top_urls_F03_url = top_urls_F03.index.tolist()
top_urls_F03_count = top_urls_F03.tolist()
top_urls_F03_percentage = [(value / top_urls_F03_sum) * 100 for value in top_urls_F03_count]

data = {'rank': top_urls_F03_rank,'url':top_urls_F03_url, 'value': top_urls_F03_count, 'percentage': top_urls_F03_percentage}
df_top_urls_F03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_F03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,https://twitter.com/account/suspended,8470,0.35123
1,2,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623,0.27464
2,3,https://www.dwd.de/,6571,0.272483
3,4,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5323,0.220732
4,5,http://www.messe.tv/,3471,0.143934
5,6,http://www.the-sz.com/products/vbbinfo/?f=3,2973,0.123283
6,7,https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das,2454,0.101761
7,8,https://sec.help.ch,2315,0.095997
8,9,https://www.facebook.com/CODECODE1111111111/,2229,0.092431
9,10,http://der-x-code.com/,2057,0.085299


In [11]:
# Annotate the top-50 sampled entries with their position in the filtered ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_F03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_S03[df_top_urls_S03['rank'] <= 50].iterrows()
]

data = {'rank': top_urls_S03_rank[:50],'url':top_urls_S03_url[:50], 'value': top_urls_S03_count[:50], 'percentage': top_urls_S03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_S03_F03_comparison = pd.DataFrame(data=data)
df_top_urls_S03_F03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_S03_F03_comparison, "Calendar Week 03: Sampled URLs x Filtered URLs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://twitter.com/account/suspended,183,0.453655,= (1. / 0.351% / 0.102%)
2,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,68,0.168571,= (2. / 0.275% / -0.106%)
3,https://www.dwd.de/,52,0.128908,= (3. / 0.272% / -0.144%)
4,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,50,0.12395,v (35. / 0.044% / 0.080%)
5,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,50,0.12395,^ (4. / 0.221% / -0.097%)
6,http://www.blackpinkyg.com/,48,0.118992,-
7,http://www.messe.tv/,41,0.101639,^ (5. / 0.144% / -0.042%)
8,http://www.the-sz.com/products/vbbinfo/?f=3,34,0.0842857,^ (6. / 0.123% / -0.039%)
9,http://der-x-code.com/,33,0.0818067,v (10. / 0.085% / -0.003%)
10,https://farm5.staticflickr.com/4632/25898289958_0b5d369677_o.jpg,31,0.0768487,-


In [12]:
# Annotate the top-50 filtered entries with their position in the sampled ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_S03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_F03[df_top_urls_F03['rank'] <= 50].iterrows()
]

data = {'rank': top_urls_F03_rank[:50],'url':top_urls_F03_url[:50], 'value': top_urls_F03_count[:50], 'percentage': top_urls_F03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_F03_S03_comparison = pd.DataFrame(data=data)
df_top_urls_F03_S03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_F03_S03_comparison, "Calendar Week 03: Filtered URLs x Sampled URLs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://twitter.com/account/suspended,8470,0.35123,= (1. / 0.454% / -0.102%)
2,https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a,6623,0.27464,= (2. / 0.169% / 0.106%)
3,https://www.dwd.de/,6571,0.272483,= (3. / 0.129% / 0.144%)
4,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5323,0.220732,v (5. / 0.124% / 0.097%)
5,http://www.messe.tv/,3471,0.143934,v (7. / 0.102% / 0.042%)
6,http://www.the-sz.com/products/vbbinfo/?f=3,2973,0.123283,v (8. / 0.084% / 0.039%)
7,https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das,2454,0.101761,v (14. / 0.074% / 0.027%)
8,https://sec.help.ch,2315,0.0959974,-
9,https://www.facebook.com/CODECODE1111111111/,2229,0.0924312,v (22. / 0.052% / 0.040%)
10,http://der-x-code.com/,2057,0.0852988,^ (9. / 0.082% / 0.003%)


### Calendar Week 17 - Sampled x Filtered

In [13]:
# Occurrences of each resolved URL in the week-17 sampled / filtered data.
# NOTE: the `_S03` / `_F03` names are reused here although the data is week 17.
top_urls_S03 = tweetsS17.loc[:, 'resolved_url'].value_counts()
top_urls_F03 = tweetsF17.loc[:, 'resolved_url'].value_counts()

In [14]:
# Build the ranking table for the counts currently held in `top_urls_S03`
# (the `_S03` name is reused across sections of this notebook).
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_S03_sum = top_urls_S03.sum()
top_urls_S03_rank = list(range(1, len(top_urls_S03) + 1))  # position in the counts Series == rank
top_urls_S03_url = top_urls_S03.index.tolist()
top_urls_S03_count = top_urls_S03.tolist()
top_urls_S03_percentage = [(value / top_urls_S03_sum) * 100 for value in top_urls_S03_count]

data = {'rank': top_urls_S03_rank,'url':top_urls_S03_url, 'value': top_urls_S03_count, 'percentage': top_urls_S03_percentage}
df_top_urls_S03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_S03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,https://www.dwd.de/,60,0.168128
1,2,https://www.ffd365.de,57,0.159722
2,3,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,56,0.15692
3,4,http://susanne-ulrike-maria-albrecht.over-blog.de/,53,0.148513
4,5,https://twitter.com/buzzfeedfrance/status/988362712513171456,47,0.131701
5,6,https://twitter.com/Markus_Soeder/status/988768341820170240,43,0.120492
6,7,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,37,0.103679
7,8,https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be,36,0.100877
8,9,https://www.linkedin.com/,36,0.100877
9,10,http://tv.naver.com/v/3097997,36,0.100877


In [15]:
# Build the ranking table for the counts currently held in `top_urls_F03`
# (the `_F03` name is reused across sections of this notebook).
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_F03_sum = top_urls_F03.sum()
top_urls_F03_rank = list(range(1, len(top_urls_F03) + 1))  # position in the counts Series == rank
top_urls_F03_url = top_urls_F03.index.tolist()
top_urls_F03_count = top_urls_F03.tolist()
top_urls_F03_percentage = [(value / top_urls_F03_sum) * 100 for value in top_urls_F03_count]

data = {'rank': top_urls_F03_rank,'url':top_urls_F03_url, 'value': top_urls_F03_count, 'percentage': top_urls_F03_percentage}
df_top_urls_F03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_F03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,https://sec.help.ch,5615,0.266457
1,2,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5088,0.241449
2,3,https://www.ffd365.de,4791,0.227355
3,4,https://www.dwd.de/,4566,0.216678
4,5,http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/,3474,0.164857
5,6,https://www.linkedin.com/,2935,0.139279
6,7,http://www.the-sz.com/products/vbbinfo/?f=3,2739,0.129978
7,8,https://www.miet-check.de/,2420,0.11484
8,9,https://mobile.twitter.com/Markus_Soeder/status/988768341820170240,2282,0.108291
9,10,https://www.radionomy.com/en/radio/erika1,2186,0.103736


In [16]:
# Annotate the top-50 sampled entries with their position in the filtered ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_F03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_S03[df_top_urls_S03['rank'] <= 50].iterrows()
]

data = {'rank': top_urls_S03_rank[:50],'url':top_urls_S03_url[:50], 'value': top_urls_S03_count[:50], 'percentage': top_urls_S03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_S03_F03_comparison = pd.DataFrame(data=data)
df_top_urls_S03_F03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_S03_F03_comparison, "Calendar Week 17: Sampled URLs x Filtered URLs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://www.dwd.de/,60,0.168128,v (4. / 0.217% / -0.049%)
2,https://www.ffd365.de,57,0.159722,v (3. / 0.227% / -0.068%)
3,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,56,0.15692,^ (2. / 0.241% / -0.085%)
4,http://susanne-ulrike-maria-albrecht.over-blog.de/,53,0.148513,v (19. / 0.077% / 0.071%)
5,https://twitter.com/buzzfeedfrance/status/988362712513171456,47,0.131701,v (87. / 0.020% / 0.111%)
6,https://twitter.com/Markus_Soeder/status/988768341820170240,43,0.120492,v (91. / 0.019% / 0.101%)
7,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,37,0.103679,v (887. / 0.004% / 0.100%)
8,https://www.youtube.com/watch?v=tBWJ3bvJuT0&feature=youtu.be,36,0.100877,-
9,https://www.linkedin.com/,36,0.100877,^ (6. / 0.139% / -0.038%)
10,http://tv.naver.com/v/3097997,36,0.100877,-


In [17]:
# Annotate the top-50 filtered entries with their position in the sampled ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_S03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_F03[df_top_urls_F03['rank'] <= 50].iterrows()
]

data = {'rank': top_urls_F03_rank[:50],'url':top_urls_F03_url[:50], 'value': top_urls_F03_count[:50], 'percentage': top_urls_F03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_F03_S03_comparison = pd.DataFrame(data=data)
df_top_urls_F03_S03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_F03_S03_comparison, "Calendar Week 17: Filtered URLs x Sampled URLs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://sec.help.ch,5615,0.266457,-
2,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,5088,0.241449,v (3. / 0.157% / 0.085%)
3,https://www.ffd365.de,4791,0.227355,^ (2. / 0.160% / 0.068%)
4,https://www.dwd.de/,4566,0.216678,^ (1. / 0.168% / 0.049%)
5,http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/,3474,0.164857,v (12. / 0.095% / 0.070%)
6,https://www.linkedin.com/,2935,0.139279,v (9. / 0.101% / 0.038%)
7,http://www.the-sz.com/products/vbbinfo/?f=3,2739,0.129978,v (17. / 0.076% / 0.054%)
8,https://www.miet-check.de/,2420,0.11484,v (25. / 0.053% / 0.062%)
9,https://mobile.twitter.com/Markus_Soeder/status/988768341820170240,2282,0.108291,-
10,https://www.radionomy.com/en/radio/erika1,2186,0.103736,v (24. / 0.056% / 0.048%)


## Comparing Top Level Domains shared

### Calendar Week 03: Top Level Domains

In [18]:
# Occurrences of each `top_level_domain` value in the week-03 sampled / filtered data.
# NOTE: the `top_urls_*` names are reused here although they now hold domain counts.
top_urls_S03 = tweetsS03.loc[:, 'top_level_domain'].value_counts()
top_urls_F03 = tweetsF03.loc[:, 'top_level_domain'].value_counts()

In [19]:
# Build the ranking table for the counts currently held in `top_urls_S03`
# (the name is reused across sections of this notebook).
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_S03_sum = top_urls_S03.sum()
top_urls_S03_rank = list(range(1, len(top_urls_S03) + 1))  # position in the counts Series == rank
top_urls_S03_url = top_urls_S03.index.tolist()
top_urls_S03_count = top_urls_S03.tolist()
top_urls_S03_percentage = [(value / top_urls_S03_sum) * 100 for value in top_urls_S03_count]

data = {'rank': top_urls_S03_rank,'url':top_urls_S03_url, 'value': top_urls_S03_count, 'percentage': top_urls_S03_percentage}
df_top_urls_S03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_S03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,twitter.com,6362,15.780727
1,2,youtube.com,3262,8.091281
2,3,facebook.com,1000,2.480466
3,4,instagram.com,727,1.803299
4,5,welt.de,688,1.706561
5,6,spiegel.de,470,1.165819
6,7,focus.de,377,0.935136
7,8,amazon.de,372,0.922733
8,9,google.com,338,0.838398
9,10,bild.de,336,0.833437


In [20]:
# Build the ranking table for the counts currently held in `top_urls_F03`
# (the name is reused across sections of this notebook).
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_F03_sum = top_urls_F03.sum()
top_urls_F03_rank = list(range(1, len(top_urls_F03) + 1))  # position in the counts Series == rank
top_urls_F03_url = top_urls_F03.index.tolist()
top_urls_F03_count = top_urls_F03.tolist()
top_urls_F03_percentage = [(value / top_urls_F03_sum) * 100 for value in top_urls_F03_count]

data = {'rank': top_urls_F03_rank,'url':top_urls_F03_url, 'value': top_urls_F03_count, 'percentage': top_urls_F03_percentage}
df_top_urls_F03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_F03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,twitter.com,348905,14.474755
1,2,youtube.com,233498,9.686953
2,3,facebook.com,65092,2.700422
3,4,instagram.com,39391,1.634184
4,5,google.com,36688,1.522047
5,6,welt.de,31993,1.327269
6,7,spiegel.de,26804,1.111997
7,8,amazon.de,22972,0.953022
8,9,focus.de,22507,0.933731
9,10,twitch.tv,20801,0.862955


In [21]:
# Annotate the top-50 sampled entries with their position in the filtered ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_F03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_S03[df_top_urls_S03['rank'] <= 50].iterrows()
]

data = {'rank': top_urls_S03_rank[:50],'url':top_urls_S03_url[:50], 'value': top_urls_S03_count[:50], 'percentage': top_urls_S03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_S03_F03_comparison = pd.DataFrame(data=data)
df_top_urls_S03_F03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_S03_F03_comparison, "Calendar Week 03: Sampled TLDs x Filtered TLDs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,6362,15.7807,= (1. / 14.475% / 1.306%)
2,youtube.com,3262,8.09128,= (2. / 9.687% / -1.596%)
3,facebook.com,1000,2.48047,= (3. / 2.700% / -0.220%)
4,instagram.com,727,1.8033,= (4. / 1.634% / 0.169%)
5,welt.de,688,1.70656,v (6. / 1.327% / 0.379%)
6,spiegel.de,470,1.16582,v (7. / 1.112% / 0.054%)
7,focus.de,377,0.935136,v (9. / 0.934% / 0.001%)
8,amazon.de,372,0.922733,= (8. / 0.953% / -0.030%)
9,google.com,338,0.838398,^ (5. / 1.522% / -0.684%)
10,bild.de,336,0.833437,v (17. / 0.645% / 0.188%)


In [22]:
# Annotate the top-50 filtered entries with their position in the sampled ranking.
# compareRows() takes the key column as third argument; calling it with two
# arguments raises TypeError with the helper as defined above.
compare_list = [
    compareRows(row, df_top_urls_S03, 'url')
    # filter first so iterrows() only walks the rows that are actually used
    for _, row in df_top_urls_F03[df_top_urls_F03['rank'] <= 50].iterrows()
]

# The column label now names all three parts of the value (rank / percentage /
# diff), matching the other comparison tables in this notebook.
data = {'rank': top_urls_F03_rank[:50],'url':top_urls_F03_url[:50], 'value': top_urls_F03_count[:50], 'percentage': top_urls_F03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
df_top_urls_F03_S03_comparison = pd.DataFrame(data=data)
df_top_urls_F03_S03_comparison.set_index(keys='rank', inplace=True)

# Style once (a second, discarded Styler used to be built here) and display it.
s = getPrettyComparisonDataframe(df_top_urls_F03_S03_comparison, "Calendar Week 03: Filtered TLDs x Sampled TLDs ")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,348905,14.4748,= (1. / 15.781% / -1.306%)
2,youtube.com,233498,9.68695,= (2. / 8.091% / 1.596%)
3,facebook.com,65092,2.70042,= (3. / 2.480% / 0.220%)
4,instagram.com,39391,1.63418,= (4. / 1.803% / -0.169%)
5,google.com,36688,1.52205,v (9. / 0.838% / 0.684%)
6,welt.de,31993,1.32727,^ (5. / 1.707% / -0.379%)
7,spiegel.de,26804,1.112,^ (6. / 1.166% / -0.054%)
8,amazon.de,22972,0.953022,= (8. / 0.923% / 0.030%)
9,focus.de,22507,0.933731,^ (7. / 0.935% / -0.001%)
10,twitch.tv,20801,0.862955,v (11. / 0.771% / 0.092%)


### Calendar Week 17: Top Level Domains

In [23]:
# Occurrences of each `top_level_domain` value in the week-17 sampled / filtered data.
# NOTE: the `top_urls_S03` / `top_urls_F03` names are reused here although the
# data is week 17 and holds domain counts.
top_urls_S03 = tweetsS17.loc[:, 'top_level_domain'].value_counts()
top_urls_F03 = tweetsF17.loc[:, 'top_level_domain'].value_counts()

In [24]:
# Build the ranking table for the counts currently held in `top_urls_S03`
# (the name is reused across sections of this notebook).
# The parallel lists stay module-level because the comparison cells read them.
# Series.iteritems() no longer exists in current pandas, so the lists are
# derived directly from the Series instead of via an iteritems() loop.
top_urls_S03_sum = top_urls_S03.sum()
top_urls_S03_rank = list(range(1, len(top_urls_S03) + 1))  # position in the counts Series == rank
top_urls_S03_url = top_urls_S03.index.tolist()
top_urls_S03_count = top_urls_S03.tolist()
top_urls_S03_percentage = [(value / top_urls_S03_sum) * 100 for value in top_urls_S03_count]

data = {'rank': top_urls_S03_rank,'url':top_urls_S03_url, 'value': top_urls_S03_count, 'percentage': top_urls_S03_percentage}
df_top_urls_S03 = pd.DataFrame(data=data)
display(HTML(df_top_urls_S03[:10].to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,twitter.com,6192,17.357179
1,2,youtube.com,2961,8.300163
2,3,facebook.com,884,2.477995
3,4,instagram.com,805,2.256545
4,5,welt.de,534,1.496888
5,6,spiegel.de,430,1.20536
6,7,amazon.de,334,0.936256
7,8,twitch.tv,318,0.891406
8,9,focus.de,291,0.81572
9,10,wordpress.com,261,0.731625


In [25]:
# Denominator for the percentage column: the sum of all per-domain counts.
top_urls_F03_sum = top_urls_F03.sum()

# Build the column lists directly from the Series instead of looping over
# Series.iteritems(), which has been removed from current pandas releases.
# Ranks are the 1-based positions in top_urls_F03.
# The lists stay module-level because the comparison cells slice them with [:50].
top_urls_F03_rank = list(range(1, len(top_urls_F03) + 1))
top_urls_F03_url = list(top_urls_F03.index)
top_urls_F03_count = list(top_urls_F03)
top_urls_F03_percentage = [(count / top_urls_F03_sum) * 100 for count in top_urls_F03_count]

data = {'rank': top_urls_F03_rank, 'url': top_urls_F03_url, 'value': top_urls_F03_count, 'percentage': top_urls_F03_percentage}
df_top_urls_F03 = pd.DataFrame(data=data)

# Show only the first ten rows to keep the rendered output short.
display(HTML(df_top_urls_F03.head(10).to_html()))

Unnamed: 0,rank,url,value,percentage
0,1,twitter.com,339597,16.119779
1,2,youtube.com,205700,9.76404
2,3,facebook.com,57313,2.720498
3,4,instagram.com,41411,1.965672
4,5,amazon.de,26290,1.247917
5,6,spiegel.de,24238,1.150514
6,7,welt.de,22878,1.085959
7,8,twitch.tv,21044,0.998904
8,9,google.com,19676,0.933968
9,10,focus.de,15926,0.755965


In [26]:
# Compare each of the 50 highest-ranked sampled domains with its entry in the
# filtered ranking. Ranks are assigned 1..n in row order when df_top_urls_S03
# is built, so head(50) selects exactly the rows with rank <= 50 without
# walking the whole frame.
# compareRows requires the name of the column to match rows on; both frames
# key their domains under 'url'.
compare_list = [
    compareRows(row, df_top_urls_F03, 'url')
    for _, row in df_top_urls_S03.head(50).iterrows()
]

data = {'rank': top_urls_S03_rank[:50], 'url': top_urls_S03_url[:50], 'value': top_urls_S03_count[:50], 'percentage': top_urls_S03_percentage[:50],
        'difference (rank / percentage)': compare_list}
# Index by rank without mutating in place, so re-running the cell is idempotent.
df_top_urls_S03_F03_comparison = pd.DataFrame(data=data).set_index(keys='rank')

# Apply colorComparisonField to every cell, then attach the caption and the
# shared table styles; the final expression is what the cell renders.
s = df_top_urls_S03_F03_comparison.style.applymap(colorComparisonField)
s.set_caption("Calendar Week 17: Sampled TLDs x Filtered TLDs ")
s.set_table_styles(table_styles)

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,6192,17.3572,= (1. / 16.120% / 1.237%)
2,youtube.com,2961,8.30016,= (2. / 9.764% / -1.464%)
3,facebook.com,884,2.478,= (3. / 2.720% / -0.243%)
4,instagram.com,805,2.25655,= (4. / 1.966% / 0.291%)
5,welt.de,534,1.49689,v (7. / 1.086% / 0.411%)
6,spiegel.de,430,1.20536,= (6. / 1.151% / 0.055%)
7,amazon.de,334,0.936256,^ (5. / 1.248% / -0.312%)
8,twitch.tv,318,0.891406,= (8. / 0.999% / -0.107%)
9,focus.de,291,0.81572,v (10. / 0.756% / 0.060%)
10,wordpress.com,261,0.731625,v (12. / 0.650% / 0.081%)


In [27]:
# Compare each of the 50 highest-ranked filtered domains with its entry in the
# sampled ranking. Ranks are assigned 1..n in row order when df_top_urls_F03
# is built, so head(50) selects exactly the rows with rank <= 50 without
# walking the whole frame.
# compareRows requires the name of the column to match rows on; both frames
# key their domains under 'url'.
compare_list = [
    compareRows(row, df_top_urls_S03, 'url')
    for _, row in df_top_urls_F03.head(50).iterrows()
]

data = {'rank': top_urls_F03_rank[:50], 'url': top_urls_F03_url[:50], 'value': top_urls_F03_count[:50], 'percentage': top_urls_F03_percentage[:50],
        'difference (rank / percentage / diff)': compare_list}
# Index by rank without mutating in place, so re-running the cell is idempotent.
df_top_urls_F03_S03_comparison = pd.DataFrame(data=data).set_index(keys='rank')

# Apply colorComparisonField to every cell, then attach the caption and the
# shared table styles; the final expression is what the cell renders.
s = df_top_urls_F03_S03_comparison.style.applymap(colorComparisonField)
s.set_caption("Calendar Week 17: Filtered TLDs x Sampled TLDs ")
s.set_table_styles(table_styles)

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,twitter.com,339597,16.1198,= (1. / 17.357% / -1.237%)
2,youtube.com,205700,9.76404,= (2. / 8.300% / 1.464%)
3,facebook.com,57313,2.7205,= (3. / 2.478% / 0.243%)
4,instagram.com,41411,1.96567,= (4. / 2.257% / -0.291%)
5,amazon.de,26290,1.24792,v (7. / 0.936% / 0.312%)
6,spiegel.de,24238,1.15051,= (6. / 1.205% / -0.055%)
7,welt.de,22878,1.08596,^ (5. / 1.497% / -0.411%)
8,twitch.tv,21044,0.998904,= (8. / 0.891% / 0.107%)
9,google.com,19676,0.933968,v (15. / 0.577% / 0.357%)
10,focus.de,15926,0.755965,^ (9. / 0.816% / -0.060%)
