# Hashtag Data Comparison (Sampled vs Filtered)

In [4]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
pd.set_option('display.max_colwidth', -1)
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

directory = "url_top_lists/"
stream = "comparison"

TypeError: color_palette() got an unexpected keyword argument 'palette'

## Util Methods

In [2]:
def compareRows(row, df_to_compare, column_name):
    comparison_row = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if comparison_row.empty:
        comparison = " - "
    else:
        percentage_dif = row['percentage'] - comparison_row['percentage'].values[0]
        difference = "(%s. / %.3f%% / %.3f%%)" % (comparison_row['rank'].values[0], comparison_row['percentage'].values[0], percentage_dif)
        if comparison_row['rank'].values[0] == row['rank']:
            comparison = " = <br>" + difference
        else:
            if comparison_row['rank'].values[0] > row['rank']:
                comparison = " v <br>" + difference
            else:
                comparison = " ^ <br>" + difference
    return comparison

def getOpacity(val):
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", val.split("/")[2])[0]))
    if value < 0.005:
        return 1
    if value < 0.01:
        return 0.95
    if value < 0.05:
        return 0.8
    if value < 0.1:
        return 0.7
    if value < 0.5:
        return 0.6
    if value < 1:
        return 0.5
    if value < 10:
        return 0.3
    if value < 40:
        return 0.2
    if value < 80:
        return 0.1
    if value < 100:
        return 0.05

def colorComparisonField(val):
    
    if isinstance(val, str):
        if ' ^ ' in val or ' v ' in val:
            return 'background-color: rgba(246, 185, 59, %s)' %getOpacity(val)
        if ' = ' in val:
            return 'background-color: rgba(184, 233, 148, %s)' %getOpacity(val)
        if ' - ' in val and len(val) == 3:
            return 'background-color: #e55039' 
    return ''

def generateRankingDataframe(series, attribute_name):
    size = series.sum()
    rank = []
    parameter = []
    count = []
    percentage = []

    i = 1
    for index, value in series.iteritems():
        rank.append(i)
        parameter.append(index)
        count.append(value)
        percentage.append((value/size)*100)
        i += 1

    data = {'rank': rank, attribute_name: parameter, 'value': count, 'percentage': percentage}
    return pd.DataFrame(data=data)

def generateComparisonDataframes(df1, df2, column_name, size):
    compare_list = []
    for index, row in df1.iterrows():
        if row['rank'] <= size:
            compare_list.append(compareRows(row, df2, column_name))

    data = {'rank': df1['rank'][:size], column_name: df1[column_name][:size], 'value': df1['value'][:size], 'percentage': df1['percentage'][:size],
            'difference (rank / percentage / diff)': compare_list}
    
    df1_compared = pd.DataFrame(data=data)
    df1_compared.set_index(keys='rank', inplace=True)
    
    compare_list = []
    for index, row in df2.iterrows():
        if row['rank'] <= size:
            compare_list.append(compareRows(row, df1, column_name))

    data = {'rank': df2['rank'][:size], column_name: df2[column_name][:size], 'value': df2['value'][:size], 'percentage': df2['percentage'][:size],
            'difference (rank / percentage / diff)': compare_list}
    
    df2_compared = pd.DataFrame(data=data)
    df2_compared.set_index(keys='rank', inplace=True)
    
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    s = df.style.applymap(colorComparisonField)
    s.set_caption(title)
    s.set_table_styles(table_styles)
    return s

In [3]:
conn = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()
    
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)
    
    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()
    
    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')
    
    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')
    
    # display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()
    
    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

    # close the communication with the PostgreSQL
    curS17.close()
    curS03.close()
    curF17.close()
    curF03.close()

except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)


## Query URLs

### Calendar Week 03 - Sampled

In [4]:
tweetsS03 = pd.read_sql_query("SELECT * FROM tweets_hashtags;", connS03 )

print("Number of Tweets: %s" %len(tweetsS03))
tweetsS03.head()

Number of Tweets: 84203


Unnamed: 0,tweet_id,hashtag
0,952676770679058432,milf
1,952676770679058432,porno
2,952676770679058432,ass
3,952676770679058432,HDvideos
4,952676770679058432,SoloMale


empty icards possible

### Calendar Week 17 - Sampled

In [5]:
tweetsS17 = pd.read_sql_query("SELECT * FROM tweets_hashtags;", connS17 )

print("Number of Hashtags: %s" %len(tweetsS17))
tweetsS17.head()

Number of Hashtags: 73935


Unnamed: 0,tweet_id,hashtag
0,988175614908788736,wetter
1,988176244029251584,MTVBRPETNUGGET
2,988176244029251584,MTVBRSHADETAYLORKATY
3,988176244029251584,PremiosMTVMiaw
4,988176244029251584,MTVLAINSTAGLCAMILAC


### Calendar Week 03 - Filtered

In [6]:
tweetsF03 = pd.read_sql_query("SELECT * FROM tweets_hashtags;", connF03 )

print("Number of Hashtags: %s" %len(tweetsF03))
tweetsF03.head()

Number of Hashtags: 4377831


Unnamed: 0,tweet_id,hashtag
0,950617923604885504,feinesahnefischfilet
1,950617923604885504,SturmundDreck
2,952676701011668998,Laufrollenset
3,952676701011668998,Kretana
4,952676701011668998,Diagonaleinstieg


### Calendar Week 17 - Filtered

In [7]:
tweetsF17 = pd.read_sql_query("SELECT * FROM tweets_hashtags;", connF17 )

print("Number of Hashtags: %s" %len(tweetsF17))
tweetsF17.head()

Number of Hashtags: 3780197


Unnamed: 0,tweet_id,hashtag
0,988175612497063936,Mercedes
1,988175612497063936,Perkasie
2,988175612497063936,carsforsale
3,988175612945854465,reiseagentur
4,988175612945854465,cancelflights


## Single Hashtag

### Calendar Week 03 - Single Hashtags - Sampled x Filtered

In [9]:
attribute = 'hashtag'
column_name = 'hashtag'
    
df_sampled = generateRankingDataframe(tweetsS03[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(tweetsF03[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [10]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Hashtags - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hashtag,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,iHeartAwards,1995,2.36927,v (11. / 0.423% / 1.946%)
2,SPD,1054,1.25174,^ (1. / 1.338% / -0.087%)
3,spdbpt18,1049,1.2458,^ (2. / 1.188% / 0.058%)
4,AfD,876,1.04034,= (4. / 0.934% / 0.106%)
5,BestFanArmy,791,0.939396,v (54. / 0.123% / 0.816%)
6,GroKo,787,0.934646,^ (5. / 0.893% / 0.041%)
7,Friederike,703,0.834887,^ (6. / 0.877% / -0.042%)
8,BestBoyBand,666,0.790946,v (25. / 0.260% / 0.531%)
9,ibes,636,0.755318,^ (3. / 1.137% / -0.382%)
10,NoGroKo,450,0.534423,^ (8. / 0.495% / 0.040%)


In [12]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Hashtags - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hashtag,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,SPD,58593,1.3384,v (2. / 1.252% / 0.087%)
2,spdbpt18,52001,1.18783,v (3. / 1.246% / -0.058%)
3,ibes,49784,1.13718,v (9. / 0.755% / 0.382%)
4,AfD,40883,0.933864,= (4. / 1.040% / -0.106%)
5,GroKo,39106,0.893273,v (6. / 0.935% / -0.041%)
6,Friederike,38381,0.876713,v (7. / 0.835% / 0.042%)
7,IBES,26308,0.600937,v (15. / 0.422% / 0.179%)
8,NoGroKo,21652,0.494583,v (10. / 0.534% / -0.040%)
9,Merkel,21530,0.491796,v (12. / 0.474% / 0.018%)
10,Berlin,19400,0.443142,v (13. / 0.444% / -0.001%)


### Calendar Week 17 - Single Hashtags - Sampled x Filtered

In [13]:
attribute = 'hashtag'
column_name = 'hashtag'
    
df_sampled = generateRankingDataframe(tweetsS17[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(tweetsF17[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [14]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Hashtags - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hashtag,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,PremiosMTVMiaw,889,1.20241,v (25. / 0.224% / 0.979%)
2,AfD,795,1.07527,^ (1. / 0.961% / 0.114%)
3,FCBRMA,400,0.541016,^ (2. / 0.652% / -0.111%)
4,Berlin,336,0.454453,^ (3. / 0.490% / -0.035%)
5,Merkel,332,0.449043,v (7. / 0.450% / -0.001%)
6,ReconquistaInternet,320,0.432813,= (6. / 0.453% / -0.020%)
7,Kreuz,299,0.404409,^ (4. / 0.485% / -0.080%)
8,MTVLAHITGLNOTSORRY,289,0.390884,v (106087. / 0.000% / 0.391%)
9,MTVLAINSTAGLCAMILAC,286,0.386826,v (48988. / 0.000% / 0.387%)
10,MTVBRSHADETAYLORKATY,281,0.380064,-


In [15]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Hashtags - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hashtag,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AfD,36328,0.961008,v (2. / 1.075% / -0.114%)
2,FCBRMA,24660,0.652347,v (3. / 0.541% / 0.111%)
3,Berlin,18507,0.489578,v (4. / 0.454% / 0.035%)
4,Kreuz,18323,0.48471,v (7. / 0.404% / 0.080%)
5,CSU,17257,0.456511,v (12. / 0.377% / 0.079%)
6,ReconquistaInternet,17127,0.453072,= (6. / 0.433% / 0.020%)
7,Merkel,17018,0.450188,^ (5. / 0.449% / 0.001%)
8,Amazon,14668,0.388022,v (17. / 0.261% / 0.127%)
9,Germany,13241,0.350273,v (28. / 0.210% / 0.141%)
10,Bayern,13067,0.34567,v (15. / 0.280% / 0.066%)


In [9]:
def isUniqueGroupInList(_group, _list):
    for group in _list:
        if len(group) == len(_group):
            is_same_group = True
            for item in _group:
                if item not in group:
                    is_same_group = False
            if is_same_group:
                return False
    return True

def countGroupOccurrence(_group, list_of_groups):
    count = 1
    for group in list_of_groups:
        if len(group) == len(_group):
            is_same_group = True
            for item in _group:
                if item not in group:
                    is_same_group = False
            if is_same_group:
                count += 1
    return count

def getGroupsRanking(df, attribute_name, id_column):
    groups_by_id = {}
    for id_ in df[id_column].unique():
        groups_by_id[id_] = [id_name for id_name in df[attribute_name][df[id_column] == id_]]
        
    unique_groups_list = []
    
    for entry_id in groups_by_id:
        if isUniqueGroupInList(groups_by_id[entry_id], unique_groups_list):
            unique_groups_list.append(groups_by_id[entry_id])

    count_list = []
    
    for unique_group in unique_groups_list:
        count_list.extend(countGroupOccurrence(unique_group, list(groups_by_id.values())) * [" ".join(unique_group)])

    data = {attribute_name: count_list}
    df = pd.DataFrame(data=data)
    return df

def getSimpleGroupsRanking(df, attribute_name, id_column):
    groups = []
    for id_ in df[id_column].unique():
        groups.append(" ".join(list(df[attribute_name][df[id_column] == id_].values)))
    
    data = {attribute_name: groups}
    df = pd.DataFrame(data=data)
    return df


## Hashtag Groups - permutation of hastags detected

This evaluation is based on an algorithm that counts Hashtag Groups that have the same elements but not necessarily the same permutation. For Example: Group A ("live gig awards") and Group B ("gig awards live") are counted as one Group.

### Calendar Week 03 - Hashtag Groups - Sampled x Filtered

In [None]:
attribute = 'hashtag'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(getGroupsRanking(tweetsS03, attribute, id_name)[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(getGroupsRanking(tweetsF03, attribute, id_name)[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [93]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Hashtag Groups - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,ibes,522,0.935065,= (1. / 0.935% / 0.000%)
2,spdbpt18,468,0.838334,= (2. / 0.838% / 0.000%)
3,Friederike,345,0.618003,= (3. / 0.618% / 0.000%)
4,iHeartAwards BestFanArmy BTSARMY,328,0.58755,= (4. / 0.588% / 0.000%)
5,iHeartAwards BestBoyBand BTS,301,0.539185,= (5. / 0.539% / 0.000%)
6,IBES,265,0.474698,= (6. / 0.475% / 0.000%)
7,CNCO BestBoyBand iHeartAwards,231,0.413793,= (7. / 0.414% / 0.000%)
8,AfD,216,0.386923,= (8. / 0.387% / 0.000%)
9,EXOL BestFanArmy iHeartAwards,197,0.352888,= (9. / 0.353% / 0.000%)
10,SPD,180,0.322436,= (10. / 0.322% / 0.000%)


In [94]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - URLs - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,ibes,522,0.935065,= (1. / 0.935% / 0.000%)
2,spdbpt18,468,0.838334,= (2. / 0.838% / 0.000%)
3,Friederike,345,0.618003,= (3. / 0.618% / 0.000%)
4,iHeartAwards BestFanArmy BTSARMY,328,0.58755,= (4. / 0.588% / 0.000%)
5,iHeartAwards BestBoyBand BTS,301,0.539185,= (5. / 0.539% / 0.000%)
6,IBES,265,0.474698,= (6. / 0.475% / 0.000%)
7,CNCO BestBoyBand iHeartAwards,231,0.413793,= (7. / 0.414% / 0.000%)
8,AfD,216,0.386923,= (8. / 0.387% / 0.000%)
9,EXOL BestFanArmy iHeartAwards,197,0.352888,= (9. / 0.353% / 0.000%)
10,SPD,180,0.322436,= (10. / 0.322% / 0.000%)


### Calendar Week 17 - Hashtag Groups - Sampled x Filtered

In [68]:
attribute = 'hashtag'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(getGroupsRanking(tweetsS17, attribute, id_name)[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(getGroupsRanking(tweetsF17, attribute, id_name)[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [69]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Hashtag Groups - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,56,0.305527,= (1. / 0.419% / -0.114%)
2,http://www.achgut.com/artikel/die_enthauptung_der_hamburger_justiz,44,0.240057,v (4. / 0.228% / 0.012%)
3,https://www.youtube.com/watch?v=F-eMt3SrfFU&feature=youtu.be,34,0.185498,v (1003. / 0.007% / 0.179%)
4,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,33,0.180043,v (7. / 0.163% / 0.017%)
5,https://www.focus.de/politik/deutschland/lage-zu-bedrohlich-demonstration-gegen-antisemitismus-in-berlin-nach-15-minuten-abgebrochen_id_8829384.html,29,0.158219,v (13. / 0.104% / 0.055%)
6,http://www.the-sz.com/products/vbbinfo/?f=3,27,0.147308,^ (3. / 0.231% / -0.084%)
7,http://www.linkedin.com/,24,0.13094,^ (6. / 0.164% / -0.034%)
8,http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/,22,0.120028,v (24112. / 0.001% / 0.120%)
9,https://www.youtube.com/watch?v=yiWCGP4iKf4,21,0.114573,v (102656. / 0.000% / 0.114%)
10,http://www.spiegel.de/spiegel/unispiegel/fh-dortmund-bietet-neuen-studiengang-fluechtlingshilfe-an-a-1196053.html,21,0.114573,v (14. / 0.094% / 0.020%)


In [70]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Hashtag Groups - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US,4964,0.419488,= (1. / 0.306% / 0.114%)
2,https://sec.help.ch,2978,0.251659,-
3,http://www.the-sz.com/products/vbbinfo/?f=3,2739,0.231462,v (6. / 0.147% / 0.084%)
4,http://www.achgut.com/artikel/die_enthauptung_der_hamburger_justiz,2697,0.227913,^ (2. / 0.240% / -0.012%)
5,https://www.radionomy.com/en/radio/erika1,2172,0.183547,v (12. / 0.109% / 0.074%)
6,http://www.linkedin.com/,1946,0.164449,v (7. / 0.131% / 0.034%)
7,http://www.deutschlandfunk.de/dlf24-startseite.1441.de.html,1932,0.163266,^ (4. / 0.180% / -0.017%)
8,http://www.radio-jodlerwirt.de,1785,0.150843,v (21. / 0.082% / 0.069%)
9,https://www.welt.de/debatte/kommentare/article175695478/Straftaten-Statistik-Die-Wirklichkeit-hinter-den-neuen-Zahlen-zur-Kriminalitaet.html,1510,0.127604,v (16. / 0.098% / 0.029%)
10,https://www.youtube.com/watch?v=6bTQkwftlf0&feature=youtu.be&a,1381,0.116703,v (61. / 0.055% / 0.062%)


## Simple Hashtag Groups

This evaluation is based on a simple algorithm that counts Hashtag Groups that have the same elements in the same specific order. For Example: Group A ("live gig awards") and Group B ("live gig awards") are counted as one Group, but Group C ("gig live awards") is not in the same group.

### Calendar Week 03 - Simple Hashtag Groups - Sampled x Filtered

In [None]:
attribute = 'hashtag'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(getSimpleGroupsRanking(tweetsS03, attribute, id_name)[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(getSimpleGroupsRanking(tweetsF03, attribute, id_name)[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [73]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Simple Hashtag Groups - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hash,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0000000000000000,1244,11.0333,= (1. / 12.336% / -1.302%)
2,96a4a29696a6b6b6,55,0.487805,v (6. / 0.322% / 0.165%)
3,734545135755654d,52,0.461197,^ (2. / 0.722% / -0.260%)
4,0d2f67a6b3731393,48,0.425721,-
5,0c8e136b0b3fc661,41,0.363636,^ (3. / 0.431% / -0.068%)
6,1004312ab2320c20,39,0.345898,v (8. / 0.224% / 0.122%)
7,3323332b23070fb3,34,0.301552,^ (4. / 0.390% / -0.088%)
8,254d8c0f0f0e4d5a,29,0.257206,v (9. / 0.222% / 0.035%)
9,26c9c9e4147a6916,28,0.248337,^ (5. / 0.363% / -0.115%)
10,2250526545642244,26,0.230599,-


In [74]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Simple Hashtag Groups - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hash,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0000000000000000,78499,12.3357,= (1. / 11.033% / 1.302%)
2,734545135755654d,4592,0.721611,v (3. / 0.461% / 0.260%)
3,0c8e136b0b3fc661,2745,0.431364,v (5. / 0.364% / 0.068%)
4,3323332b23070fb3,2482,0.390034,v (7. / 0.302% / 0.088%)
5,26c9c9e4147a6916,2309,0.362848,v (9. / 0.248% / 0.115%)
6,96a4a29696a6b6b6,2052,0.322462,^ (2. / 0.488% / -0.165%)
7,46072764f0e4ec7b,1458,0.229118,v (12. / 0.204% / 0.025%)
8,1004312ab2320c20,1423,0.223618,^ (6. / 0.346% / -0.122%)
9,254d8c0f0f0e4d5a,1414,0.222203,^ (8. / 0.257% / -0.035%)
10,7e3d5927a676662d,1250,0.196432,v (16. / 0.169% / 0.028%)


### Calendar Week 17 - Simple Hashtag Groups - Sampled x Filtered

In [83]:
attribute = 'hashtag'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(getSimpleGroupsRanking(tweetsS17, attribute, id_name)[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(getSimpleGroupsRanking(tweetsF17, attribute, id_name)[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 25)

In [84]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Simple Hashtag Groups - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hash,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0000000000000000,602,3.79955,= (1. / 7.337% / -3.537%)
2,734545135755654d,63,0.397627,= (2. / 0.630% / -0.233%)
3,26c9c9e4147a6916,44,0.277708,= (3. / 0.475% / -0.197%)
4,72e0c9c360614adb,44,0.277708,v (6. / 0.298% / -0.020%)
5,78b2b64418292b73,43,0.271396,v (23. / 0.135% / 0.137%)
6,e06cefceaeea7b06,40,0.252461,v (1038. / 0.009% / 0.244%)
7,434b486474f8b6fe,34,0.214592,v (8. / 0.253% / -0.038%)
8,f453696d696b4727,31,0.195658,v (25. / 0.133% / 0.063%)
9,254d8c0f0f0e4d5a,30,0.189346,^ (7. / 0.270% / -0.081%)
10,6b63b2a267f3b371,29,0.183035,v (22. / 0.138% / 0.045%)


In [86]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Simple Hashtag Groups - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hash,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0000000000000000,55873,7.337,= (1. / 3.800% / 3.537%)
2,734545135755654d,4799,0.630184,= (2. / 0.398% / 0.233%)
3,26c9c9e4147a6916,3616,0.474837,= (3. / 0.278% / 0.197%)
4,0c13212531232704,3384,0.444372,v (803. / 0.019% / 0.425%)
5,3323332b23070fb3,2313,0.303733,v (13. / 0.170% / 0.133%)
6,72e0c9c360614adb,2268,0.297824,^ (4. / 0.278% / 0.020%)
7,254d8c0f0f0e4d5a,2059,0.270379,v (9. / 0.189% / 0.081%)
8,434b486474f8b6fe,1923,0.25252,^ (7. / 0.215% / 0.038%)
9,0c8e136b0b3fc661,1743,0.228883,v (17. / 0.151% / 0.077%)
10,46072764f0e4ec7b,1665,0.218641,v (40. / 0.095% / 0.124%)


## Calendar Week 03 - Sampled

In [20]:
tweetsS03 = pd.read_sql_query("SELECT id, hashtags FROM tweets_info;", connS03 )

print("Number of Tweets: %s" %len(tweetsS03))
tweetsS03.head()

Number of Tweets: 123680


Unnamed: 0,id,hashtags
0,952676732913401856,
1,952676745496354816,
2,952676758066737158,
3,952676758066728960,
4,952676770679058432,milf porno ass HDvideos SoloMale HDвидео


### Calendar Week 17 - Sampled

In [21]:
tweetsS17 = pd.read_sql_query("SELECT id,hashtags FROM tweets_info;", connS17 )

print("Number of Hashtags: %s" %len(tweetsS17))
tweetsS17.head()

Number of Hashtags: 112003


Unnamed: 0,id,hashtags
0,988175614908788736,wetter
1,988175619098861568,
2,988175627453952002,
3,988175627479146498,
4,988175681996681218,


### Calendar Week 03 - Filtered

In [22]:
tweetsF03 = pd.read_sql_query("SELECT id, hashtags FROM tweets_info;;", connF03 )

print("Number of Hashtags: %s" %len(tweetsF03))
tweetsF03.head()

Number of Hashtags: 8010674


Unnamed: 0,id,hashtags
0,952676700692930561,
1,952571491597586433,
2,952676700873256961,
3,952676701011668998,Laufrollenset Kretana Diagonaleinstieg
4,952676701485625346,


### Calendar Week 17 - Filtered

In [23]:
tweetsF17 = pd.read_sql_query("SELECT id,hashtags FROM tweets_info;", connF17 )

print("Number of Hashtags: %s" %len(tweetsF17))
tweetsF17.head()

Number of Hashtags: 7269347


Unnamed: 0,id,hashtags
0,989887937385988096,
1,989887940137435137,
2,989887940825419776,
3,989887941420908544,
4,989887941811064832,


### Calendar Week 03 - Hashtag Groups from Tweets Table - Sampled x Filtered

In [14]:
attribute = 'hashtags'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(tweetsS03[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(tweetsF03[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [15]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Hashtag Groups from Tweets Table - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,ibes,518,1.40357,= (1. / 1.988% / -0.584%)
2,spdbpt18,466,1.26267,= (2. / 1.205% / 0.058%)
3,Friederike,344,0.932098,= (3. / 0.983% / -0.051%)
4,IBES,264,0.715331,= (4. / 0.978% / -0.263%)
5,CNCO BestBoyBand iHeartAwards,223,0.604238,v (143. / 0.048% / 0.557%)
6,iHeartAwards BestFanArmy BTSARMY,218,0.59069,v (74. / 0.074% / 0.517%)
7,AfD,213,0.577142,^ (5. / 0.721% / -0.144%)
8,SPD,176,0.476887,^ (6. / 0.581% / -0.104%)
9,iHeartAwards BestBoyBand BTS,153,0.414567,v (59. / 0.087% / 0.327%)
10,Maischberger,140,0.379342,^ (9. / 0.365% / 0.014%)


In [16]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Simple Hashtag Groups - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,ibes,41783,1.98778,= (1. / 1.404% / 0.584%)
2,spdbpt18,25325,1.20481,= (2. / 1.263% / -0.058%)
3,Friederike,20662,0.98297,= (3. / 0.932% / 0.051%)
4,IBES,20566,0.978403,= (4. / 0.715% / 0.263%)
5,AfD,15148,0.720648,v (7. / 0.577% / 0.144%)
6,SPD,12210,0.580876,v (8. / 0.477% / 0.104%)
7,jobs,12135,0.577308,v (13. / 0.314% / 0.263%)
8,GroKo,9696,0.461276,v (11. / 0.355% / 0.106%)
9,Maischberger,7671,0.364939,v (10. / 0.379% / -0.014%)
10,Bachelor,7625,0.36275,v (19. / 0.271% / 0.092%)


### Calendar Week 17 - Hashtag Groups from Tweets Table - Sampled x Filtered

In [17]:
attribute = 'hashtags'
column_name = 'hashtags'
id_name = 'tweet_id'
    
df_sampled = generateRankingDataframe(tweetsS17[attribute].value_counts(), column_name)
df_filtered = generateRankingDataframe(tweetsF17[attribute].value_counts(), column_name)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(df_sampled, df_filtered, column_name, 50)

In [18]:
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Hashtag Groups from Tweets Table - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,MTVBRPETNUGGET MTVBRSHADETAYLORKATY PremiosMTVMiaw MTVLAINSTAGLCAMILAC,281,0.910004,-
2,ReconquistaInternet,207,0.670358,= (2. / 0.628% / 0.042%)
3,AfD,190,0.615305,^ (1. / 0.661% / -0.046%)
4,FCBRMA,134,0.433952,^ (3. / 0.574% / -0.140%)
5,RealMadrid Emirates APorLa13 HalaMadrid,109,0.352991,-
6,MTVLAINSTAGLSELENA MTVLAFTWOLVES MTVBRHITBADLIAR MTVBRSHADEMAEJELENA MTVBRFANDOMSELENATORS MTVLAHITGLNOTSORRY PremiosMTVMiaw,106,0.343275,-
7,Germany Amazon Deal international ad,96,0.310891,^ (4. / 0.503% / -0.192%)
8,Merkel,82,0.265553,v (10. / 0.271% / -0.006%)
9,Kreuz,79,0.255837,v (11. / 0.268% / -0.013%)
10,PREMIOSMTVMIAW MTVLAHITGLNOTSORRY MTVLAHITCULPA MTVLAINSTAGLDEMI MTVBRPETBATMAN MTVLACHINGONSOFIA MTVLADUROSHAKIRA MTVLAINSTACOJBALVIN MTVLACRUSHSOFIA MTVLAINSTAARVICICONTE MTVLAVIDEOMACHIKA,77,0.24936,-


In [19]:
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Hashtag Groups from Tweets Table - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,hashtags,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AfD,11611,0.661339,v (3. / 0.615% / 0.046%)
2,ReconquistaInternet,11033,0.628417,= (2. / 0.670% / -0.042%)
3,FCBRMA,10082,0.57425,v (4. / 0.434% / 0.140%)
4,Germany Amazon Deal international ad,8831,0.502996,v (7. / 0.311% / 0.192%)
5,147sf,7169,0.408332,v (16. / 0.214% / 0.195%)
6,FFD365 Aktuell,6145,0.350007,v (15. / 0.223% / 0.127%)
7,Antisemitismus,5805,0.330641,v (12. / 0.236% / 0.094%)
8,ParadiseIsland2 GameInsight,5554,0.316345,v (24. / 0.165% / 0.151%)
9,GameInsight ParadiseIsland2,5330,0.303586,v (17. / 0.201% / 0.103%)
10,Merkel,4760,0.27112,^ (8. / 0.266% / 0.006%)
