# User Profile Data Comparison (Sampled vs Filtered)

In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


# Plot styling: seaborn 'muted' palette desaturated to 75%, applied on a white grid.
color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)

# Never truncate cell contents when rendering frames.
# `None` is the "no limit" value since pandas 1.0 and -1 is rejected by pandas 2.x;
# pandas < 1.0 only accepts integers, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# CSS rules handed to Styler.set_table_styles for every comparison table.
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): `directory` and `stream` are not referenced by any cell visible here — confirm they are still needed.
directory = "url_top_lists/"
stream = "comparison"

## Util Methods

In [2]:
def compareRows(row, df_to_compare, column_name):
    """Build the comparison cell text for `row` against another ranking frame.

    Parameters
    ----------
    row : mapping with 'rank', 'percentage' and `column_name` entries.
    df_to_compare : DataFrame with 'rank', 'percentage' and `column_name` columns.
    column_name : column used to find the matching entry in `df_to_compare`.

    Returns
    -------
    str
        " - " when `df_to_compare` has no entry with the same `column_name` value.
        Otherwise a marker followed by "(other rank. / other percentage / difference)",
        where difference is row percentage minus other percentage. The marker is
        " = " for equal ranks, " v " when the other rank number is larger than
        the row's, and " ^ " otherwise. Only the first match is used.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    other_rank = matches['rank'].values[0]
    other_percentage = matches['percentage'].values[0]
    details = "(%s. / %.3f%% / %.3f%%)" % (
        other_rank,
        other_percentage,
        row['percentage'] - other_percentage,
    )

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + details

def getOpacity(val):
    """Map the percentage difference embedded in a comparison string to an opacity.

    `val` is expected in the form produced by compareRows, i.e.
    "<marker>(rank. / percentage% / diff%)". The third "/"-separated field is
    parsed for its first number and the absolute value of that number selects
    the opacity: the smaller the difference, the more opaque the result.

    Returns 1 for differences below 0.005, stepping down to 0.05. Differences of
    100 or more also yield 0.05 (previously the function fell through and
    returned None, which rendered as invalid CSS).

    Raises IndexError when `val` has fewer than three "/"-separated fields or
    the third field contains no number.
    """
    # (exclusive upper bound of the absolute difference, opacity)
    steps = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
        (100, 0.05),
    )
    diff_field = val.split("/")[2]
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", diff_field)[0]))
    for upper_bound, opacity in steps:
        if value < upper_bound:
            return opacity
    # Anything at or beyond the last bound gets the faintest shade.
    return 0.05

def colorComparisonField(val):
    """Return the CSS background declaration for one table cell.

    Non-string cells and strings without a comparison marker get '' (no style).
    Cells containing ' ^ ' or ' v ' get an orange background, cells containing
    ' = ' a green one; in both cases the alpha channel comes from getOpacity.
    A cell that is exactly ' - ' (no counterpart found) gets a solid red background.
    """
    if not isinstance(val, str):
        return ''

    if ' ^ ' in val or ' v ' in val:
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)
    if ' = ' in val:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)
    # A three-character string containing ' - ' can only be ' - ' itself.
    if val == ' - ':
        return 'background-color: #e55039'
    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a counts Series into a ranking DataFrame.

    Parameters
    ----------
    series : pandas Series of counts indexed by label. Ranks are assigned in
        the Series' existing order, so it should already be sorted (the
        notebook passes `value_counts()` results).
    attribute_name : name of the output column that holds the labels.

    Returns
    -------
    DataFrame with columns 'rank' (1-based position), `attribute_name` (label),
    'value' (count) and 'percentage' (count as a percentage of the Series sum).
    """
    total = series.sum()
    rank = []
    parameter = []
    count = []
    percentage = []

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for position, (label, value) in enumerate(series.items(), start=1):
        rank.append(position)
        parameter.append(label)
        count.append(value)
        percentage.append((value / total) * 100)

    data = {'rank': rank, attribute_name: parameter, 'value': count, 'percentage': percentage}
    return pd.DataFrame(data=data)

def generateComparisonDataframes(df1, df2, column_name, size):
    """Cross-compare the top `size` entries of two ranking frames.

    Both inputs need 'rank', 'value', 'percentage' and `column_name` columns.
    For each frame, the rows with rank <= `size` are annotated (via compareRows)
    with how the same entry fares in the other frame.

    Returns
    -------
    (df1_compared, df2_compared) : two DataFrames indexed by 'rank' with the
    columns `column_name`, 'value', 'percentage' and
    'difference (rank / percentage / diff)'.
    """
    def annotate(primary, counterpart):
        # One comparison string per top-`size` row of `primary`.
        differences = [
            compareRows(entry, counterpart, column_name)
            for _, entry in primary.iterrows()
            if entry['rank'] <= size
        ]
        table = pd.DataFrame(data={
            'rank': primary['rank'][:size],
            column_name: primary[column_name][:size],
            'value': primary['value'][:size],
            'percentage': primary['percentage'][:size],
            'difference (rank / percentage / diff)': differences,
        })
        return table.set_index(keys='rank')

    df1_compared = annotate(df1, df2)
    df2_compared = annotate(df2, df1)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler for `df` with comparison colouring, a caption and table CSS.

    Every cell is passed through colorComparisonField; `title` becomes the
    caption and the notebook-level `table_styles` are applied.
    """
    styled = df.style.applymap(colorComparisonField)
    return styled.set_caption(title).set_table_styles(table_styles)

In [3]:
# Open one PostgreSQL connection per dataset (S17, S03, F17, F03), print each
# server's version as a smoke test, and leave the connections open: the query
# cells below read through connS03 / connS17 / connF03 / connF17.
conn = None  # not used by the cells visible here; kept so the name stays defined
curS17 = curS03 = curF17 = curF03 = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()

    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)

    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()

    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')

    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')

    # display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()

    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

except Exception as error:
    # psycopg2.DatabaseError is an Exception subclass, so this catches the same
    # errors as before. Best-effort: report and continue, as the original did.
    print(error)
finally:
    # close the communication with the PostgreSQL, also when a statement above
    # failed (previously the cursors were only closed on the success path)
    for cursor in (curS17, curS03, curF17, curF03):
        if cursor is not None:
            cursor.close()

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)


## Query Users

### Calendar Week 03 - Sampled

In [5]:
# Load the profile columns of every row in tweets_users from the CW 03 sampled database.
tweetsS03 = pd.read_sql_query(
    "SELECT id, name, location, url, description FROM tweets_users;",
    connS03,
)

print("Number of Tweets: %s" % len(tweetsS03))
tweetsS03.head()

Number of Tweets: 73706


Unnamed: 0,id,name,location,url,description
0,1121790578,gontavichi_duck,Saitama.Pref Japan,,A duck automatically murmurs greetings in Greenwich Mean Time.
1,824737881763745792,Glenn Robertson,,,
2,16947788,Wermelskirchen,RP Online Wermelskirchen,http://www.rp-online.de/wermelskirchen,Nachrichten und Fotos aus der Stadt Wermelskirchen - direkt von RP ONLINE. Fragen bitte an @rponline. Alle Accounts: http://www.rp-online.de/twitter
3,950898579735851008,Iris Mcdougal,,,
4,900374406190518272,Mila porn video,"New York, USA",,"I love sex))) Erotic movies, beautiful pornography"


Note: profiles with empty fields (e.g. location, url, description) are possible.

### Calendar Week 17 - Sampled

In [6]:
# Load the profile columns of every row in tweets_users from the CW 17 sampled database.
tweetsS17 = pd.read_sql_query(
    "SELECT id, name, location, url, description FROM tweets_users;",
    connS17,
)

print("Number of Tweets: %s" % len(tweetsS17))
tweetsS17.head()

Number of Tweets: 69421


Unnamed: 0,id,name,location,url,description
0,1647758059,WX Wetterstation,Johanngeorgenstadt,http://www.wetterstation-johanngeorgenstadt.de,
1,821834588158853120,Markus Brode,,,
2,415351831,fremdsprachelernen24,Brandenburg,http://www.fremdsprachelernen24.de,"Lernen Sie Sprachen wesentlich schneller als mit herkömmlichen Lernmethoden. Leicht zu bedienende Sprachkurse mit einem klaren, strukturierten Aufbau."
3,4544548995,💸,porsche,,jsuis drôle c le + important
4,2319972582,murcy ☾,"Berlin, Deutschland",https://youtu.be/nuhLiIGUJ1A,Meine Biografie könnt ihr irgendwann bei Wikipedia nachlesen.


### Calendar Week 03 - Filtered

In [7]:
# Load the profile columns of every row in tweets_users from the CW 03 filtered database.
tweetsF03 = pd.read_sql_query(
    "SELECT id, name, location, url, description FROM tweets_users;",
    connF03,
)

print("Number of Tweets: %s" % len(tweetsF03))
tweetsF03.head()

Number of Tweets: 892802


Unnamed: 0,id,name,location,url,description
0,952842353806635008,Hayley Rashdi,India,,
1,945822553418944519,Joana Oneill,,,
2,561742581,HEDGEaccordingly📈,,https://HedgeAccordingly.com,"Business, #Wallst, #Cryptocurrencies and Political news. The #BitcoinBreakdown"
3,966820562,Lucie,"Stuttgart, Germany",http://youtube.com/user/SingingPixels,"20 | Gamer, singer, artist and Youtuber. Addicted to video games and Netflix addict. Interested in my art? Dm me!"
4,106382065,Cathy Barry,"Kildare, Ireland",,"Personal twitter (philosophy, politics, ethics, Irish philosophy). Too much Brexit. Die-hard Whovian."


### Calendar Week 17 - Filtered

In [8]:
# Load the profile columns of every row in tweets_users from the CW 17 filtered database.
tweetsF17 = pd.read_sql_query(
    "SELECT id, name, location, url, description FROM tweets_users;",
    connF17,
)

print("Number of Tweets: %s" % len(tweetsF17))
tweetsF17.head()

Number of Tweets: 862487


Unnamed: 0,id,name,location,url,description
0,825772033992699906,LISA😍,❤,,"Directioner, 5SOSFam, MendesArmy, K-poper & many more fandom's ❤\n\nLarry, Taekook & LuWoo ❤"
1,730136446904995840,Scholarly Harridan,United Kingdom,,"Mostly Irish, vaguely Christian, fiercely secular. Queer and female, so obviously #ISupportIsrael.\n\n#ایرانآزاد"
2,71803722,Domenica Marchetti,,http://www.domenicacooks.com,"Author of Italian cookbooks. Latest, Preserving Italy (HMH). Culinary tours in Abruzzo, Italy. Tennis lover. I believe in a free press."
3,1469112342,lernTierpsychologie,,http://www.lernTierpsychologie.ch,lernTierpsychologie.ch --- Tiere verstehen lernen
4,334604146,なしょこ,東京,,舞台好き。四季にはじまり最近はドイツ語圏ミュージカルも。観た舞台や読んだものの感想など。ドイツ語勉強中。


## User Locations

### Calendar Week 03 - User Locations - Sampled x Filtered

In [9]:
# Rank the location values of both CW 03 datasets and cross-compare the top 25.
attribute = column_name = 'location'

df_sampled = generateRankingDataframe(
    series=tweetsS03[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF03[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [11]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 03 - User Locations - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,location,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deutschland,1984,3.93541,= (1. / 3.423% / 0.513%)
2,Berlin,1251,2.48145,v (3. / 1.785% / 0.697%)
3,Germany,1141,2.26326,^ (2. / 2.195% / 0.068%)
4,"Berlin, Deutschland",1131,2.24342,= (4. / 1.692% / 0.551%)
5,Hamburg,458,0.908478,v (6. / 0.715% / 0.193%)
6,"Hamburg, Deutschland",438,0.868806,^ (5. / 0.824% / 0.045%)
7,"Wien, Österreich",323,0.640695,= (7. / 0.603% / 0.038%)
8,München,310,0.614909,v (12. / 0.466% / 0.149%)
9,"Köln, Deutschland",310,0.614909,v (10. / 0.517% / 0.098%)
10,"München, Bayern",301,0.597056,^ (8. / 0.574% / 0.023%)


In [12]:
# Styled ranking of the filtered data, annotated against the sampled data.
s = getPrettyComparisonDataframe(
    df=df_filtered_comparison,
    title="CW 03 - User Locations - Filtered Data (compared to Sampled Data)",
)
s

Unnamed: 0_level_0,location,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deutschland,18924,3.42251,= (1. / 3.935% / -0.513%)
2,Germany,12137,2.19504,v (3. / 2.263% / -0.068%)
3,Berlin,9868,1.78468,^ (2. / 2.481% / -0.697%)
4,"Berlin, Deutschland",9358,1.69244,= (4. / 2.243% / -0.551%)
5,"Hamburg, Deutschland",4556,0.823977,v (6. / 0.869% / -0.045%)
6,Hamburg,3954,0.715102,^ (5. / 0.908% / -0.193%)
7,"Wien, Österreich",3335,0.603153,= (7. / 0.641% / -0.038%)
8,"München, Bayern",3174,0.574035,v (10. / 0.597% / -0.023%)
9,Österreich,2994,0.541481,v (12. / 0.538% / 0.004%)
10,"Köln, Deutschland",2860,0.517246,^ (9. / 0.615% / -0.098%)


### Calendar Week 17 - User Locations - Sampled x Filtered

In [13]:
# Rank the location values of both CW 17 datasets and cross-compare the top 25.
attribute = column_name = 'location'

df_sampled = generateRankingDataframe(
    series=tweetsS17[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF17[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [14]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 17 - User Locations - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,location,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deutschland,1891,3.94657,= (1. / 3.346% / 0.600%)
2,Berlin,1199,2.50235,v (4. / 1.698% / 0.804%)
3,"Berlin, Deutschland",1088,2.27069,= (3. / 1.765% / 0.506%)
4,Germany,1034,2.15799,^ (2. / 2.139% / 0.019%)
5,"Hamburg, Deutschland",496,1.03517,= (5. / 0.865% / 0.170%)
6,Hamburg,424,0.8849,= (6. / 0.682% / 0.203%)
7,"Köln, Deutschland",318,0.663675,v (9. / 0.528% / 0.136%)
8,"Wien, Österreich",313,0.65324,= (8. / 0.565% / 0.088%)
9,München,311,0.649066,v (12. / 0.439% / 0.210%)
10,"Berlin, Germany",291,0.607325,= (10. / 0.514% / 0.093%)


In [15]:
# Styled ranking of the filtered data, annotated against the sampled data.
s = getPrettyComparisonDataframe(
    df=df_filtered_comparison,
    title="CW 17 - User Locations - Filtered Data (compared to Sampled Data)",
)
s

Unnamed: 0_level_0,location,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Deutschland,18373,3.34633,= (1. / 3.947% / -0.600%)
2,Germany,11744,2.13897,v (4. / 2.158% / -0.019%)
3,"Berlin, Deutschland",9689,1.76469,= (3. / 2.271% / -0.506%)
4,Berlin,9324,1.69821,^ (2. / 2.502% / -0.804%)
5,"Hamburg, Deutschland",4750,0.865132,= (5. / 1.035% / -0.170%)
6,Hamburg,3743,0.681724,= (6. / 0.885% / -0.203%)
7,"München, Bayern",3242,0.590476,v (11. / 0.603% / -0.013%)
8,"Wien, Österreich",3104,0.565341,= (8. / 0.653% / -0.088%)
9,"Köln, Deutschland",2897,0.52764,^ (7. / 0.664% / -0.136%)
10,"Berlin, Germany",2822,0.51398,= (10. / 0.607% / -0.093%)


## User Names

### Calendar Week 03 - User Names - Sampled x Filtered

In [16]:
# Rank the name values of both CW 03 datasets and cross-compare the top 25.
attribute = column_name = 'name'

df_sampled = generateRankingDataframe(
    series=tweetsS03[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF03[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [17]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 03 - User Names - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,name,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Chris,56,0.0759775,v (4. / 0.057% / 0.019%)
2,.,55,0.0746208,^ (1. / 0.086% / -0.011%)
3,Alex,50,0.0678371,^ (2. / 0.064% / 0.004%)
4,Daniel,36,0.0488427,v (5. / 0.054% / -0.005%)
5,ً,36,0.0488427,^ (3. / 0.061% / -0.012%)
6,Michael,36,0.0488427,v (7. / 0.041% / 0.008%)
7,Max,33,0.0447725,^ (6. / 0.045% / -0.000%)
8,Christian,28,0.0379888,v (14. / 0.035% / 0.003%)
9,Marcel,28,0.0379888,v (32. / 0.026% / 0.012%)
10,Stefan,28,0.0379888,v (26. / 0.029% / 0.009%)


In [18]:
# Styled ranking of the filtered data, annotated against the sampled data.
# Caption corrected: this table ranks user names, not URLs.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - User Names - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,name,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,766,0.0857979,v (2. / 0.075% / 0.011%)
2,Alex,571,0.0639564,v (3. / 0.068% / -0.004%)
3,ً,542,0.0607082,v (5. / 0.049% / 0.012%)
4,Chris,513,0.0574599,^ (1. / 0.076% / -0.019%)
5,Daniel,483,0.0540997,^ (4. / 0.049% / 0.005%)
6,Max,404,0.0452511,v (7. / 0.045% / 0.000%)
7,Michael,369,0.0413308,^ (6. / 0.049% / -0.008%)
8,David,344,0.0385306,v (20. / 0.030% / 0.009%)
9,Jan,339,0.0379706,v (13. / 0.035% / 0.003%)
10,Anna,334,0.0374106,v (22. / 0.030% / 0.008%)


### Calendar Week 17 - User Names - Sampled x Filtered

In [19]:
# Rank the name values of both CW 17 datasets and cross-compare the top 25.
attribute = column_name = 'name'

df_sampled = generateRankingDataframe(
    series=tweetsS17[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF17[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [21]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 17 - User Names - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,80,0.131722,= (1. / 0.065% / 0.067%)
2,...,18,0.0296374,= (2. / 0.050% / -0.021%)
3,Hi,14,0.0230513,= (3. / 0.043% / -0.020%)
4,😀,13,0.0214048,= (4. / 0.022% / -0.001%)
5,Daily Healthy Info http://www.healthy44.com,13,0.0214048,v (57. / 0.004% / 0.017%)
6,-,10,0.0164652,v (7. / 0.018% / -0.002%)
7,🇩🇪,10,0.0164652,v (25. / 0.008% / 0.009%)
8,😎,9,0.0148187,v (9. / 0.013% / 0.002%)
9,#RT #MGWV #TFB #FB #paksgallery #mamag_museum #aboutartmagazin #Tanjaplayner Be my friend. 100% #followback all my friends!,9,0.0148187,v (238. / 0.002% / 0.013%)
10,🤭🤭🤭🤭,9,0.0148187,v (166. / 0.002% / 0.013%)


In [22]:
# Styled ranking of the filtered data, annotated against the sampled data.
s = getPrettyComparisonDataframe(
    df=df_filtered_comparison,
    title="CW 17 - User Names - Filtered Data (compared to Sampled Data)",
)
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,421,0.0645524,= (1. / 0.132% / -0.067%)
2,...,328,0.0502926,= (2. / 0.030% / 0.021%)
3,Hi,281,0.0430861,= (3. / 0.023% / 0.020%)
4,😀,143,0.0219264,= (4. / 0.021% / 0.001%)
5,hi,137,0.0210064,v (26. / 0.008% / 0.013%)
6,Journalist,121,0.0185531,v (12. / 0.013% / 0.005%)
7,-,118,0.0180931,^ (6. / 0.016% / 0.002%)
8,:),89,0.0136465,v (17. / 0.012% / 0.002%)
9,😎,82,0.0125732,^ (8. / 0.015% / -0.002%)
10,¯\_(ツ)_/¯,75,0.0114998,v (23. / 0.010% / 0.002%)


## Descriptions

### Calendar Week 03 - Descriptions - Sampled x Filtered

In [20]:
# Rank the description values of both CW 03 datasets and cross-compare the top 25.
attribute = column_name = 'description'

df_sampled = generateRankingDataframe(
    series=tweetsS03[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF03[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [23]:
# Styled ranking of the sampled data, annotated against the filtered data.
# Caption corrected: this table ranks profile descriptions ("Hash" was a leftover label).
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Descriptions - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,80,0.131722,= (1. / 0.065% / 0.067%)
2,...,18,0.0296374,= (2. / 0.050% / -0.021%)
3,Hi,14,0.0230513,= (3. / 0.043% / -0.020%)
4,😀,13,0.0214048,= (4. / 0.022% / -0.001%)
5,Daily Healthy Info http://www.healthy44.com,13,0.0214048,v (57. / 0.004% / 0.017%)
6,-,10,0.0164652,v (7. / 0.018% / -0.002%)
7,🇩🇪,10,0.0164652,v (25. / 0.008% / 0.009%)
8,😎,9,0.0148187,v (9. / 0.013% / 0.002%)
9,#RT #MGWV #TFB #FB #paksgallery #mamag_museum #aboutartmagazin #Tanjaplayner Be my friend. 100% #followback all my friends!,9,0.0148187,v (238. / 0.002% / 0.013%)
10,🤭🤭🤭🤭,9,0.0148187,v (166. / 0.002% / 0.013%)


In [24]:
# Styled ranking of the filtered data, annotated against the sampled data.
# Caption corrected: this table ranks profile descriptions ("Hash" was a leftover label).
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Descriptions - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,421,0.0645524,= (1. / 0.132% / -0.067%)
2,...,328,0.0502926,= (2. / 0.030% / 0.021%)
3,Hi,281,0.0430861,= (3. / 0.023% / 0.020%)
4,😀,143,0.0219264,= (4. / 0.021% / 0.001%)
5,hi,137,0.0210064,v (26. / 0.008% / 0.013%)
6,Journalist,121,0.0185531,v (12. / 0.013% / 0.005%)
7,-,118,0.0180931,^ (6. / 0.016% / 0.002%)
8,:),89,0.0136465,v (17. / 0.012% / 0.002%)
9,😎,82,0.0125732,^ (8. / 0.015% / -0.002%)
10,¯\_(ツ)_/¯,75,0.0114998,v (23. / 0.010% / 0.002%)


### Calendar Week 17 - Descriptions - Sampled x Filtered

In [25]:
# Rank the description values of both CW 17 datasets and cross-compare the top 25.
attribute = column_name = 'description'

df_sampled = generateRankingDataframe(
    series=tweetsS17[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF17[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [26]:
# Styled ranking of the sampled data, annotated against the filtered data.
# Caption corrected: the frames in this section are built from the CW 17 datasets, not CW 03.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Descriptions - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,—,72,0.124389,v (497. / 0.001% / 0.123%)
2,RT,40,0.0691049,-
3,.,33,0.0570116,^ (1. / 0.060% / -0.003%)
4,...,18,0.0310972,^ (2. / 0.044% / -0.013%)
5,Journalist,13,0.0224591,v (6. / 0.016% / 0.007%)
6,😀,12,0.0207315,^ (4. / 0.021% / -0.001%)
7,18,9,0.0155486,v (10. / 0.014% / 0.002%)
8,😎,9,0.0155486,v (13. / 0.013% / 0.003%)
9,Daily Healthy Info http://www.healthy44.com,9,0.0155486,v (79. / 0.004% / 0.012%)
10,:),8,0.013821,^ (8. / 0.015% / -0.002%)


In [27]:
# Styled ranking of the filtered data, annotated against the sampled data.
# Caption corrected: this table ranks profile descriptions ("Hash" was a leftover label).
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Descriptions - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,description,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,.,385,0.0597601,v (3. / 0.057% / 0.003%)
2,...,282,0.0437723,v (4. / 0.031% / 0.013%)
3,Hi,222,0.034459,v (12. / 0.012% / 0.022%)
4,😀,137,0.0212653,v (6. / 0.021% / 0.001%)
5,hi,117,0.0181608,v (23. / 0.009% / 0.010%)
6,Journalist,101,0.0156773,^ (5. / 0.022% / -0.007%)
7,-,100,0.0155221,v (162. / 0.003% / 0.012%)
8,:),99,0.0153669,v (10. / 0.014% / 0.002%)
9,¯\_(ツ)_/¯,90,0.0139699,v (19. / 0.010% / 0.004%)
10,18,88,0.0136594,^ (7. / 0.016% / -0.002%)


In [29]:
# Inspect the CW 17 sampled rows whose description is exactly "RT".
is_rt_description = tweetsS17['description'] == "RT"
tweetsS17.loc[is_rt_description]

Unnamed: 0,id,name,location,url,description
1238,982320624822562816,;,,,RT
13626,981663882900131846,💧,"Penha, São Paulo",http://selenagomez.com,RT
13746,987381455520260096,;,,,RT
13773,987065314557988864,;,,,RT
13990,982691650567856129,;,,,RT
14076,986592636298518528,;,,,RT
14229,984240677209956352,;,,,RT
14480,988483698558464002,;,,,RT
14898,982362391915712512,;,,,RT
15827,981660327447334917,;,,,RT


## URLs

### Calendar Week 03 - URLs - Sampled x Filtered

In [30]:
# Rank the url values of both CW 03 datasets and cross-compare the top 25.
attribute = column_name = 'url'

df_sampled = generateRankingDataframe(
    series=tweetsS03[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF03[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [31]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 03 - URLs - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,http://bit.ly/2pPV2ar,89,0.2794,v (2. / 0.044% / 0.235%)
2,http://tinyurl.com/yao32uh9,34,0.106737,v (7. / 0.022% / 0.085%)
3,https://tinyurl.com/yao32uh9,32,0.100458,v (17. / 0.015% / 0.085%)
4,https://Xinder.Date/MeetMeAndFuckNow/?source=twitter&sub=farm&sub2=pflink,26,0.0816224,^ (1. / 0.059% / 0.023%)
5,https://goo.gl/Y7DK8M,21,0.0659258,v (30. / 0.011% / 0.055%)
6,http://www.meteosphaere.de/,20,0.0627865,v (58. / 0.006% / 0.057%)
7,http://dienstleistungstausch.org,16,0.0502292,v (21. / 0.013% / 0.037%)
8,http://www.bild.de,14,0.0439505,v (9. / 0.020% / 0.024%)
9,http://www.healthy44.com,13,0.0408112,v (35. / 0.009% / 0.032%)
10,http://www.kicker.de,12,0.0376719,^ (6. / 0.023% / 0.015%)


In [32]:
# Styled ranking of the filtered data, annotated against the sampled data.
s = getPrettyComparisonDataframe(
    df=df_filtered_comparison,
    title="CW 03 - URLs - Filtered Data (compared to Sampled Data)",
)
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://Xinder.Date/MeetMeAndFuckNow/?source=twitter&sub=farm&sub2=pflink,188,0.058797,v (4. / 0.082% / -0.023%)
2,http://bit.ly/2pPV2ar,142,0.0444105,^ (1. / 0.279% / -0.235%)
3,http://m.imdb.com/name/nm0395476,96,0.030024,-
4,https://twitter.com/Louis_Tomlinson/status/120620074301267968,87,0.0272093,v (160. / 0.006% / 0.021%)
5,https://twitter.com,81,0.0253328,v (215. / 0.006% / 0.019%)
6,http://www.kicker.de,72,0.022518,v (10. / 0.038% / -0.015%)
7,http://tinyurl.com/yao32uh9,70,0.0218925,^ (2. / 0.107% / -0.085%)
8,http://www.google.com,66,0.0206415,v (136. / 0.009% / 0.011%)
9,http://www.bild.de,64,0.020016,^ (8. / 0.044% / -0.024%)
10,http://hstyles.co.uk,62,0.0193905,v (22904. / 0.003% / 0.016%)


### Calendar Week 17 - URLs - Sampled x Filtered

In [33]:
# Rank the url values of both CW 17 datasets and cross-compare the top 25.
attribute = column_name = 'url'

df_sampled = generateRankingDataframe(
    series=tweetsS17[attribute].value_counts(),
    attribute_name=column_name,
)
df_filtered = generateRankingDataframe(
    series=tweetsF17[attribute].value_counts(),
    attribute_name=column_name,
)

df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df1=df_sampled, df2=df_filtered, column_name=column_name, size=25)

In [34]:
# Styled ranking of the sampled data, annotated against the filtered data.
s = getPrettyComparisonDataframe(
    df=df_sampled_comparison,
    title="CW 17 - URLs - Sampled Data (compared to Filtered Data)",
)
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,http://www.kicker.de,17,0.0568011,v (3. / 0.023% / 0.034%)
2,https://twitter.com,15,0.0501186,^ (1. / 0.036% / 0.014%)
3,http://youtube.com,14,0.0467774,v (8. / 0.018% / 0.029%)
4,http://www.meteosphaere.de/,13,0.0434361,v (53. / 0.006% / 0.037%)
5,http://www.pegasi.de,11,0.0367537,v (123. / 0.004% / 0.033%)
6,http://www.bild.de,10,0.0334124,v (14. / 0.016% / 0.017%)
7,http://www.welt.de,10,0.0334124,v (22. / 0.012% / 0.021%)
8,https://www.twitter.com,10,0.0334124,^ (6. / 0.019% / 0.015%)
9,http://facebook.com,10,0.0334124,v (13. / 0.016% / 0.017%)
10,http://www.facebook.com,10,0.0334124,v (15. / 0.015% / 0.019%)


In [35]:
# Styled ranking of the filtered data, annotated against the sampled data.
s = getPrettyComparisonDataframe(
    df=df_filtered_comparison,
    title="CW 17 - URLs - Filtered Data (compared to Sampled Data)",
)
s

Unnamed: 0_level_0,url,value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://twitter.com,112,0.0356744,v (2. / 0.050% / -0.014%)
2,http://bit.ly/2JrdiAQ,86,0.0273928,v (28094. / 0.003% / 0.024%)
3,http://www.kicker.de,73,0.023252,^ (1. / 0.057% / -0.034%)
4,http://hstyles.co.uk,68,0.0216594,v (37. / 0.017% / 0.005%)
5,http://btsblog.ibighit.com,60,0.0191113,v (93. / 0.010% / 0.009%)
6,https://www.twitter.com,59,0.0187927,v (8. / 0.033% / -0.015%)
7,http://privatmy.com,57,0.0181557,-
8,http://youtube.com,56,0.0178372,^ (3. / 0.047% / -0.029%)
9,https://twitter.com/Louis_Tomlinson/status/120620074301267968,55,0.0175187,v (52. / 0.013% / 0.004%)
10,http://www.google.com,53,0.0168816,v (391. / 0.007% / 0.010%)
