# Tweet Variation Data Comparison (Sampled vs Filtered)

In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re


# Plot styling: muted seaborn palette, slightly desaturated, applied notebook-wide.
color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
# NOTE(review): -1 presumably disables column-width truncation so the long
# comparison strings render in full; recent pandas releases expect None
# instead of a negative value -- confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# CSS rules handed to Styler.set_table_styles() in getPrettyComparisonDataframe():
# centered cells with a minimum width, dotted row separators, centered headers.
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

# NOTE(review): neither `directory` nor `stream` is referenced by any cell
# visible in this notebook; presumably leftovers from a sibling notebook --
# confirm before removing.
directory = "url_top_lists/"
stream = "comparison"

## Util Methods

In [2]:
def compareRows(row, df_to_compare, column_name):
    """Describe where `row`'s entry sits in `df_to_compare`.

    The entry is looked up by matching `row[column_name]` against
    `df_to_compare[column_name]`; only the first match is used.

    Returns:
        " - " when the entry does not occur in `df_to_compare`. Otherwise a
        marker followed by "(other rank. / other percentage% / percentage diff%)",
        where the diff is `row['percentage']` minus the other percentage and
        the marker is:
          " = <br>"  both ranks are equal,
          " v <br>"  the other frame's rank number is larger than this row's,
          " ^ <br>"  the other frame's rank number is smaller than this row's.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    other_rank = matches['rank'].values[0]
    other_percentage = matches['percentage'].values[0]
    details = "(%s. / %.3f%% / %.3f%%)" % (
        other_rank, other_percentage, row['percentage'] - other_percentage)

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + details

def getOpacity(val):
    """Map the percentage difference embedded in a comparison string to an opacity.

    `val` is expected to look like the strings built by compareRows(), i.e.
    "... / <percentage>% / <difference>%)": the text after the second "/" is
    scanned for the first number and its absolute value is bucketed.

    Smaller differences give a more opaque colour (1 for < 0.005 down to 0.05).
    Differences of 100 or more used to fall off the end of the function and
    return None, which produced an invalid "rgba(..., None)" style; they now
    share the faintest bucket.

    Raises:
        IndexError: if `val` has fewer than two "/" separators or the third
            segment contains no number.
    """
    difference_text = val.split("/")[2]
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", difference_text)[0]))

    # (exclusive upper bound, opacity), checked in ascending order.
    buckets = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
    )
    for upper_bound, opacity in buckets:
        if value < upper_bound:
            return opacity
    # Everything from 80 upwards (including >= 100) gets the faintest tint.
    return 0.05

def colorComparisonField(val):
    """Return the CSS background for one table cell.

    Only strings carrying a comparison marker are coloured:
      " ^ " / " v "  -> orange, opacity taken from getOpacity(val)
      " = "          -> green, opacity taken from getOpacity(val)
      exactly " - "  -> solid red (entry missing in the other frame)
    Every other value (including non-strings) gets an empty style.
    """
    if not isinstance(val, str):
        return ''

    rank_changed = ' ^ ' in val or ' v ' in val
    if rank_changed:
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)
    if ' = ' in val:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)
    # A three-character string containing " - " can only be " - " itself.
    if val == ' - ':
        return 'background-color: #e55039'
    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a Series of counts into a ranking table.

    Args:
        series: counts keyed by label; ranks follow the Series' existing
            order (1 for the first entry), so it should already be sorted.
        attribute_name: column name under which the Series' labels are stored.

    Returns:
        DataFrame with columns 'rank', `attribute_name`, 'value' (the count)
        and 'percentage' (the count's share of the Series total, times 100).
    """
    size = series.sum()

    labels = []
    counts = []
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for label, count in series.items():
        labels.append(label)
        counts.append(count)

    data = {
        'rank': list(range(1, len(counts) + 1)),
        attribute_name: labels,
        'value': counts,
        'percentage': [(count / size) * 100 for count in counts],
    }
    return pd.DataFrame(data=data)

def generateComparisonDataframes(df1, df2, column_name, size):
    """Build the two mirrored comparison tables for a pair of ranking frames.

    Both inputs are ranking frames with 'rank', `column_name`, 'value' and
    'percentage' columns. For each frame the first `size` rows are kept and a
    'difference (rank / percentage / diff)' column is added, holding
    compareRows() of every row whose rank is <= `size` against the other frame.

    Returns:
        (df1_compared, df2_compared), each indexed by 'rank'.
    """
    def build_side(base, other):
        # One compareRows() result per row of `base` that is ranked within `size`.
        notes = [compareRows(row, other, column_name)
                 for _, row in base.iterrows()
                 if row['rank'] <= size]
        columns = {
            'rank': base['rank'][:size],
            column_name: base[column_name][:size],
            'value': base['value'][:size],
            'percentage': base['percentage'][:size],
            'difference (rank / percentage / diff)': notes,
        }
        return pd.DataFrame(data=columns).set_index(keys='rank')

    df1_compared = build_side(df1, df2)
    df2_compared = build_side(df2, df1)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler for `df` with comparison colouring, caption and table CSS.

    Every cell is passed through colorComparisonField(); `title` becomes the
    caption and the module-level `table_styles` rules are attached.
    """
    return (
        df.style
        .applymap(colorComparisonField)
        .set_caption(title)
        .set_table_styles(table_styles)
    )

  
def getIdentifier(retweet_option, quote_option, reply_option, icard_option):
    """Encode the four flags as a string such as "1 0 0 1 ".

    Each flag contributes "1 " when truthy and "0 " otherwise, in the order
    retweet, quote, reply, icard; the result therefore ends with a space.
    """
    flags = (retweet_option, quote_option, reply_option, icard_option)
    return "".join("1 " if flag else "0 " for flag in flags)

def generateTweetsVariationDataframe(dataframe):
    """Count tweets for every combination of the four boolean tweet flags.

    Args:
        dataframe: frame with boolean columns 'is_retweet', 'is_quote',
            'is_reply', 'has_icard' and a 'response_code' column.

    Returns:
        Single-column DataFrame ('value') indexed by 'combination' (the
        getIdentifier() string for the flag combination), sorted by count in
        descending order. Combinations with has_icard set only count rows
        whose response_code is present.
    """
    options = [False, True]
    combinations = [
        (retweet_option, quote_option, reply_option, icard_option)
        for retweet_option in options
        for quote_option in options
        for reply_option in options
        for icard_option in options
    ]

    identifier_list = []
    value_list = []

    for retweet_option, quote_option, reply_option, icard_option in combinations:
        identifier_list.append(getIdentifier(retweet_option, quote_option, reply_option, icard_option))

        mask = ((dataframe['is_retweet'] == retweet_option) &
                (dataframe['is_quote'] == quote_option) &
                (dataframe['is_reply'] == reply_option) &
                (dataframe['has_icard'] == icard_option))
        if icard_option:
            # The previous `response_code != None` comparison is elementwise
            # True for every row (missing values included), so it filtered
            # nothing; notna() performs the intended missing-value check.
            mask = mask & dataframe['response_code'].notna()

        value_list.append(int(mask.sum()))

    data = {'combination': identifier_list, "value": value_list}
    df = pd.DataFrame(data=data)

    return df.sort_values("value", ascending=False).set_index("combination")

In [3]:
# Open one connection per data set (S = sampled, F = filtered; 03 / 17 =
# calendar week) and print each server's version as a connectivity check.
# The conn* objects are left open because the query cells below read from them.
# NOTE(review): `conn` is never reassigned or read in this cell.
conn = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()
    
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)
    
    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()
    
    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')
    
    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')
    
    # fetch and display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()
    
    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

    # close the cursors only; the connections stay open for later cells
    curS17.close()
    curS03.close()
    curF17.close()
    curF03.close()

# Any failure is only printed, so a later cell that uses a conn* name which was
# never assigned will fail with a NameError instead of the original error.
# `Exception` already covers psycopg2.DatabaseError; cursors opened before a
# failure are not closed on this path.
except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)


## Query Tweets

### Calendar Week 03 - Sampled

In [6]:
# CW 03, sampled stream: join tweets_info with tweets_icards on the tweet id to
# get the tweet-type flags plus has_icard / response_code for every tweet.
tweets_query = '''SELECT id, is_retweet, is_quote, is_reply, tc.has_icard, tc.response_code FROM tweets_info as ti 
                                 INNER JOIN tweets_icards as tc ON ti.id = tc.tweet_id;'''
tweetsS03 = pd.read_sql_query(tweets_query, connS03)

tweet_count = len(tweetsS03)
print("Number of Tweets: %s" % tweet_count)
tweetsS03.head()

Number of Tweets: 123680


Unnamed: 0,id,is_retweet,is_quote,is_reply,has_icard,response_code
0,952676745496354816,False,False,False,False,
1,952676816820621314,False,False,False,True,200.0
2,952676816820555776,False,False,False,True,200.0
3,952676825183944705,False,False,False,False,
4,952676971980513281,True,False,False,False,


### Calendar Week 17 - Sampled

In [7]:
# CW 17, sampled stream: join tweets_info with tweets_icards on the tweet id to
# get the tweet-type flags plus has_icard / response_code for every tweet.
tweets_query = '''SELECT id, is_retweet, is_quote, is_reply, tc.has_icard, tc.response_code FROM tweets_info as ti 
                                 INNER JOIN tweets_icards as tc ON ti.id = tc.tweet_id;'''
tweetsS17 = pd.read_sql_query(tweets_query, connS17)

tweet_count = len(tweetsS17)
print("Number of Tweets: %s" % tweet_count)
tweetsS17.head()

Number of Tweets: 112002


Unnamed: 0,id,is_retweet,is_quote,is_reply,has_icard,response_code
0,988175619098861568,False,False,False,False,
1,988175627453952002,False,False,False,False,
2,988175778474020868,False,False,False,True,200.0
3,988175828801540097,False,False,False,False,
4,987388203593322496,False,False,False,False,


## Sampled Concatenation

In [8]:
# Stack both sampled weeks into one frame. pd.concat keeps each input's own
# row index, so index labels repeat in tweetsS.
# NOTE(review): `frames` is reused for the filtered data further down.
frames = [tweetsS03, tweetsS17]
tweetsS = pd.concat(frames)

### Calendar Week 03 - Filtered

In [12]:
# CW 03, filtered stream: join tweets_info with tweets_icards on the tweet id to
# get the tweet-type flags plus has_icard / response_code for every tweet.
tweets_query = '''SELECT id, is_retweet, is_quote, is_reply, tc.has_icard, tc.response_code FROM tweets_info as ti 
                                 INNER JOIN tweets_icards as tc ON ti.id = tc.tweet_id;'''
tweetsF03 = pd.read_sql_query(tweets_query, connF03)

tweet_count = len(tweetsF03)
print("Number of Tweets: %s" % tweet_count)
tweetsF03.head()

Number of Tweets: 8010674


Unnamed: 0,id,is_retweet,is_quote,is_reply,has_icard,response_code
0,952676704803262465,False,False,True,False,
1,952676814618529793,False,False,False,True,200.0
2,952676815331627008,False,False,False,True,200.0
3,952676829881667586,False,False,False,True,200.0
4,952676875805093889,False,False,False,True,200.0


### Calendar Week 17 - Filtered

In [13]:
# CW 17, filtered stream: join tweets_info with tweets_icards on the tweet id to
# get the tweet-type flags plus has_icard / response_code for every tweet.
tweets_query = '''SELECT id, is_retweet, is_quote, is_reply, tc.has_icard, tc.response_code FROM tweets_info as ti 
                                 INNER JOIN tweets_icards as tc ON ti.id = tc.tweet_id;'''
tweetsF17 = pd.read_sql_query(tweets_query, connF17)

tweet_count = len(tweetsF17)
print("Number of Tweets: %s" % tweet_count)
tweetsF17.head()

Number of Tweets: 7269345


Unnamed: 0,id,is_retweet,is_quote,is_reply,has_icard,response_code
0,989558776553443329,True,False,False,False,
1,989558800460996609,False,False,True,False,
2,988872048591212550,True,False,False,False,
3,989887974476255232,False,False,False,False,
4,989558823542312960,False,False,False,False,


## Filtered Concatenation

In [14]:
# Stack both filtered weeks into one frame. pd.concat keeps each input's own
# row index, so index labels repeat in tweetsF.
# NOTE(review): this rebinds `frames`, which earlier held the sampled frames.
frames = [tweetsF03, tweetsF17]
tweetsF = pd.concat(frames)

## Tweet Variations

### Calendar Week 03 - Tweet Variations - Sampled x Filtered

In [36]:
# CW 03: count tweets per flag combination for each data set, rank the
# combinations, then build the two mirrored comparison tables.
column_name = 'is_retweet, is_quote, is_reply, has_icard'

variation_counts_sampled = generateTweetsVariationDataframe(tweetsS03).squeeze()
df_sampled = generateRankingDataframe(variation_counts_sampled, column_name)

variation_counts_filtered = generateTweetsVariationDataframe(tweetsF03).squeeze()
df_filtered = generateRankingDataframe(variation_counts_filtered, column_name)

# Keep at most the 25 top-ranked combinations per table.
top_n = 25
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df_sampled, df_filtered, column_name, top_n)

In [37]:
# Render the sampled-side table; the last column describes each combination's
# rank and percentage in the filtered data.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 03 - Tweet Variations - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 0 0,44894,36.2985,= (1. / 27.799% / 8.500%)
2,0 0 1 0,26984,21.8176,= (2. / 26.581% / -4.763%)
3,1 0 0 0,24055,19.4494,= (3. / 24.228% / -4.779%)
4,0 0 0 1,14585,11.7925,= (4. / 9.562% / 2.231%)
5,1 0 0 1,5238,4.23512,= (5. / 5.490% / -1.255%)
6,0 1 0 0,4727,3.82196,= (6. / 3.006% / 0.816%)
7,1 1 0 0,2300,1.85964,= (7. / 2.602% / -0.742%)
8,0 0 1 1,658,0.532018,= (8. / 0.514% / 0.018%)
9,0 1 1 0,152,0.122898,= (9. / 0.126% / -0.003%)
10,0 1 0 1,61,0.0493208,= (10. / 0.049% / 0.001%)


In [38]:
# Render the filtered-side table; the last column describes each combination's
# rank and percentage in the sampled data.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 03 - Tweet Variations - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 0 0,2226852,27.7986,= (1. / 36.299% / -8.500%)
2,0 0 1 0,2129316,26.581,= (2. / 21.818% / 4.763%)
3,1 0 0 0,1940857,24.2284,= (3. / 19.449% / 4.779%)
4,0 0 0 1,765982,9.56202,= (4. / 11.793% / -2.231%)
5,1 0 0 1,439766,5.48975,= (5. / 4.235% / 1.255%)
6,0 1 0 0,240814,3.00616,= (6. / 3.822% / -0.816%)
7,1 1 0 0,208435,2.60197,= (7. / 1.860% / 0.742%)
8,0 0 1 1,41148,0.513665,= (8. / 0.532% / -0.018%)
9,0 1 1 0,10088,0.125932,= (9. / 0.123% / 0.003%)
10,0 1 0 1,3893,0.0485977,= (10. / 0.049% / -0.001%)


### Calendar Week 17 - Tweet Variations - Sampled x Filtered

In [40]:
# CW 17: count tweets per flag combination for each data set, rank the
# combinations, then build the two mirrored comparison tables.
column_name = 'is_retweet, is_quote, is_reply, has_icard'

variation_counts_sampled = generateTweetsVariationDataframe(tweetsS17).squeeze()
df_sampled = generateRankingDataframe(variation_counts_sampled, column_name)

variation_counts_filtered = generateTweetsVariationDataframe(tweetsF17).squeeze()
df_filtered = generateRankingDataframe(variation_counts_filtered, column_name)

# Keep at most the 25 top-ranked combinations per table.
top_n = 25
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df_sampled, df_filtered, column_name, top_n)

In [18]:
# Render the sampled-side table for CW 17.
# NOTE(review): this cell's execution count (In [18]) is lower than that of the
# cell building the CW 17 frames (In [40]), and the saved output is identical
# to the combined-weeks sampled table further down -- the stored table looks
# stale; re-run this cell after the CW 17 comparison cell.
s = getPrettyComparisonDataframe(df_sampled_comparison, "CW 17 - Tweets Variations - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 0 0,82095,34.8334,v (2. / 26.312% / 8.521%)
2,0 0 1 0,52583,22.3113,^ (1. / 27.166% / -4.855%)
3,1 0 0 0,44424,18.8494,= (3. / 23.887% / -5.037%)
4,0 0 0 1,29692,12.5985,= (4. / 10.048% / 2.551%)
5,1 0 0 1,11182,4.74459,= (5. / 6.074% / -1.329%)
6,0 1 0 0,9340,3.96302,= (6. / 3.123% / 0.840%)
7,1 1 0 0,4347,1.84446,= (7. / 2.579% / -0.735%)
8,0 0 1 1,1491,0.63264,= (8. / 0.574% / 0.059%)
9,0 1 1 0,289,0.122624,= (9. / 0.132% / -0.010%)
10,0 1 0 1,153,0.0649188,= (10. / 0.055% / 0.010%)


In [24]:
# Export the styled table currently bound to `s` to output.xlsx.
# NOTE(review): `s` is whichever table was rendered most recently, so the
# exported content depends on cell execution order.
# The context manager writes and closes the workbook on exit (also when
# to_excel raises); ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter('output.xlsx') as writer:
    s.to_excel(writer, sheet_name='Sheet1')



In [42]:
# Render the filtered-side table for CW 17; the last column describes each
# combination's rank and percentage in the sampled data.
s = getPrettyComparisonDataframe(df_filtered_comparison, "CW 17 - Tweet Variations - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 1 0,2021622,27.8102,v (2. / 22.856% / 4.954%)
2,0 0 0 0,1793640,24.674,^ (1. / 33.215% / -8.541%)
3,1 0 0 0,1709043,23.5103,= (3. / 18.187% / 5.324%)
4,0 0 0 1,769333,10.5833,= (4. / 13.489% / -2.905%)
5,1 0 0 1,488297,6.71721,= (5. / 5.307% / 1.410%)
6,0 1 0 0,236404,3.25207,= (6. / 4.119% / -0.867%)
7,1 1 0 0,185651,2.55389,= (7. / 1.828% / 0.726%)
8,0 0 1 1,46550,0.64036,= (8. / 0.744% / -0.103%)
9,0 1 1 0,10111,0.139091,= (9. / 0.122% / 0.017%)
10,0 1 0 1,4446,0.0611609,= (10. / 0.082% / -0.021%)


### Calendar Weeks 03 + 17 Combined - Tweet Variations - Sampled x Filtered

In [25]:
# Both weeks combined (tweetsS / tweetsF): count tweets per flag combination
# for each data set, rank the combinations, then build the comparison tables.
column_name = 'is_retweet, is_quote, is_reply, has_icard'

variation_counts_sampled = generateTweetsVariationDataframe(tweetsS).squeeze()
df_sampled = generateRankingDataframe(variation_counts_sampled, column_name)

variation_counts_filtered = generateTweetsVariationDataframe(tweetsF).squeeze()
df_filtered = generateRankingDataframe(variation_counts_filtered, column_name)

# Keep at most the 25 top-ranked combinations per table.
top_n = 25
df_sampled_comparison, df_filtered_comparison = generateComparisonDataframes(
    df_sampled, df_filtered, column_name, top_n)

In [27]:
# Render the sampled-side table for both weeks combined; the last column
# describes each combination's rank and percentage in the filtered data.
s = getPrettyComparisonDataframe(df_sampled_comparison, "Tweets Variations - Sampled Data (compared to Filtered Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 0 0,82095,34.8334,v (2. / 26.312% / 8.521%)
2,0 0 1 0,52583,22.3113,^ (1. / 27.166% / -4.855%)
3,1 0 0 0,44424,18.8494,= (3. / 23.887% / -5.037%)
4,0 0 0 1,29692,12.5985,= (4. / 10.048% / 2.551%)
5,1 0 0 1,11182,4.74459,= (5. / 6.074% / -1.329%)
6,0 1 0 0,9340,3.96302,= (6. / 3.123% / 0.840%)
7,1 1 0 0,4347,1.84446,= (7. / 2.579% / -0.735%)
8,0 0 1 1,1491,0.63264,= (8. / 0.574% / 0.059%)
9,0 1 1 0,289,0.122624,= (9. / 0.132% / -0.010%)
10,0 1 0 1,153,0.0649188,= (10. / 0.055% / 0.010%)


In [26]:
# Render the filtered-side table for both weeks combined; the last column
# describes each combination's rank and percentage in the sampled data.
s = getPrettyComparisonDataframe(df_filtered_comparison, "Tweet Variations - Filtered Data (compared to Sampled Data)")
s

Unnamed: 0_level_0,"is_retweet, is_quote, is_reply, has_icard",value,percentage,difference (rank / percentage / diff)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0 0 1 0,4150938,27.1658,v (2. / 22.311% / 4.855%)
2,0 0 0 0,4020492,26.3121,^ (1. / 34.833% / -8.521%)
3,1 0 0 0,3649900,23.8868,= (3. / 18.849% / 5.037%)
4,0 0 0 1,1535315,10.0479,= (4. / 12.598% / -2.551%)
5,1 0 0 1,928063,6.0737,= (5. / 4.745% / 1.329%)
6,0 1 0 0,477218,3.12315,= (6. / 3.963% / -0.840%)
7,1 1 0 0,394086,2.57909,= (7. / 1.844% / 0.735%)
8,0 0 1 1,87698,0.573939,= (8. / 0.633% / -0.059%)
9,0 1 1 0,20199,0.132192,= (9. / 0.123% / 0.010%)
10,0 1 0 1,8339,0.0545745,= (10. / 0.065% / -0.010%)
