In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re
from urllib.parse import parse_qs,urlparse


color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
pd.set_option('display.max_colwidth', -1)
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

directory = "url_top_lists/"
stream = "comparison"

  """)
  return f(*args, **kwds)


In [2]:
def _server_version(connection):
    """Run ``SELECT version()`` on `connection` and return the single result row.

    The cursor is closed even when the query raises.
    """
    cursor = connection.cursor()
    try:
        cursor.execute('SELECT version()')
        return cursor.fetchone()
    finally:
        cursor.close()


# Errors are deliberately not caught here: the cells below need these
# connections, so a failed connect should stop the run with its real traceback
# instead of being printed and resurfacing later as a NameError.

# read connection parameters
paramsS17 = config.cfgAzureS17()
paramsS03 = config.cfgAzureS03()

paramsF17 = config.cfgAzureF17()
paramsF03 = config.cfgAzureF03()

# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
connS17 = psycopg2.connect(**paramsS17)
connS03 = psycopg2.connect(**paramsS03)

connF17 = psycopg2.connect(**paramsF17)
connF03 = psycopg2.connect(**paramsF03)

# display the PostgreSQL database server version of each connection
print('PostgreSQL database version:')
for connection in (connS17, connS03, connF17, connF03):
    print(_server_version(connection))

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)


# Sampled Data

## URLs

In [3]:
# Load the URL table from each of the two sampled databases.
urlsS03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS03)
print("# of URLs sampled 03: %s" % len(urlsS03))

urlsS17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS17)
print("# of URLs sampled 17: %s" % len(urlsS17))

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows in the
# same order (17 first, then 03) and, like append, keeps each frame's original
# index labels, so the combined index contains duplicates.
urlsS = pd.concat([urlsS17, urlsS03])
print("# of URLs sampled: %s" % len(urlsS))

urlsS.head()

# of URLs sampled 03: 40339
# of URLs sampled 17: 35687
# of URLs sampled: 76026


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,989097603664138240,http://arte.tv/abgedreht,https://www.arte.tv/de/videos/RC-014033/abgedreht/,200.0,https://www.arte.tv/,arte.tv,False,False,True,False
1,988175933659021318,https://twitter.com/piersmorgan/status/987388203593322496,https://twitter.com/piersmorgan/status/987388203593322496,200.0,https://twitter.com/,twitter.com,True,False,True,False
2,988176164358361088,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,200.0,https://www.journal.koeln/,journal.koeln,False,False,True,False
3,988176403412766720,http://www.radionomy.com/erika1,https://www.radionomy.com/en/radio/erika1,200.0,https://www.radionomy.com/,radionomy.com,False,False,True,False
4,988176415995592704,https://twitter.com/JanLatten/status/988161845205913600,https://twitter.com/JanLatten/status/988161845205913600,200.0,https://twitter.com/,twitter.com,True,False,True,False


## Tweets

In [4]:
# Load the tweet table from each of the two sampled databases.
tweetsS03 = pd.read_sql_query("SELECT * FROM tweets_info;", connS03)
print("# of Tweets sampled 03: %s" % len(tweetsS03))

tweetsS17 = pd.read_sql_query("SELECT * FROM tweets_info;", connS17)
print("# of tweets sampled 17: %s" % len(tweetsS17))

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows in the
# same order (17 first, then 03) and, like append, keeps each frame's original
# index labels, so the combined index contains duplicates.
tweetsS = pd.concat([tweetsS17, tweetsS03])
print("# of tweets sampled: %s" % len(tweetsS))

tweetsS.head()

# of Tweets sampled 03: 123680
# of tweets sampled 17: 112003
# of tweets sampled: 235683


Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
0,988175614908788736,1647758059,"Johangeorgenstadt 23:59 Nachtzeit/Trocken/Klar ,Temp: 14°C, Hum:66%, Wind:WSW 0-&gt;24kmh, Baro: 1012.80hPa, Regen: 0.00mm #wetter",Sun Apr 22 22:00:00 +0000 2018,"<a href=""http://saratoga-weather.org/scripts-TweetWX.php#TweetWX"" rel=""nofollow"">TweetWX</a>",de,False,False,,False,...,0,False,False,wetter,,0,False,,,
1,988175619098861568,234778734,Beim nächsten Ton ist es 0 Uhr: ♩♩♩,Sun Apr 22 22:00:01 +0000 2018,"<a href=""http://www.rasputin.de/"" rel=""nofollow"">rasputin</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
2,988175627453952002,821834588158853120,[d] DiplIngMarkus Verbrauch gestern: 3.71 kWh Uptime: 41 days 21:01:04 since 23:48:32 13/10/2017,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://arduino-tweet.appspot.com/"" rel=""nofollow"">Arduino</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
3,988175627479146498,859148218915069952,Alien Mutterschiff sozusagen \n(Hab das Gefühl bin schwanger) \n(Das gibt dann rr Stress) https://t.co/z4uHz2dugO,Sun Apr 22 22:00:03 +0000 2018,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",de,False,False,,False,...,0,False,False,,,0,False,,,
4,988175681996681218,3206249890,@MDegen55 🇩🇪🇩🇪 Gute Nacht 🌃🌃🌃 https://t.co/OTnsAsDGrc,Sun Apr 22 22:00:16 +0000 2018,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",de,False,False,,False,...,0,False,False,,3206249890.0,0,False,,,


# Little Helpers 

In [5]:
def getTweetsByURL(url, urls, tweets):
    """Return the rows of `tweets` that are linked to `url`.

    A tweet is linked when its `id` appears as `tweet_id` in a row of `urls`
    whose `resolved_url` equals `url` exactly (plain string comparison).

    Parameters
    ----------
    url : str
        Address to look up in the `resolved_url` column.
    urls : pandas.DataFrame
        Frame with at least the columns `resolved_url` and `tweet_id`.
    tweets : pandas.DataFrame
        Frame with at least the column `id`.

    Returns
    -------
    pandas.DataFrame
        The matching subset of `tweets`; empty when nothing matches.
    """
    rows_for_url = urls[urls['resolved_url'] == url]
    linked_ids = rows_for_url['tweet_id']
    is_linked = tweets['id'].isin(linked_ids)
    return tweets[is_linked]

In [28]:
# Example lookup: sampled tweets linked to this exact resolved URL.
getTweetsByURL(url='https://sec.help.ch', urls=urlsS, tweets=tweetsS)

Unnamed: 0,id,user_id,text,created_at,source,lang,truncated,is_retweet,retweet_id,is_quote,...,favorite_count,favorited,retweeted,hashtags,user_mentions,number_of_urls,extracted,coordinates_type,coordinates_long,coordinates_lat
