In [1]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re
from urllib.parse import parse_qs,urlparse


color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
pd.set_option('display.max_colwidth', -1)
table_styles = [{'selector': 'td',
                 'props': [('min-width', '100px'), ('text-align', 'center')]},
                {'selector': 'tr',
                 'props': [('border-bottom', '1px dotted black')]},
                {'selector': 'th',
                 'props': [('text-align', 'center')]}
               ]

%matplotlib inline

directory = "url_top_lists/"
stream = "comparison"

  """)
  return f(*args, **kwds)


In [3]:
conn = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()
    
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)
    
    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # create a cursor
    curS17 = connS17.cursor()
    curS03 = connS03.cursor()
    
    curF17 = connF17.cursor()
    curF03 = connF03.cursor()

    # execute a statement
    print('PostgreSQL database version:')
    
    curS17.execute('SELECT version()')
    curS03.execute('SELECT version()')
    curF17.execute('SELECT version()')
    curF03.execute('SELECT version()')
    
    # display the PostgreSQL database server version
    db_version_curS17 = curS17.fetchone()
    db_version_curS03 = curS03.fetchone()
    db_version_curF17 = curF17.fetchone()
    db_version_curF03 = curF03.fetchone()
    
    print(db_version_curS17)
    print(db_version_curS03)
    print(db_version_curF17)
    print(db_version_curF03)

    # close the communication with the PostgreSQL
    curS17.close()
    curS03.close()
    curF17.close()
    curF03.close()

except (Exception, psycopg2.DatabaseError) as error:
    print(error)

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.9, compiled by Visual C++ build 1800, 64-bit',)


# Filtered Dataset

## URLs

In [4]:
urlsF03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF03 )
print("# of URLs filtered 03: %s" %len(urlsF03))

urlsF17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF17 )
print("# of URLs filtered 17: %s" %len(urlsF17))

urlsF = urlsF17.append(urlsF03)
print("# of URLs filtered: %s" %len(urlsF))

urlsF.head()

# of URLs filtered 03: 2411523
# of URLs filtered 17: 2107279
# of URLs filtered: 4518802


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988437985363406853,https://www.facebook.com/nadjashah/posts/10215947002527277,https://www.facebook.com/nadjashah/posts/10215947002527277,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
1,988437986797916160,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,200.0,https://www.facebook.com/,facebook.com,False,False,True,False
2,988437989712957440,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,200.0,http://www.faz.net/,faz.net,False,False,True,False
3,988437989704568832,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,200.0,http://vera-lengsfeld.de/,vera-lengsfeld.de,False,False,True,False
4,988437990732128256,https://ift.tt/2Jm4wnp,https://news.google.com/?sa=t&fd=R&ct2=de&usg=AFQjCNFSz3Yf_fDFIytLtnNe8JNGM2BPZg&clid=c3a7d30bb8a4878e06b80cf16b898331&ei=O_rdWqDIHNCT3QHCyJDQAg&url=https://www.waz.de/kultur/fuer-silke-j-raebiger-ist-es-das-letzte-frauen-filmfestival-id214100661.html&taa=1&hl=en-US&gl=US&ceid=US:en,200.0,https://news.google.com/,google.com,False,False,True,False


## Tweets

In [5]:
tweetsF03 = pd.read_sql_query("SELECT id, text, hashtags, user_mentions, is_retweet, is_reply, is_quote, user_id FROM tweets_info;", connF03 )
print("# of Tweets filtered 03: %s" %len(tweetsF03))

tweetsF17 = pd.read_sql_query("SELECT id, text, hashtags, user_mentions, is_retweet, is_reply, is_quote, user_id FROM tweets_info;", connF17 )
print("# of tweets filtered 17: %s" %len(tweetsF17))

tweetsF = tweetsF17.append(tweetsF03)
print("# of tweets filtered: %s" %len(tweetsF))

tweetsF.head()

# of Tweets filtered 03: 8010674
# of tweets filtered 17: 7269347
# of tweets filtered: 15280021


Unnamed: 0,id,text,hashtags,user_mentions,is_retweet,is_reply,is_quote,user_id
0,988416340770902016,@badespassbarbie Neeee! Finde ehr ich strahle Sicherheit &amp; Vertrauen aus.,,100456149.0,False,True,False,823740331
1,988175612497063936,"For sale -&gt; 2015 #Mercedes-BenzM-Class in #Perkasie, PA #carsforsale https://t.co/sGPYcKdETp",Mercedes Perkasie carsforsale,,False,False,False,227474620
2,988175612710998016,"@Tsukomia Ferienjob, Werkstudentin oder was auch immer bei dir zutreffen würde?",,2923003695.0,False,True,False,2735324418
3,988175612778024960,"Warum war sowas mal wieder gut, das war zwar eher eine Domain als Geschmacksverstärker",,,False,False,False,842397141251289088
4,988175612941611010,"Du hast neuerdings eine Satirepartei, oder wie?",,,False,False,False,831923785607630853


## Top URLs

In [20]:
top_100_urls = urlsF['resolved_url'].value_counts()[:100]

['https://www.dwd.de/',
 'https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US',
 'https://twitter.com/account/suspended',
 'https://sec.help.ch',
 'https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a',
 'http://www.the-sz.com/products/vbbinfo/?f=3',
 'https://www.ffd365.de',
 'http://www.flirt.21.com/',
 'https://www.linkedin.com/',
 'http://streaming.radionomy.com/Radio-Jodlerwirt1',
 'http://www.radio-jodlerwirt.de/',
 'https://www.radionomy.com/en/radio/erika1',
 'http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/',
 'http://www.messe.tv/',
 'http://www.the-sz.com/products/vbbinfo/?f=4',
 'https://www.youtube.com/user/videodeutschland/videos',
 'https://www.purzel-video.com/',
 'https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das',
 'https://www.miet-check.de/',
 'http://www.chatgate24.com/',
 'https://mobile.twitter.com/Markus_Soeder/status/988768341820170240',
 'https://www.facebook.com/CODECODE1111111111/',
 'http:

In [53]:
def getTweetsByURL(url, urls, tweets):
    return tweets[tweets['id'].isin(urls[urls['resolved_url'] == url]['tweet_id'])]

def getTweetVariation(df, is_retweet, is_quote, is_reply):
    return df[ (df['is_retweet'] == is_retweet) & (df['is_quote'] == is_quote) & (df['is_reply'] == is_reply)]

def getTweetsByURLList(url_list, urls, tweets):
    tweet_id_list = []
    for url_entry in url_list:
        tweet_id_list.extend(getTweetsByURL(url_entry, urls, tweets)['id'].values)
        #print(tweet_id_list)
    
    return tweets[tweets['id'].isin(tweet_id_list)]

In [56]:
len(getTweetsByURLList(["http://www.flirt.21.com/","https://sec.help.ch"], urlsF, tweetsF))

[988177404815429632, 988177416542711809, 988182433966239744, 988182442921119744, 988184952494837766, 988184963966259205, 988411444453658624, 988411456361246726, 988187467613392896, 988187476991922176, 988189981960212480, 988189992240500743, 988192503567470592, 988192515827404802, 988195019453620224, 988195031935791104, 988197532831543296, 988197541530488832, 988200051905359872, 988200064249159681, 988202567254585344, 988202576029118464, 988205085778939906, 988205097556561920, 988210119027683330, 988210130444484610, 988212637325496322, 988212648792739840, 988215150934151168, 988215162682343425, 988217669898862593, 988217685279412224, 988220183843139585, 988220193053794304, 988222712383500289, 988222701092458496, 988225217666396160, 988225229381087233, 988227733112131584, 988227741270052864, 988230248595681280, 988230258544476160, 988232764322451457, 988232773293965312, 988235282657914880, 988235291273170944, 988237802432270336, 988237814444720128, 988240317076967425, 988240325721370626,

[988177404815429632, 988177416542711809, 988182433966239744, 988182442921119744, 988184952494837766, 988184963966259205, 988411444453658624, 988411456361246726, 988187467613392896, 988187476991922176, 988189981960212480, 988189992240500743, 988192503567470592, 988192515827404802, 988195019453620224, 988195031935791104, 988197532831543296, 988197541530488832, 988200051905359872, 988200064249159681, 988202567254585344, 988202576029118464, 988205085778939906, 988205097556561920, 988210119027683330, 988210130444484610, 988212637325496322, 988212648792739840, 988215150934151168, 988215162682343425, 988217669898862593, 988217685279412224, 988220183843139585, 988220193053794304, 988222712383500289, 988222701092458496, 988225217666396160, 988225229381087233, 988227733112131584, 988227741270052864, 988230248595681280, 988230258544476160, 988232764322451457, 988232773293965312, 988235282657914880, 988235291273170944, 988237802432270336, 988237814444720128, 988240317076967425, 988240325721370626,

11964

In [18]:
len(top_100_urls['resolved_url'])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [15]:
spam_urls = ['https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US',
'http://www.flirt.21.com/',
'http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/',
'http://www.chatgate24.com/',
'https://www.facebook.com/CODECODE1111111111/',
'http://der-x-code.com/'
'http://tiny.cc/u2n9ly',
'http://www.gooni168.tv/sellpanties/item/slip-sklave/',
'http://tiny.cc/32n9ly',
'http://tiny.cc/13n9ly',
'http://tinyurl.co/',
'https://s1.biathlonmania.com/?lang=',
'http://tiny.cc/n3n9ly'
'https://tinyurl.com/',
'https://play.google.com/store/apps/details?id=com.fgcos.crossword_de_kreuzwortratsel',
'https://www.g5e.com/games/the_secret_society_hidden_mystery_ios',
'https://store.playstation.com/?resolve=EP9000-CUSA10345_00-DETROITDEMO00001?emcid=or-ph-110609',
'https://play.google.com/store/apps/details?id=com.seventeenbullets.android.island',
'https://apps.powerplaymanager.com/biathlonmania/g2061',
'http://www.gooni168.tv/sellpanties/item/clone-deine-pussy-busen-po/',
'http://www.dirtysabi.com',
'https://planetradio.co.uk/magic/',
'http://on4us.eu/up1sn/',
'http://stores.ebay.de/Graf-von-Eychendorf-Antiquitaten',
'https://www.plusidee.news/gratismailer-kostenlos-werbung-machen/',
'https://discordapp.com/invite/wEpAzeS',
'https://myporndate.de/',
'https://play.google.com/store/apps/details?id=com.g5e.secretsociety',
'https://cams.com/katie71?pid=g866508.subkatie71',
'https://pages.ebay.com/messages/page_not_found.html',
'http://besteferien.de/',
'http://homepage.mrslove.com/usa1.php?x=X-LITOVE-4-date',
'https://itunes.apple.com/de/app/farmville-2-raus-aufs-land/id824318267?mt=8&ign-mpt=uo%3D4',
'https://t.irtye.com/9hvpsdu5j4?offer_id=3664&aff_id=35395&bo=2779,2778,2777,2776,3391',
'https://tiny.cc/n3n9ly',
'https://backoffice.lionstradingclub.org/register/?ref=Swingtime21']


In [35]:
no_spam_urls = [r for r in top_100_urls.index if r not in spam_urls]
print(len(no_spam_urls))

68


https://www.dwd.de/
https://pi2.17bullets.com/tw_post.php?messageId=achievement&values=&locale=en_US
https://twitter.com/account/suspended
https://sec.help.ch
https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a
http://www.the-sz.com/products/vbbinfo/?f=3
https://www.ffd365.de
http://www.flirt.21.com/
https://www.linkedin.com/
http://streaming.radionomy.com/Radio-Jodlerwirt1
http://www.radio-jodlerwirt.de/
https://www.radionomy.com/en/radio/erika1
http://www.gooni168.tv/sellpanties/item/haushalts-sklave-in/
http://www.messe.tv/
http://www.the-sz.com/products/vbbinfo/?f=4
https://www.youtube.com/user/videodeutschland/videos
https://www.purzel-video.com/
https://www.emma-care.de/blog/teufelskralle-fuers-pferd-was-ist-das
https://www.miet-check.de/
http://www.chatgate24.com/
https://mobile.twitter.com/Markus_Soeder/status/988768341820170240
https://www.facebook.com/CODECODE1111111111/
http://der-x-code.com/
http://susanne-ulrike-maria-albrecht.over-blog.de/
https://mobile.twitte

68

In [30]:
len(spam_urls)

34

In [None]:
no_spam_tweets = getTweetsByURLList(no_spam_urls, urlsF, tweetsF)