# YouTube URL Analysis

In [23]:
import psycopg2
import pandas as pd
import config
from matplotlib import pylab as plt
import seaborn as sns
import numpy as np
from IPython.display import display, HTML
import re
from urllib.parse import parse_qs,urlparse

#color_palette = sns.color_palette(palette='muted', n_colors=None, desat=.75)
#sns.set(context='notebook', palette=color_palette, style='whitegrid', font='sans-serif', font_scale=1.5, color_codes=False, rc=None)
# Never truncate cell contents (the URL columns are long). `None` is the
# supported "no limit" value; the legacy `-1` sentinel has been deprecated
# since pandas 1.0, so it is only used as a fallback for older releases that
# do not accept `None`.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# CSS rules handed to `Styler.set_table_styles` for the comparison tables.
table_styles = [
    # Cells: minimum width and centered text.
    {'selector': 'td',
     'props': [('min-width', '100px'), ('text-align', 'center')]},
    # Rows: dotted separator line.
    {'selector': 'tr',
     'props': [('border-bottom', '1px dotted black')]},
    # Header cells: centered text.
    {'selector': 'th',
     'props': [('text-align', 'center')]},
]

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): neither name is used in the visible part of the notebook;
# presumably an output folder and a label for exported top lists - confirm
# against the later cells.
directory = "url_top_lists/"
stream = "comparison"

In [2]:
# Open one PostgreSQL connection per dataset database (S* = sampled,
# F* = filtered) and smoke-test each by asking the server for its version.
# The connections are intentionally left open: the query cells below read
# from them.
connS17 = connS03 = connF17 = connF03 = None
try:
    # read connection parameters
    paramsS17 = config.cfgAzureS17()
    paramsS03 = config.cfgAzureS03()

    paramsF17 = config.cfgAzureF17()
    paramsF03 = config.cfgAzureF03()

    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    connS17 = psycopg2.connect(**paramsS17)
    connS03 = psycopg2.connect(**paramsS03)

    connF17 = psycopg2.connect(**paramsF17)
    connF03 = psycopg2.connect(**paramsF03)

    # execute a statement
    print('PostgreSQL database version:')
    server_versions = []
    for connection in (connS17, connS03, connF17, connF03):
        # `with` closes the cursor even when the query raises.
        with connection.cursor() as cursor:
            cursor.execute('SELECT version()')
            server_versions.append(cursor.fetchone())

    (db_version_curS17, db_version_curS03,
     db_version_curF17, db_version_curF03) = server_versions

    # display the PostgreSQL database server version
    for server_version in server_versions:
        print(server_version)

except Exception as error:
    print(error)
    # Do not leave half-initialised state behind: close whatever was opened
    # and re-raise, so the failure is visible here instead of surfacing as a
    # NameError in a later cell.
    for connection in (connS17, connS03, connF17, connF03):
        if connection is not None:
            connection.close()
    raise

Connecting to the PostgreSQL database...
PostgreSQL database version:
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)
('PostgreSQL 9.6.7, compiled by Visual C++ build 1800, 64-bit',)


## Util Methods

In [3]:
def compareRows(row, df_to_compare, column_name):
    """Describe how the entry in ``row`` is placed in ``df_to_compare``.

    The entry is looked up in ``df_to_compare`` by its ``column_name`` value;
    the first match is used.

    Args:
        row: Mapping-like ranking row providing ``column_name``, ``'rank'``
            and ``'percentage'``.
        df_to_compare: Ranking DataFrame with the columns ``column_name``,
            ``'rank'`` and ``'percentage'``.
        column_name: Name of the column that identifies an entry.

    Returns:
        ``" - "`` when the entry does not occur in ``df_to_compare``.
        Otherwise a marker followed by
        ``"(<other rank>. / <other percentage>% / <percentage difference>%)"``:
        ``" = <br>"`` for equal ranks, ``" v <br>"`` when the other rank
        number is larger than the row's and ``" ^ <br>"`` when it is smaller.
    """
    matches = df_to_compare.loc[df_to_compare[column_name] == row[column_name]]
    if matches.empty:
        return " - "

    other_percentage = matches['percentage'].values[0]
    other_rank = matches['rank'].values[0]
    # Difference is "this row minus the other data set".
    details = "(%s. / %.3f%% / %.3f%%)" % (
        other_rank,
        other_percentage,
        row['percentage'] - other_percentage,
    )

    if other_rank == row['rank']:
        marker = " = <br>"
    elif other_rank > row['rank']:
        marker = " v <br>"
    else:
        marker = " ^ <br>"
    return marker + details

def getOpacity(val):
    """Map the percentage difference inside a comparison string to an opacity.

    ``val`` is expected to look like the strings built by ``compareRows``,
    e.g. ``" v <br>(2. / 40.000% / 10.000%)"``: the third ``"/"``-separated
    field carries the percentage difference. The smaller the absolute
    difference, the more opaque the result.

    Args:
        val: Comparison string with at least three ``"/"``-separated fields,
            the third of which contains a number.

    Returns:
        An opacity between 0.05 and 1. Differences of 80 or more (including
        100 and above, which previously fell through and returned ``None``)
        map to 0.05.

    Raises:
        IndexError: If ``val`` has fewer than three ``"/"``-separated fields
            or the third field contains no number.
    """
    # (exclusive upper bound of the absolute difference, opacity)
    steps = (
        (0.005, 1),
        (0.01, 0.95),
        (0.05, 0.8),
        (0.1, 0.7),
        (0.5, 0.6),
        (1, 0.5),
        (10, 0.3),
        (40, 0.2),
        (80, 0.1),
    )
    value = abs(float(re.findall(r"[-+]?\d*\.\d+|\d+", val.split("/")[2])[0]))
    for upper_bound, opacity in steps:
        if value < upper_bound:
            return opacity
    # Everything from 80 upwards gets the faintest colour. A difference such
    # as 99.9996 is formatted as "100.000", so values >= 100 do occur.
    return 0.05

def colorComparisonField(val):
    """Return the CSS background rule for one cell of a comparison table.

    Args:
        val: Cell value; only strings are ever coloured.

    Returns:
        * an orange ``rgba`` rule when ``val`` contains ``' ^ '`` or ``' v '``,
        * a green ``rgba`` rule when ``val`` contains ``' = '``,
        * a solid red rule when ``val`` is exactly ``' - '``,
        * ``''`` for everything else.
        The alpha channel of the ``rgba`` rules comes from ``getOpacity(val)``.
    """
    if not isinstance(val, str):
        return ''

    if any(marker in val for marker in (' ^ ', ' v ')):
        return 'background-color: rgba(246, 185, 59, %s)' % getOpacity(val)
    if ' = ' in val:
        return 'background-color: rgba(184, 233, 148, %s)' % getOpacity(val)
    # A three-character string containing ' - ' is exactly ' - '.
    if val == ' - ':
        return 'background-color: #e55039'
    return ''

def generateRankingDataframe(series, attribute_name):
    """Turn a series of counts into a ranking table.

    Ranks are assigned in the order in which ``series`` is iterated, so pass
    an already sorted series (e.g. the result of ``value_counts()``).

    Args:
        series: pandas Series mapping an attribute value (index) to its count.
        attribute_name: Name of the output column that holds the index labels.

    Returns:
        DataFrame with the columns ``'rank'`` (1-based position),
        ``attribute_name``, ``'value'`` (the count) and ``'percentage'``
        (the count's share of ``series.sum()`` in percent).
    """
    size = series.sum()
    # `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
    entries = list(series.items())

    data = {
        'rank': list(range(1, len(entries) + 1)),
        attribute_name: [label for label, _ in entries],
        'value': [count for _, count in entries],
        'percentage': [(count / size) * 100 for _, count in entries],
    }
    return pd.DataFrame(data=data)

def _compareTopRows(df, df_other, column_name, size):
    """Return the rows of ``df`` ranked ``<= size``, annotated against ``df_other``."""
    # Select the rows once, so the annotation list and the data columns always
    # describe the same rows (even if ``df`` is not sorted by rank).
    top_rows = df.loc[df['rank'] <= size]
    compared = top_rows[['rank', column_name, 'value', 'percentage']].copy()
    compared['difference (rank / percentage / diff)'] = [
        compareRows(row, df_other, column_name) for _, row in top_rows.iterrows()
    ]
    return compared.set_index('rank')


def generateComparisonDataframes(df1, df2, column_name, size):
    """Build the two top-``size`` tables that compare ``df1`` and ``df2``.

    Args:
        df1: Ranking DataFrame with the columns ``'rank'``, ``column_name``,
            ``'value'`` and ``'percentage'``.
        df2: Ranking DataFrame with the same columns as ``df1``.
        column_name: Name of the column that identifies an entry.
        size: Highest rank to include in the result tables.

    Returns:
        Tuple ``(df1_compared, df2_compared)``. Each frame is indexed by
        ``'rank'`` and holds ``column_name``, ``'value'``, ``'percentage'``
        and ``'difference (rank / percentage / diff)'``; the last column is
        the ``compareRows`` description of the entry in the other frame.
    """
    df1_compared = _compareTopRows(df1, df2, column_name, size)
    df2_compared = _compareTopRows(df2, df1, column_name, size)
    return df1_compared, df2_compared

def getPrettyComparisonDataframe(df, title):
    """Return a Styler that renders ``df`` as a coloured comparison table.

    Args:
        df: Comparison DataFrame to render.
        title: Caption shown with the table.

    Returns:
        The Styler of ``df`` with ``colorComparisonField`` applied to every
        cell, ``title`` as caption and the notebook's ``table_styles``.
    """
    return (
        df.style
        .applymap(colorComparisonField)
        .set_caption(title)
        .set_table_styles(table_styles)
    )

## Query URLs - Sampled

In [4]:
# Load the complete `tweets_urls` table from both sampled databases.
urlsS03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS03)
print("# of URLs sampled 03: %s" % len(urlsS03))

urlsS17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connS17)
print("# of URLs sampled 17: %s" % len(urlsS17))

# `DataFrame.append` was removed in pandas 2.0. `pd.concat` stacks the frames
# in the same order and, like `append`, keeps the original index labels.
urlsS = pd.concat([urlsS17, urlsS03])
print("# of URLs sampled: %s" % len(urlsS))

urlsS.head()

# of URLs sampled 03: 40339
# of URLs sampled 17: 35687
# of URLs sampled: 76026


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,989097603664138240,http://arte.tv/abgedreht,https://www.arte.tv/de/videos/RC-014033/abgedreht/,200,https://www.arte.tv/,arte.tv,False,False,True,False
1,988175933659021318,https://twitter.com/piersmorgan/status/987388203593322496,https://twitter.com/piersmorgan/status/987388203593322496,200,https://twitter.com/,twitter.com,True,False,True,False
2,988176164358361088,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,https://www.journal.koeln/pol-ham-fahrradfahrer-verletzt-sich-bei-sturz/,200,https://www.journal.koeln/,journal.koeln,False,False,True,False
3,988176403412766720,http://www.radionomy.com/erika1,https://www.radionomy.com/en/radio/erika1,200,https://www.radionomy.com/,radionomy.com,False,False,True,False
4,988176415995592704,https://twitter.com/JanLatten/status/988161845205913600,https://twitter.com/JanLatten/status/988161845205913600,200,https://twitter.com/,twitter.com,True,False,True,False


## Query URLs - Filtered

In [5]:
# Load the complete `tweets_urls` table from both filtered databases.
urlsF03 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF03)
print("# of URLs filtered 03: %s" % len(urlsF03))

urlsF17 = pd.read_sql_query("SELECT * FROM tweets_urls;", connF17)
print("# of URLs filtered 17: %s" % len(urlsF17))

# `DataFrame.append` was removed in pandas 2.0. `pd.concat` stacks the frames
# in the same order and, like `append`, keeps the original index labels.
urlsF = pd.concat([urlsF17, urlsF03])
print("# of URLs filtered: %s" % len(urlsF))

urlsF.head()

# of URLs sampled 03: 2411523
# of URLs sampled 17: 2107279
# of URLs sampled: 4518802


Unnamed: 0,tweet_id,short_url,resolved_url,response_code,domain,top_level_domain,is_twitter_url,is_media,is_processed,failed
0,988437985363406853,https://www.facebook.com/nadjashah/posts/10215947002527277,https://www.facebook.com/nadjashah/posts/10215947002527277,200,https://www.facebook.com/,facebook.com,False,False,True,False
1,988437986797916160,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,https://www.facebook.com/StageSchoolHamburg/posts/1934173349947359,200,https://www.facebook.com/,facebook.com,False,False,True,False
2,988437989712957440,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,http://www.faz.net/aktuell/rhein-main/frankfurt/frankfurter-gutleutviertel-landgericht-erlaubt-drohende-zwangsraeumung-15556433.html,200,http://www.faz.net/,faz.net,False,False,True,False
3,988437989704568832,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,http://vera-lengsfeld.de/2018/04/22/buergerprotest-ueberall-wie-lange-wird-das-noch-verschwiegen/,200,http://vera-lengsfeld.de/,vera-lengsfeld.de,False,False,True,False
4,988437990732128256,https://ift.tt/2Jm4wnp,https://news.google.com/?sa=t&fd=R&ct2=de&usg=AFQjCNFSz3Yf_fDFIytLtnNe8JNGM2BPZg&clid=c3a7d30bb8a4878e06b80cf16b898331&ei=O_rdWqDIHNCT3QHCyJDQAg&url=https://www.waz.de/kultur/fuer-silke-j-raebiger-ist-es-das-letzte-frauen-filmfestival-id214100661.html&taa=1&hl=en-US&gl=US&ceid=US:en,200,https://news.google.com/,google.com,False,False,True,False


## Merged URLs

In [8]:
# Stack the sampled rows on top of the filtered rows. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` gives the same frame (original index
# labels are kept, so labels are not unique in the result).
urls = pd.concat([urlsS, urlsF])

print("# of URLs merged: %s" % len(urls))

# of URLs merged: 4594828


## Dataframes

### YouTube URLs from merged Filtered and Sampled Datasets - Dataframe

In [14]:
# Keep only the rows whose `top_level_domain` is exactly youtube.com.
youtube_urls = urls.loc[urls['top_level_domain'] == 'youtube.com']
print("# of URLs in Dataframe: %s" % len(youtube_urls))

# of URLs in Dataframe: 445421


### Unique YouTube URLs from merged Filtered and Sampled Datasets - Array

In [17]:
# Distinct resolved URLs, in order of first appearance.
unique_youtube_urls = youtube_urls.loc[:, 'resolved_url'].unique()
print("# of unique URLs in Dataframe: %s" % len(unique_youtube_urls))

# of unique URLs in Dataframe: 277313


In [40]:
# Number of distinct resolved YouTube URLs.
len(youtube_urls.loc[:, 'resolved_url'].unique())

277313

## YouTube Link Types


In [42]:

# For every unique YouTube URL collect the first path segment
# (youtube.com/{path}/...) and the names of all query fields.
path_list = []
field_list = []

for url in unique_youtube_urls:
    parsed_url = urlparse(url)
    parsed_query = parse_qs(parsed_url.query)

    # "/watch" splits into ["", "watch"]. A URL without any path (e.g.
    # "https://www.youtube.com?gl=DE") splits into [""] only, so fall back to
    # "" instead of raising IndexError; "/" already yields "" as well.
    path_segments = parsed_url.path.split("/")
    path_list.append(path_segments[1] if len(path_segments) > 1 else "")

    # parse_qs drops fields with blank values by default
    # (keep_blank_values=False), so a bare "&a" is not counted.
    field_list.extend(parsed_query)
        

    

### Count by Path (youtube.com/{path}/ ... )

In [48]:
path_series = pd.Series(np.array(path_list))
# Count once and reuse the result for both the summary and the listing.
path_counts = path_series.value_counts()
print("# of different path types: %s" % len(path_counts))

print("\n\n#count", "#path\n", sep="\t")
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for path, count in path_counts.items():
    print(count, path, sep="\t")

# of different path types: 280


#count	#path

274210	watch
1361	channel
626	playlist
324	user
319	c
99	embed
47	results
19	add_contact
14	
4	page
3	whatsbroadcast
3	intl
2	ausgespielt
2	MotocrossPassionAndLifestyle
2	redirect
2	lpmitkev
2	bloodheadline
2	haitrol
2	denisbro
2	calypsolp
2	1fsvmainz05
2	FroschCrafterHD
2	derKr%C3%BCger
2	gratiscomictag
2	feed
2	LetsFugi
2	vi
2	timboxtv
1	JzudemD
1	flughund
1	fynix
1	coupondivas
1	pietsmiet
1	Denisbro
1	computergott
1	alpinfreunde
1	Domi
1	GameTasticalHD
1	kilimovie
1	ryvlaw
1	jonah98
1	rocketbeanstv
1	mdrjump
1	MaximNoise
1	FCAfkicken
1	fcbayern
1	KOSMO_official
1	techniklikeblog
1	tofmof
1	LiveGamingYouTube
1	KiElite
1	dsotalk
1	chiyogames
1	HobbykochGourmet
1	beckuplearning
1	mingolisch
1	Wolpertinger_2000
1	zusammengebaut
1	hyperboletv
1	letspatrick
1	oniondog
1	michelledy
1	learngermann
1	laurelkoeniger
1	kebautomation
1	dizzywiggle
1	TwixY
1	JulienBam
1	puntherline
1	watchv=X4cJ_mmtZZE
1	lereglement
1	users
1	aggrotv
1	ralfhtain
1	b

### Count by Query Fields (youtube.com/watch?{field}={parameter} )

In [49]:
field_series = pd.Series(np.array(field_list))
# Count once and reuse the result for both the summary and the listing.
field_counts = field_series.value_counts()
print("# of different query fields: %s" % len(field_counts))

print("\n\n#count", "#field\n", sep="\t")
# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for field, count in field_counts.items():
    print(count, field, sep="\t")

# of different query fields: 142


#count	#field

274212	v
243765	feature
2212	t
2041	list
1746	app
874	index
829	sns
765	utm_source
763	utm_medium
549	time_continue
516	utm_campaign
447	utm_content
153	view_as
57	attr_tag
56	sub_confirmation
55	ab_channel
54	lc
42	search_query
40	utm_sq
27	platform
26	disable_polymer
24	_lrsc
19	c
15	bpctr
13	pvmeta
13	view
12	linkId
12	has_verified
12	gl
12	hl
10	utm_term
9	flow
9	pbjreload
8	spfreload
7	q
7	autoplay
7	amp_network
7	rel
7	amp
7	via
7	start_radio
5	sort
5	hootPostID
5	params
4	shelf_id
4	rdm
4	vl
4	client
3	__prclt
3	pk_kwd
3	noapp
3	pk_campaign
3	html5
2	mode
2	wt_mc
2	frags
2	redir_token
2	lid
2	vidve
2	es_p
2	utm_postid
2	hd
2	bitly_hash
2	mc_cid
2	wt_zmc
2	itct
2	reload
2	event
2	ytbChannel
2	showinfo
2	dgc
2	cid
2	mc_eid
2	ocid
1	sf187534713
1	sf87701506
1	oref
1	adbpl
1	loop
1	persist_app
1	sf187948911
1	fmt
1	lipi
1	bkn
1	src
1	NR
1	_branch_match_id
1	sf187757715
1	sf87700781
1	version
1	jct
1	utcoffset
1	xing_share
1	sf8315558

## Top Links

In [51]:
# The 50 most frequently shared resolved URLs (value_counts sorts descending).
youtube_urls["resolved_url"].value_counts().iloc[:50]

https://www.youtube.com/watch?v=Dw-2hh6G_D8&feature=youtu.be&a    6691
https://www.youtube.com/user/videodeutschland/videos            2584
https://www.youtube.com/watch?v=6bTQkwftlf0&feature=youtu.be&a    1484
https://www.youtube.com/channel/UCXDyAGuwSxI4Y-X-N6RXqew/videos    965
https://www.youtube.com/watch?v=Xeqf_HHEMJk                     847
https://www.youtube.com/watch?v=JEVtiDeHKdc&feature=youtu.be&a    595
https://www.youtube.com/watch?v=XBcu5D7EI6g&feature=youtu.be    480
https://www.youtube.com/watch?v=zvKjfWSPI7s                     452
https://www.youtube.com/watch?v=fAYjSLtz6wQ&feature=youtu.be    389
https://www.youtube.com/channel/UCq93BR098LNtNk_TsYnl8ZQ?view_as=subscriber    383
https://www.youtube.com/?gl=DE                                  352
https://www.youtube.com/watch?v=xsarQ8O58YM&feature=youtu.be    291
https://www.youtube.com/watch?v=dOeURUWyhR8                     275
https://www.youtube.com/channel/UCVnvMpMVel0KMCqZM69XMbQ        266
https://www.youtube.c