In [None]:
import os
import gzip
import pickle
import logging
import functools as ft

import psycopg2
import community
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display
from tqdm.notebook import tqdm

import utils as ut

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [None]:
# Show long cell contents in full and render floats with three decimals.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Root handler: timestamp, level and message for INFO-and-above records.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

In [None]:
# Every data path below is relative, so the working directory must be the
# repository root. NOTE(review): this location is machine-specific; adjust it
# (or make it configurable) when running on another checkout.
os.chdir(os.path.expanduser('~/github/masthesis/'))

# Load data

## Twitter raw data

In [None]:
# Unfiltered user/show table. Later cells filter it on `show_id` and read its
# `user_id` and `show_name` columns.
user_show_orig = pd.read_csv('data/samples/twitter/user-show.csv')
display(user_show_orig.shape)

In [None]:
# Twitter user data table, indexed by `user_id`.
user_data = pd.read_csv('data/samples/twitter/user-data.csv', index_col='user_id')
display(user_data.shape)

## Follow community

In [None]:
# Community table keyed by show_id; the generic `community` column is renamed
# so it cannot be confused with the co-airing communities computed later.
follow_comm = (
    pd.read_csv('data/twitter/community-ideology-by-show.csv')
    .rename(columns={'community': 'follow_community'})
    .set_index('show_id')
)

## Twitter graphs

These are the quotient graphs under the equivalence relation of being affiliated with the same radio show. Users not affiliated with a show are not equivalent to any other user.

### User level

In [None]:
# Directed user-level follow graph built from the (source, target) columns;
# the third CSV column is read but not used.
# NOTE(review): passing `names=` makes pandas treat the first line as data —
# confirm the CSV has no header row.
fg_user = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-follow-graph.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

In [None]:
# Directed user-level mention graph built from the (source, target) columns;
# the third CSV column is read but not used.
mg_user = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-mention-graph.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

In [None]:
# Directed user-level retweet graph built from the (source, target) columns;
# the third CSV column is read but not used.
rg_user = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-retweet-graph.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

### Show level

In [None]:
# Directed show-level follow graph built from the (source, target) columns;
# the third CSV column is read but not used.
fg_show = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-follow-graph-by-show.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

In [None]:
# Directed show-level mention graph built from the (source, target) columns;
# the third CSV column is read but not used.
mg_show = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-mention-graph-by-show.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

In [None]:
# Directed show-level retweet graph built from the (source, target) columns;
# the third CSV column is read but not used.
rg_show = nx.from_pandas_edgelist(
    pd.read_csv('data/twitter/community/quotient-retweet-graph-by-show.csv',
                names=['source', 'target', 'edge_attr'])[['source', 'target']],
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

## Filter show_ids and user_ids

### Filter to the shows we're using in the main dataset

In [None]:
# Undirected show-level co-airing graph: the two show-id columns are renamed to
# the source/target names expected when a DataFrame is handed to nx.Graph.
cooc_data = pd.read_csv('data/samples/radio/cooccurrence.csv')
cooc_edges = cooc_data.rename(columns={'show_id1': 'source', 'show_id2': 'target'})
cg_show = nx.Graph(cooc_edges)

In [None]:
# Snippet table, indexed by snippet_id. It is only used here to find which
# shows are present and is deleted once the show list is built.
with gzip.open('data/paper-round-3/radio/paper-round-3-snippets-audio-keys.csv.gz', 'rt') as f:
    snippets_2019_2020 = pd.read_csv(f, index_col='snippet_id')

In [None]:
# Snippet -> show/station lookup, indexed by snippet_id.
with gzip.open('data/paper-round-3/radio/paper-round-3-snippets-show-station.csv.gz', 'rt') as f:
    snippets_2019_2020_show_station = pd.read_csv(f, index_col='snippet_id')
# Assignment aligns on the shared snippet_id index; any snippet missing from the
# lookup would get NaN, which the assertion below rules out.
snippets_2019_2020['show_id'] = snippets_2019_2020_show_station['show_id']
assert snippets_2019_2020['show_id'].notna().all()

In [None]:
# Parse both timestamp columns, then show how many snippets start in each year.
for col in ('start_dt', 'end_dt'):
    snippets_2019_2020[col] = pd.to_datetime(snippets_2019_2020[col])

snippets_2019_2020['start_dt'].dt.year.value_counts()

In [None]:
# shows_old: every show on either side of a co-occurrence row.
# shows_new: every show that has at least one snippet.
shows_old = pd.concat([cooc_data[col] for col in ('show_id1', 'show_id2')]).unique().tolist()
shows_new = snippets_2019_2020['show_id'].unique().tolist()

# Shows in the co-occurrence data without any snippet.
shows_missing = set(shows_old).difference(shows_new)
# The snippet shows must all be known to the co-occurrence data.
assert set(shows_new).issubset(shows_old)

In [None]:
del snippets_2019_2020, snippets_2019_2020_show_station

### Apply filters

In [None]:
# Keep only rows whose show appears in the snippet data.
user_show = user_show_orig[user_show_orig['show_id'].isin(shows_new)]
follow_comm = follow_comm[follow_comm.index.isin(shows_new)]

In [None]:
# show_id -> show_name lookup built from the distinct (show_id, show_name) pairs.
show_names = (
    user_show.loc[:, ['show_id', 'show_name']]
    .drop_duplicates()
    .sort_values('show_id')
    .set_index('show_id')
)

In [None]:
# Restrict each show-level graph to the shows in `shows_new`; `.copy()` turns
# the read-only subgraph view into an independent, mutable graph.
cg_show, fg_show, mg_show, rg_show = (
    g.subgraph(shows_new).copy()
    for g in (cg_show, fg_show, mg_show, rg_show)
)

In [None]:
# A row qualifies when its show is a node of both the show-level follow graph
# and the co-airing graph, and its user is a node of the user-level follow graph.
show_in_follow = user_show['show_id'].isin(fg_show)
show_in_coair = user_show['show_id'].isin(cg_show)
user_in_follow = user_show['user_id'].isin(fg_user)
mask = show_in_follow & show_in_coair & user_in_follow

radio_user_ids = user_show.loc[mask, 'user_id'].unique().tolist()
radio_show_ids = user_show.loc[mask, 'show_id'].unique().tolist()

In [None]:
# The "r" graphs are independent copies restricted to `radio_show_ids`.
cgr_show, fgr_show, mgr_show, rgr_show = (
    g.subgraph(radio_show_ids).copy()
    for g in (cg_show, fg_show, mg_show, rg_show)
)

## Drop self-loop edges and isolates

### Self-loops

Drop self-loops first, so that any node whose only edges are self-loops is then dropped as an isolate.

In [None]:
# Materialise the self-loops before removing them: the graph must not change
# while it is being iterated. The cell displays how many were dropped.
edges = [*nx.selfloop_edges(fgr_show)]
fgr_show.remove_edges_from(edges)

len(edges)

In [None]:
# Materialise the self-loops before removing them: the graph must not change
# while it is being iterated. The cell displays how many were dropped.
edges = [*nx.selfloop_edges(mgr_show)]
mgr_show.remove_edges_from(edges)

len(edges)

In [None]:
# Materialise the self-loops before removing them: the graph must not change
# while it is being iterated. The cell displays how many were dropped.
edges = [*nx.selfloop_edges(rgr_show)]
rgr_show.remove_edges_from(edges)

len(edges)

In [None]:
# Materialise the self-loops before removing them: the graph must not change
# while it is being iterated. The cell displays how many were dropped.
edges = [*nx.selfloop_edges(cgr_show)]
cgr_show.remove_edges_from(edges)

len(edges)

### Isolates

In [None]:
# Collect the degree-zero nodes first, then remove them; displays the count.
isolates = set(nx.isolates(fgr_show))
fgr_show.remove_nodes_from(isolates)

len(isolates)

In [None]:
# Collect the degree-zero nodes first, then remove them; displays the count.
isolates = set(nx.isolates(mgr_show))
mgr_show.remove_nodes_from(isolates)

len(isolates)

In [None]:
# Collect the degree-zero nodes first, then remove them; displays the count.
isolates = set(nx.isolates(rgr_show))
rgr_show.remove_nodes_from(isolates)

len(isolates)

In [None]:
# Collect the degree-zero nodes first, then remove them; displays the count.
isolates = set(nx.isolates(cgr_show))
cgr_show.remove_nodes_from(isolates)

len(isolates)

# Inspect

## Summary statistics

In [None]:
# Co-airing graph before the radio-show restriction: node count, edge count,
# and the number of unordered node pairs.
order, size = cg_show.order(), cg_show.size()

display(order, size, order * (order - 1) / 2)

In [None]:
# Restricted co-airing graph: node count, edge count, and the number of
# unordered node pairs.
order, size = cgr_show.order(), cgr_show.size()

display(order, size, order * (order - 1) / 2)

In [None]:
# Restricted follow graph: node count, edge count, and the number of unordered
# node pairs.
# NOTE(review): this graph is directed, so `size` counts directed edges while
# the last figure counts unordered pairs — confirm that comparison is intended.
order, size = fgr_show.order(), fgr_show.size()

display(order, size, order * (order - 1) / 2)

In [None]:
# Restricted mention graph: node count, edge count, and the number of unordered
# node pairs.
# NOTE(review): this graph is directed, so `size` counts directed edges while
# the last figure counts unordered pairs — confirm that comparison is intended.
order, size = mgr_show.order(), mgr_show.size()

display(order, size, order * (order - 1) / 2)

In [None]:
# Restricted retweet graph: node count, edge count, and the number of unordered
# node pairs.
# NOTE(review): this graph is directed, so `size` counts directed edges while
# the last figure counts unordered pairs — confirm that comparison is intended.
order, size = rgr_show.order(), rgr_show.size()

display(order, size, order * (order - 1) / 2)

## Compare all possible edges

In [None]:
# Build one row per unordered pair of radio shows (show_id1 < show_id2) and
# flag whether that dyad is an edge in each of the four show-level graphs.
tmp_sr = pd.Series(radio_show_ids, name='show_id')
# `to_undirected().edges` lists each undirected edge once, in arbitrary
# orientation, so every graph is joined twice below: once as (id1, id2) and
# once as (id2, id1).
tmp_cd = pd.DataFrame(cgr_show.to_undirected().edges, columns=['show_id1', 'show_id2'])
tmp_fd = pd.DataFrame(fgr_show.to_undirected().edges, columns=['show_id1', 'show_id2'])
tmp_md = pd.DataFrame(mgr_show.to_undirected().edges, columns=['show_id1', 'show_id2'])
tmp_rd = pd.DataFrame(rgr_show.to_undirected().edges, columns=['show_id1', 'show_id2'])

# FIX: the reverse-orientation joins (cdb/fdb/mdb/rdb) previously referenced the
# forward alias (cda/fda/mda/rda) in their ON clause, so they could never match
# and any edge stored as (larger id, smaller id) was reported as absent.
dat_dyad = pysqldf('''
with frame as
(
    select
        us1.show_id as show_id1,
        us2.show_id as show_id2
    from tmp_sr us1
        cross join tmp_sr us2
    where
        us1.show_id < us2.show_id
)
select
    fr.show_id1,
    fr.show_id2,

    (cda.show_id1 is not null or cdb.show_id1 is not null) as in_cg,
    (fda.show_id1 is not null or fdb.show_id1 is not null) as in_fg,
    (mda.show_id1 is not null or mdb.show_id1 is not null) as in_mg,
    (rda.show_id1 is not null or rdb.show_id1 is not null) as in_rg
from frame fr
    left join tmp_cd cda on cda.show_id1 = fr.show_id1 and cda.show_id2 = fr.show_id2
    left join tmp_cd cdb on cdb.show_id1 = fr.show_id2 and cdb.show_id2 = fr.show_id1

    left join tmp_fd fda on fda.show_id1 = fr.show_id1 and fda.show_id2 = fr.show_id2
    left join tmp_fd fdb on fdb.show_id1 = fr.show_id2 and fdb.show_id2 = fr.show_id1

    left join tmp_md mda on mda.show_id1 = fr.show_id1 and mda.show_id2 = fr.show_id2
    left join tmp_md mdb on mdb.show_id1 = fr.show_id2 and mdb.show_id2 = fr.show_id1

    left join tmp_rd rda on rda.show_id1 = fr.show_id1 and rda.show_id2 = fr.show_id2
    left join tmp_rd rdb on rdb.show_id1 = fr.show_id2 and rdb.show_id2 = fr.show_id1

''')

In [None]:
pd.crosstab(dat_dyad.in_cg, dat_dyad.in_fg)

In [None]:
pd.crosstab(dat_dyad.in_cg, dat_dyad.in_fg, normalize='index')

In [None]:
pd.crosstab(dat_dyad.in_cg, dat_dyad.in_fg, normalize='columns')

## Plot co-airing graph

In [None]:
# Connected components of the co-airing graph, and the size of each.
comps = list(nx.connected_components(cg_show))

pd.Series(list(map(len, comps)))

In [None]:
comps[1]

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# Index of the largest connected component (the first one on ties).
am = np.argmax(list(map(len, comps)))
largest_component = cg_show.subgraph(comps[am])

nx.draw_spring(largest_component, node_size=50, ax=ax)

## Compare co-airing and follow / mention graphs

### Degree distribution

In [None]:
def _degree_frame(graph):
    """Return one row per node of `graph` with columns `name` and `degree`."""
    return pd.DataFrame(list(graph.degree()), columns=['name', 'degree'])


# The directed Twitter graphs are made undirected before degrees are counted.
deg_fg = _degree_frame(fgr_show.to_undirected())
deg_mg = _degree_frame(mgr_show.to_undirected())
deg_cg = _degree_frame(cgr_show)

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(12, 5))
axes = axes.flatten()

# One boxplot per graph, all on the same y-range so the panels are comparable.
panels = [
    (deg_fg, 'Follow degree'),
    (deg_mg, 'Mention degree'),
    (deg_cg, 'Co-airing degree'),
]
for ax, (deg, title) in zip(axes, panels):
    deg.boxplot('degree', ax=ax)
    ax.set_title(title)
    ax.set_ylim(-2, 40)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
# Side-by-side degree distributions of the follow and co-airing graphs.
fig, axes = plt.subplots(1, 2, figsize=(8, 5))
axes = axes.flatten()

deg_fg.boxplot('degree', ax=axes[0])
deg_cg.boxplot('degree', ax=axes[1])

# FIX: the two titles were swapped relative to the data plotted on each axis
# (axes[0] shows the follow degrees, axes[1] the co-airing degrees).
axes[0].set_title('Follow degree')
axes[1].set_title('Co-airing degree')

# Shared y-range so the two panels are directly comparable.
axes[0].set_ylim(-2, 40)
axes[1].set_ylim(-2, 40)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
deg_fg['degree'].mean()

In [None]:
deg_mg['degree'].mean()

In [None]:
deg_cg['degree'].mean()

### Transitivity

In [None]:
nx.transitivity(fgr_show.to_undirected())

In [None]:
nx.transitivity(mgr_show.to_undirected())

In [None]:
nx.transitivity(cgr_show)

### Clustering coefficients

In [None]:
nx.average_clustering(fgr_show.to_undirected())

In [None]:
nx.average_clustering(mgr_show.to_undirected())

In [None]:
nx.average_clustering(cgr_show)

### Components

In [None]:
nx.is_connected(fgr_show.to_undirected())

In [None]:
nx.is_connected(mgr_show.to_undirected())

In [None]:
nx.is_connected(cgr_show.to_undirected())

# Individual-level measures

In [None]:
# One row per `user_show` row whose user is in `radio_user_ids` (user_ids are
# not de-duplicated here), keeping the original index.
dat_indiv_user = pd.DataFrame({
    'user_id': user_show.loc[user_show['user_id'].isin(radio_user_ids), 'user_id'],
})

# Node-level measures on the three user-level graphs. Users absent from a graph
# get NaN from `.map`.
user_measures = [
    ('follow_indegree', fg_user.in_degree()),
    ('mention_indegree', mg_user.in_degree()),
    ('retweet_indegree', rg_user.in_degree()),
    ('follow_outdegree', fg_user.out_degree()),
    ('mention_outdegree', mg_user.out_degree()),
    ('retweet_outdegree', rg_user.out_degree()),
    ('follow_pagerank', nx.pagerank(fg_user)),
    ('mention_pagerank', nx.pagerank(mg_user)),
    ('retweet_pagerank', nx.pagerank(rg_user)),
    ('follow_clustering', nx.clustering(fg_user)),
    ('mention_clustering', nx.clustering(mg_user)),
    ('retweet_clustering', nx.clustering(rg_user)),
]
for column, values in user_measures:
    dat_indiv_user[column] = dat_indiv_user['user_id'].map(dict(values))

In [None]:
# Attach each user's show, then add that show's co-airing graph measures.
# NOTE(review): neither side of this merge is de-duplicated on user_id, so a
# user appearing k times in `user_show` contributes k*k rows — confirm each user
# maps to exactly one show.
dat_indiv_show = pd.merge(
    dat_indiv_user,
    user_show[['user_id', 'show_id']],
    how='inner',
    on='user_id',
)

show_measures = [
    ('cooccurrence_degree', cg_show.degree()),
    ('cooccurrence_pagerank', nx.pagerank(cg_show)),
    ('cooccurrence_clustering', nx.clustering(cg_show)),
]
for column, values in show_measures:
    dat_indiv_show[column] = dat_indiv_show['show_id'].map(dict(values))

dat_indiv_show = dat_indiv_show.set_index('show_id')

display(dat_indiv_show.shape)

In [None]:
dat_indiv_show.corr()[['cooccurrence_degree', 'cooccurrence_pagerank', 'cooccurrence_clustering']]

# Community

This section compares Louvain communities in the co-airing graph with the previously computed Louvain communities in the follow graph (ignoring edge directions).

In [None]:
comms = follow_comm.copy()

# FIX: Louvain is stochastic, and the crosstab cells below hard-code community
# labels (`== 3`, `== 0`). Without a fixed RNG those labels can refer to
# different communities on every run, so pin the random state.
# NOTE(review): the hard-coded labels were presumably chosen from an unseeded
# run — re-check them against the seeded partition.
louvain_seed = 13893
qpartition = community.best_partition(cg_show, random_state=louvain_seed)
qpartition = pd.Series(qpartition, name='community')

# Index-aligned on show_id; shows absent from the co-airing graph get NaN and
# are dropped on the next line.
comms['cooccurrence_community'] = qpartition
comms = comms.loc[~comms.cooccurrence_community.isna(), :]

display(comms.shape)

In [None]:
pd.crosstab(comms.follow_community, comms.cooccurrence_community)

In [None]:
pd.crosstab(comms.mention_community, comms.cooccurrence_community)

In [None]:
pd.crosstab(comms.follow_community == 3, comms.cooccurrence_community == 0)

In [None]:
pd.crosstab(comms.mention_community == 2, comms.cooccurrence_community == 0)

# Graph distances

## Calculate configuration model baseline

In [None]:
seed = 13893224121

# Random baselines that reproduce the degree sequence of the co-airing graph
# (g1) and of the undirected follow graph (g2).
# NOTE(review): both models use the same seed, and their nodes are not the
# original show ids — confirm that is what the baseline comparison intends.
g1 = nx.configuration_model([deg for _, deg in cgr_show.degree()], seed=seed)
g2 = nx.configuration_model([deg for _, deg in fgr_show.to_undirected().degree()], seed=seed)

# Store every node's own label under `show_id` so that the edit-distance
# `node_match` callback can compare nodes across graphs.
for graph in (g1, g2, cgr_show, fgr_show, mgr_show, rgr_show):
    nx.set_node_attributes(graph, {node: node for node in graph.nodes}, 'show_id')

## Approximate edit distance

In [None]:
timeout = 30


def node_match(attrs_a, attrs_b):
    """Treat two nodes as equal when their `show_id` attributes are equal."""
    return attrs_a['show_id'] == attrs_b['show_id']


# Each entry is (left graph, right graph); every comparison stops after
# `timeout` seconds, so the reported distances are approximations.
graph_pairs = {
    'random': (g1, g2),
    'follow_directed': (fgr_show, cgr_show),
    'follow_undirected': (fgr_show.to_undirected(), cgr_show),
    'mention_directed': (mgr_show, cgr_show),
    'mention_undirected': (mgr_show.to_undirected(), cgr_show),
}

edit_dists = {
    label: nx.graph_edit_distance(left, right, node_match=node_match, timeout=timeout)
    for label, (left, right) in graph_pairs.items()
}

edit_dists

## Simrank similarities

In [None]:
def _simrank_long(graph):
    """Return all-pairs SimRank of `graph` in long form.

    The result has one row per ordered node pair, with columns `show_id2`,
    `show_id1` and `value`.
    """
    wide = pd.DataFrame.from_records(nx.simrank_similarity(graph))
    return (
        pd.melt(wide, ignore_index=False)
        .reset_index()
        .rename(columns={'variable': 'show_id1', 'index': 'show_id2'})
    )


# Observed graphs (the Twitter graphs are made undirected first) ...
cs = _simrank_long(cgr_show)
fs = _simrank_long(fgr_show.to_undirected())
ms = _simrank_long(mgr_show.to_undirected())

# ... and the two configuration-model baselines.
g1s = _simrank_long(g1)
g2s = _simrank_long(g2)

### Random graph baseline

In [None]:
# Pair up the two baselines' similarities and keep one row per unordered pair
# (this also drops the self-similarity diagonal).
dat = (
    g2s.merge(g1s, how='inner', on=['show_id1', 'show_id2'])
    .rename(columns={'value_x': 'g2', 'value_y': 'g1'})
)
dat = dat[dat['show_id1'] > dat['show_id2']]

In [None]:
dat[['g1', 'g2']].corr().loc['g1', 'g2']

In [None]:
(dat['g1'] - dat['g2']).hist(bins=50)

In [None]:
fig, ax = plt.subplots()
ax.scatter(dat['g1'], dat['g2'], s=5, alpha=0.75, c='navy', lw=0.25)

### Simrank similarities: follow

In [None]:
# Pair up follow and co-airing similarities and keep one row per unordered pair
# (this also drops the self-similarity diagonal).
dat = (
    fs.merge(cs, how='inner', on=['show_id1', 'show_id2'])
    .rename(columns={'value_x': 'fs', 'value_y': 'cs'})
)
dat = dat[dat['show_id1'] > dat['show_id2']]

In [None]:
dat[['fs', 'cs']].corr().loc['fs', 'cs']

In [None]:
(dat['fs'] - dat['cs']).hist(bins=50)

In [None]:
fig, ax = plt.subplots()
ax.scatter(dat['fs'], dat['cs'], s=5, alpha=0.75, c='navy', lw=0.25)

In [None]:
# Random sample of up to 20 show pairs whose co-airing SimRank is below 0.15,
# with show names attached. Both name columns come back as `show_name`.
sqldf('''
select
    sn1.show_name,
    sn2.show_name,
    dat.fs,
    dat.cs
from dat
    inner join show_names sn1 on sn1.show_id = dat.show_id1
    inner join show_names sn2 on sn2.show_id = dat.show_id2
where
    dat.cs < 0.15
order by random()
limit 20;
''')

In [None]:
# Random sample of up to 20 show pairs whose co-airing SimRank is above 0.15,
# with show names attached. Both name columns come back as `show_name`.
sqldf('''
select
    sn1.show_name,
    sn2.show_name,
    dat.fs,
    dat.cs
from dat
    inner join show_names sn1 on sn1.show_id = dat.show_id1
    inner join show_names sn2 on sn2.show_id = dat.show_id2
where
    dat.cs > 0.15
order by random()
limit 20;
''')

### Simrank similarities: mention

In [None]:
# Pair up mention and co-airing similarities and keep one row per unordered
# pair (this also drops the self-similarity diagonal).
dat = (
    ms.merge(cs, how='inner', on=['show_id1', 'show_id2'])
    .rename(columns={'value_x': 'ms', 'value_y': 'cs'})
)
dat = dat[dat['show_id1'] > dat['show_id2']]

In [None]:
dat[['ms', 'cs']].corr().loc['ms', 'cs']

In [None]:
(dat['ms'] - dat['cs']).hist(bins=50)

In [None]:
fig, ax = plt.subplots()
ax.scatter(dat['ms'], dat['cs'], s=5, alpha=0.75, c='navy', lw=0.25)

In [None]:
# Random sample of up to 20 show pairs whose co-airing SimRank is below 0.15,
# with show names attached. Both name columns come back as `show_name`.
sqldf('''
select
    sn1.show_name,
    sn2.show_name,
    dat.ms,
    dat.cs
from dat
    inner join show_names sn1 on sn1.show_id = dat.show_id1
    inner join show_names sn2 on sn2.show_id = dat.show_id2
where
    dat.cs < 0.15
order by random()
limit 20;
''')

In [None]:
# Random sample of up to 20 show pairs whose co-airing SimRank is above 0.15,
# with show names attached. Both name columns come back as `show_name`.
sqldf('''
select
    sn1.show_name,
    sn2.show_name,
    dat.ms,
    dat.cs
from dat
    inner join show_names sn1 on sn1.show_id = dat.show_id1
    inner join show_names sn2 on sn2.show_id = dat.show_id2
where
    dat.cs > 0.15
order by random()
limit 20;
''')

## Shortest path distances

In [None]:
def _pairwise_dists(graph):
    """Return shortest-path lengths between reachable node pairs of `graph`.

    The result has columns `show_id1`, `show_id2` and `dist`, restricted to
    rows with `show_id1 > show_id2`, i.e. one row per unordered pair and no
    zero-length self distances.

    The records are flattened with a single comprehension rather than repeated
    list concatenation through `functools.reduce`: that avoids quadratic
    copying, and it also returns an empty frame for a graph without nodes
    (reduce without an initial value raises TypeError on an empty sequence).
    """
    records = [
        (source, target, distance)
        for source, targets in nx.shortest_path_length(graph)
        for target, distance in targets.items()
    ]
    dists = pd.DataFrame.from_records(records, columns=['show_id1', 'show_id2', 'dist'])
    return dists.loc[dists.show_id1 > dists.show_id2, :]


# Configuration-model baselines.
g1_dists = _pairwise_dists(g1)
g2_dists = _pairwise_dists(g2)

# Observed graphs; the directed Twitter graphs are made undirected first.
cgr_dists = _pairwise_dists(cgr_show)
fgr_dists = _pairwise_dists(fgr_show.to_undirected())
mgr_dists = _pairwise_dists(mgr_show.to_undirected())

### Random baseline

In [None]:
# Pairs that have a finite shortest-path distance in both baseline graphs.
dat = (
    g1_dists.merge(g2_dists, how='inner', on=['show_id1', 'show_id2'])
    .rename(columns={'dist_x': 'g1', 'dist_y': 'g2'})
)

In [None]:
dat.corr().loc['g1', 'g2']

In [None]:
pd.crosstab(dat['g1'], dat['g2'], normalize=1)

### Follow

In [None]:
# Pairs that have a finite shortest-path distance in both the follow and the
# co-airing graph.
dat = (
    fgr_dists.merge(cgr_dists, on=['show_id1', 'show_id2'])
    .rename(columns={'dist_x': 'fgr', 'dist_y': 'cgr'})
)

In [None]:
dat.corr().loc['fgr', 'cgr']

In [None]:
pd.crosstab(dat['fgr'], dat['cgr'], normalize=1)

### Mention

In [None]:
# Pairs that have a finite shortest-path distance in both the mention and the
# co-airing graph.
dat = (
    mgr_dists.merge(cgr_dists, on=['show_id1', 'show_id2'])
    .rename(columns={'dist_x': 'mgr', 'dist_y': 'cgr'})
)

In [None]:
dat.corr().loc['mgr', 'cgr']

In [None]:
pd.crosstab(dat['mgr'], dat['cgr'], normalize=1)

# Collect stats / plots used in paper

In [None]:
cg_show.order(), cg_show.size()

In [None]:
[len(c) for c in list(nx.connected_components(cg_show))]

In [None]:
user_show_orig['show_id'].nunique(), user_show_orig['user_id'].nunique()

In [None]:
user_show['show_id'].nunique(), user_show['user_id'].nunique()

In [None]:
def _graph_summary(graph):
    """Return the five summary statistics reported in the table for `graph`."""
    degrees = pd.DataFrame(list(graph.degree()), columns=['name', 'degree'])
    return {
        'order': graph.order(),
        'size': graph.size(),
        'avg_degree': degrees['degree'].mean(),
        'transitivity': nx.transitivity(graph),
        'avg_clustering_coef': nx.average_clustering(graph),
    }


# The directed Twitter graphs are summarised in their undirected form.
summary_graphs = [
    ('follow', fgr_show.to_undirected()),
    ('mention', mgr_show.to_undirected()),
    ('coairing', cgr_show),
]
stats = {
    (name, statistic): value
    for name, graph in summary_graphs
    for statistic, value in _graph_summary(graph).items()
}

# Long (graph, statistic) -> value series reshaped to statistics x graphs, with
# the rows in presentation order.
stats = (
    pd.DataFrame(pd.Series(stats))
    .reset_index()
    .rename(columns={'level_0': 'graph', 'level_1': 'statistic', 0: 'value'})
    .set_index(['graph', 'statistic'])
    .unstack(0)
    .loc[['order', 'size', 'avg_degree', 'transitivity', 'avg_clustering_coef']]
)

stats = stats.rename(index={
    'order': r'Order (\# nodes)',
    'size': r'Size (\# edges)',
    'avg_degree': 'Average degree',
    'transitivity': 'Transitivity',
    'avg_clustering_coef': 'Avg. Clust. Coef.',
})

# Drop the leftover 'value' column level and title-case the graph names.
stats.columns = [column[1].title() for column in stats.columns]
stats.index.name = stats.index.name.title()

# NOTE(review): the caption mentions only the follow and coairing graphs, and
# hard-codes isolate counts, while the table also has a mention column —
# confirm the wording and the numbers against the current data.
kwargs = {
    'environment': 'table',
    'label': 'tab:si-graph-metrics-follow-vs-coairing',
    'position_float': 'centering',
    'column_format': 'lccc',

    'caption': r'''
    Selected summary statistics of the follow and coairing graphs for the set of Twitter-matched shows, demonstrating a substantial degree of similarity. Respectively 10 and 9 shows out of 67 have been excluded from the follow and coairing graph statistics shown here because they were isolates.
    '''.strip(),

    'hrules': True,
}

print(stats.style.format(precision=2).to_latex(**kwargs))

In [None]:
# Pairwise SimRank tables restricted to one row per unordered pair
# (show_id1 > show_id2), which also drops the self-similarity diagonal.
df_baseline = g2s.merge(g1s, how='inner', on=['show_id1', 'show_id2']).rename({'value_x': 'g2', 'value_y': 'g1'}, axis=1)
df_baseline = df_baseline.loc[df_baseline['show_id1'] > df_baseline['show_id2'], :]

df_follow = fs.merge(cs, how='inner', on=['show_id1', 'show_id2']).rename({'value_x': 'fs', 'value_y': 'cs'}, axis=1)
df_follow = df_follow.loc[df_follow['show_id1'] > df_follow['show_id2'], :]

df_mention = ms.merge(cs, how='inner', on=['show_id1', 'show_id2']).rename({'value_x': 'ms', 'value_y': 'cs'}, axis=1)
df_mention = df_mention.loc[df_mention['show_id1'] > df_mention['show_id2'], :]

# Shared styling so the three panels are directly comparable.
lineticks = np.linspace(0.12, 0.3, 1500)
textbox_props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ylim = (0.05, 0.35)
xlim = (0.12, 0.32)


def _plot_simrank_panel(ax, pairs, xcol, ycol, title, xlabel, ylabel):
    """Draw one scatter panel comparing two SimRank columns of `pairs`.

    Adds a least-squares line and a text box with r, r^2 and the number of
    distinct shows behind the plotted pairs.
    """
    # FIX: the count previously combined `show_id1` with itself. Because the
    # pairs are filtered to show_id1 > show_id2, shows that only ever appear as
    # `show_id2` (at least the smallest id) were left out of n.
    n_shows = len(set(pairs['show_id1']) | set(pairs['show_id2']))

    x, y = pairs[xcol], pairs[ycol]
    ax.scatter(x, y, s=5, alpha=0.75, c='navy', lw=0.25)

    fit = np.poly1d(np.polyfit(x, y, 1))
    ax.plot(lineticks, fit(lineticks))  # best-fit line

    r = x.corr(y)
    r2 = r ** 2
    txt = f'$r^2 = {r2.round(3)}$\n$r = {r.round(3)}$\nn = {n_shows} shows'
    ax.text(0.68, 0.18, txt, transform=ax.transAxes, fontsize=13,
            verticalalignment='top', bbox=textbox_props)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', labelrotation=45)

    ax.set_ylim(*ylim)
    ax.set_xlim(*xlim)


fig, axes = plt.subplots(1, 3, constrained_layout=True, figsize=(15, 5))

### Follow
_plot_simrank_panel(axes[0], df_follow, 'fs', 'cs',
                    'Follow vs Coairing', 'SimRank: Follow', 'SimRank: Coairing')

### Mention
_plot_simrank_panel(axes[1], df_mention, 'ms', 'cs',
                    'Mention vs Coairing', 'SimRank: Mention', 'SimRank: Coairing')

### Baseline
_plot_simrank_panel(axes[2], df_baseline, 'g1', 'g2',
                    'Baseline: Configuration Models', 'SimRank: Graph 1', 'SimRank: Graph 2')