In [None]:
import os
import logging

import psycopg2
import community
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display

from tqdm.notebook import tqdm

from nltk.corpus import stopwords as sw

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query `q` through `sqldf` against this notebook's globals."""
    return sqldf(q, globals())

import utils as ut

In [None]:
# Notebook-wide logging: INFO level, "timestamp : level : message" records.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

In [None]:
# Show up to 100 rows and never truncate long text columns when displaying frames.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [None]:
# Work from the project root so the relative `data/...` paths below resolve.
# The MASTHESIS_DIR environment variable overrides the default checkout
# location, so the notebook also runs on machines with a different layout.
os.chdir(os.path.expanduser(os.environ.get('MASTHESIS_DIR', '~/github/masthesis/')))

In [None]:
# Make sure the output directory for the files written at the end of the
# notebook exists; `exist_ok=True` keeps re-runs from failing.
os.makedirs('data/twitter/community/', exist_ok=True)

# Load data

### User profile data

In [None]:
# User profile table, indexed by `user_id`.
user_data = pd.read_csv(
    'data/samples/twitter/user-data.csv',
    index_col='user_id',
)
display(user_data.shape)

In [None]:
# User-to-show mapping (the columns used below are `user_id`, `show_id`, `show_name`).
user_show = pd.read_csv(
    'data/samples/twitter/user-show.csv',
)
display(user_show.shape)

### Follow graph

In [None]:
# Edge list of follow relationships, with the id columns renamed to the
# `source` / `target` names used when building the graph.
follow_graph = (
    pd.read_csv('data/samples/twitter/follow-graph.csv')
    .rename({'source_user_id': 'source', 'target_user_id': 'target'}, axis=1)
)

# Directed follow graph; no edge attributes are carried over.
fg = nx.from_pandas_edgelist(
    follow_graph,
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

display(follow_graph.shape)

In [None]:
# First few edges of the follow graph.
follow_graph.head(5)

In [None]:
# Number of nodes (users) in the follow graph.
fg.number_of_nodes()

In [None]:
# Number of edges (follow relationships) in the follow graph.
fg.number_of_edges()

# Confirm connectivity

In [None]:
# Connected components when edge direction is ignored.
ucomps = list(nx.connected_components(fg.to_undirected()))
display(len(ucomps))

In [None]:
# Strongly connected components of the directed follow graph.
scomps = list(nx.strongly_connected_components(fg))
display(len(scomps))

In [None]:
# How many strongly connected components there are of each size.
pd.Series(list(map(len, scomps))).value_counts()

# Community detection + centrality

## Graph as-is

In [None]:
# Louvain community detection on the undirected version of the follow graph.
# The seed is pinned so that Restart & Run All reproduces the same partition
# (and therefore the same community ids in the files written below).
partition = community.best_partition(fg.to_undirected(), random_state=42)
partition = pd.Series(partition, name='community')

# Attach profile fields; the inner join keeps only users found in `user_data`.
profile_cols = ['screen_name', 'name', 'description', 'location', 'is_radio']
comms = partition.to_frame().merge(user_data[profile_cols], how='inner',
                                   left_index=True, right_index=True)
comms.index.name = 'user_id'

In [None]:
# Number of users assigned to each community.
comms['community'].value_counts()

In [None]:
# Community sizes restricted to rows flagged `is_radio`.
comms.loc[comms['is_radio'], 'community'].value_counts()

In [None]:
# Up to 10 randomly chosen users per community, for eyeballing the partition.
# The shuffle is seeded so the displayed sample is stable across re-runs.
comms.sample(frac=1, random_state=42).groupby('community').head(10).sort_values('community')

## Quotient graph collapsing shows

The built-in `nx.quotient_graph()` function is very slow on this graph, so the cells below collapse each show's accounts into a single node by hand.

In [None]:
# Start the quotient graph from an undirected version of the follow graph.
# `to_undirected()` already returns a new, independent graph, so the extra
# `.copy()` was redundant; `fg` is still untouched by the edits made to `qg`.
qg = fg.to_undirected()

In [None]:
# Collapse every show's accounts into one representative node of `qg`.
# Grouping once replaces a full scan of `user_show` per show; groups are
# visited in order of first appearance (`sort=False`), matching `unique()`.
for _, show_users in user_show.groupby('show_id', sort=False)['user_id']:
    members = set(show_users)

    # An arbitrary member stands in for the whole show.
    rep = next(iter(members))

    for n in members - {rep}:
        if n not in qg:  # account has no edges in the follow graph
            continue

        # Re-attach n's neighbours to the representative. Snapshot the
        # neighbour list first because the graph is edited inside the loop.
        for nbr in list(qg.neighbors(n)):
            if nbr != rep:  # skipping this avoids a self-loop on rep
                qg.add_edge(rep, nbr)

        qg.remove_node(n)

In [None]:
# Louvain community detection on the quotient graph (shows collapsed to one
# node each). The seed is pinned so that Restart & Run All reproduces the
# same partition and community ids.
qpartition = community.best_partition(qg, random_state=42)
qpartition = pd.Series(qpartition, name='community')

# Attach profile fields; the inner join keeps only users found in `user_data`.
profile_cols = ['screen_name', 'name', 'description', 'location', 'is_radio']
qcomms = qpartition.to_frame().merge(user_data[profile_cols], how='inner',
                                     left_index=True, right_index=True)
qcomms.index.name = 'user_id'

In [None]:
# Number of quotient-graph nodes assigned to each community.
qcomms['community'].value_counts()

In [None]:
# Quotient-graph community sizes restricted to rows flagged `is_radio`.
qcomms.loc[qcomms['is_radio'], 'community'].value_counts()

In [None]:
# Up to 10 randomly chosen nodes per quotient-graph community.
# The shuffle is seeded so the displayed sample is stable across re-runs.
qcomms.sample(frac=1, random_state=42).groupby('community').head(10).sort_values('community')

## Betweenness centrality

In [None]:
# Betweenness centrality of every node in the directed follow graph, stored
# as a one-column frame (`centrality`) indexed by `user_id`.
bc = pd.Series(nx.betweenness_centrality(fg), name='centrality')
bc = bc.rename_axis('user_id').to_frame()

In [None]:
# Betweenness centrality of every node in the quotient graph, stored as a
# one-column frame (`centrality`) indexed by `user_id`.
qbc = pd.Series(nx.betweenness_centrality(qg), name='centrality')
qbc = qbc.rename_axis('user_id').to_frame()

# Match to shows

## Graph as-is

In [None]:
# Keep only users that belong to a show: join the community table (indexed
# by user id) to the `user_id` column of `user_show`.
rcomms = comms.merge(
    user_show,
    left_index=True,
    right_on='user_id',
    how='inner',
)

In [None]:
# Community sizes among show-affiliated users.
rcomms['community'].value_counts()

In [None]:
# Shows whose accounts were split across more than one community.
comms_per_show = rcomms.groupby('show_id')['community'].nunique().sort_values()
dupes = comms_per_show[comms_per_show > 1].index.tolist()

in_split_show = rcomms['show_id'].isin(dupes)
rcomms.loc[in_split_show, ['show_id', 'show_name']].drop_duplicates()

In [None]:
# Community of each show, one row per show. Shows listed in `dupes` (accounts
# spread over several communities) are excluded, so every remaining show has
# a single community and its per-account rows collapse to one after
# de-duplication. Setting the index directly also drops the old merge index
# without depending on its name.
show_community = (
    rcomms.loc[~rcomms.show_id.isin(dupes), ['show_id', 'community']]
    .drop_duplicates()
    .set_index('show_id')
    .community
)

In [None]:
# Distribution of the per-show community assignments.
pd.Series.value_counts(show_community)

## Quotient graph

In [None]:
# Keep only quotient-graph nodes that belong to a show: join the community
# table (indexed by user id) to the `user_id` column of `user_show`.
rqcomms = qcomms.merge(
    user_show,
    left_index=True,
    right_on='user_id',
    how='inner',
)

In [None]:
# Community sizes among show-affiliated nodes of the quotient graph.
rqcomms['community'].value_counts()

In [None]:
# Shows that still map to more than one community in the quotient graph.
dupes = rqcomms.groupby('show_id').community.nunique().sort_values()
dupes = dupes[dupes > 1].index.tolist()

# Display from `rqcomms`, the frame `dupes` was computed from. This cell
# previously looked the shows up in the as-is `rcomms` table by mistake.
rqcomms.loc[rqcomms.show_id.isin(dupes), ['show_id', 'show_name']].drop_duplicates()

In [None]:
# Quotient-graph community of each show, one row per show. Shows listed in
# `dupes` are excluded, so every remaining show has a single community and
# any repeated rows collapse to one after de-duplication. Setting the index
# directly also drops the old merge index without depending on its name.
qshow_community = (
    rqcomms.loc[~rqcomms.show_id.isin(dupes), ['show_id', 'community']]
    .drop_duplicates()
    .set_index('show_id')
    .community
)

In [None]:
# Distribution of the per-show community assignments in the quotient graph.
pd.Series.value_counts(qshow_community)

## Betweenness centrality

In [None]:
# Quotient-graph betweenness centrality keyed by show: join on user id, then
# keep only the show id (as index) and the centrality score.
bc_show = (
    qbc.merge(user_show, how='inner', right_on='user_id', left_index=True)
    .loc[:, ['show_id', 'centrality']]
    .set_index('show_id')
)

bc_show.head()

In [None]:
# Histogram of per-show betweenness centrality.
bc_show['centrality'].hist(bins=50)

# Write out data

In [None]:
# Persist the quotient graph as a comma-delimited edge list.
# NOTE(review): with the default `data=True`, each line presumably ends with
# the edge-attribute dict (`{}` here, since no attributes are set) — confirm
# that downstream readers expect that third column.
nx.write_edgelist(qg, path='data/twitter/community/quotient-follow-graph.csv', delimiter=',')

In [None]:
# Community id per user (graph as-is); the `user_id` index becomes the first column.
comms.loc[:, ['community']].to_csv('data/twitter/community/follow-community-all.csv')

In [None]:
# Community id per node of the quotient graph; the `user_id` index becomes the first column.
qcomms.loc[:, ['community']].to_csv('data/twitter/community/quotient-follow-community-all.csv')

In [None]:
# Betweenness centrality per user in the directed follow graph.
bc.to_csv('data/twitter/community/follow-centrality.csv')

In [None]:
# Betweenness centrality per node in the quotient graph.
qbc.to_csv('data/twitter/community/quotient-follow-centrality.csv')

In [None]:
# Per-show community assignment from the graph as-is (shows in `dupes` excluded).
show_community.to_csv('data/twitter/community/follow-community-by-show.csv')

In [None]:
# Per-show community assignment from the quotient graph (shows in `dupes` excluded).
qshow_community.to_csv('data/twitter/community/quotient-follow-community-by-show.csv')

In [None]:
# Per-show betweenness centrality, taken from the quotient graph.
bc_show.to_csv('data/twitter/community/quotient-follow-centrality-by-show.csv')