In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

import networkx as nx
import community

import sklearn.model_selection as ms

import sklearn.metrics as mt

In [None]:
# Notebook-wide logging: every record is rendered as
# "<timestamp> : <level> : <message>", at INFO level and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

In [None]:
# Work from the project checkout so the relative 'data/...' paths used by the
# loading cells resolve.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
def _format_three_decimals(x):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % x


# Show floats with three decimals in every DataFrame/Series display.
pd.set_option('display.float_format', _format_three_decimals)

In [None]:
# Draw and display one random integer in [0, 2**32 - 1). This runs before the
# RNGs are seeded, so its output differs from run to run.
# NOTE(review): presumably this is how the fixed seed in the next cell was
# chosen -- confirm; nothing else in the notebook uses this value.
np.random.randint(0, 2**32 - 1)

In [None]:
# Seed both Python's and NumPy's global RNGs with the same fixed value; the
# same seed is reused later as the random_state of the train/test splitter.
seed = 1511200828

for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

# Load data

In [None]:
# Per-show ideology scores, joined on show_id with the radio show metadata.
ci = pd.read_csv('data/twitter/community-ideology-by-show.csv', index_col='show_id')
show = pd.read_csv('data/samples/radio/show-data.csv', index_col='show_id')
ci['public_fraction'] = show['public_fraction']
ci['show_name'] = show['show_name']

# Keep only shows that have both a public fraction and an ideology score.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below cannot raise SettingWithCopyWarning.
ci = ci.loc[ci['public_fraction'].notna() & ci['dim0_all'].notna(), :].copy()

# Binary label: 1 when public_fraction exceeds 0.5. Derived from ci's own
# (already joined and filtered) column rather than from `show`, so no index
# re-alignment is involved; the resulting values are the same.
ci['is_public_approx'] = (ci['public_fraction'] > 0.5).astype(int)

ci = ci[['public_fraction', 'is_public_approx', 'dim0_all', 'show_name']] \
    .rename({'dim0_all': 'ideology'}, axis=1)

# Flip the sign of the score, then min-max rescale it onto [0, 1].
# NOTE(review): the reason for the sign flip is not visible here -- confirm
# which end of the scale is which.
ci['ideology'] *= -1
ci['ideology'] -= ci['ideology'].min()
ci['ideology'] /= ci['ideology'].max()

# Binarize the rescaled score at the midpoint of its range.
ci['ideology_bin'] = (ci['ideology'] > 0.5).astype(int)

In [None]:
# User-level ideology table. Later cells read its 'user_id',
# 'follow_community' and 'dim0' columns.
ci_user = pd.read_csv('data/twitter/community-ideology.csv')

In [None]:
# Follow edge list, with the id columns renamed to the generic source/target
# names used when building the graph.
follow_graph = pd.read_csv('data/samples/twitter/follow-graph.csv').rename(
    columns={'source_user_id': 'source', 'target_user_id': 'target'}
)

# Directed graph with one edge per row of the edge list; no edge attributes
# are carried over.
fg = nx.from_pandas_edgelist(
    follow_graph,
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

# (rows, columns) of the edge list.
display(follow_graph.shape)

In [None]:
# Show-pair content table with Twitter metrics. Later cells use its 'show_id'
# column (as the split group) and its 'follow_community' column.
spc = pd.read_csv('data/radio/show-pairs-content-with-twitter-metrics.csv')

# Check predictability of public/talk status from ideology scores

In [None]:
# Total number of shows, and how many of them have a public_fraction above
# 0.99 or below 0.01.
public_fraction = ci['public_fraction']
n_shows = len(ci)
n_extreme = ((public_fraction < 0.01) | (public_fraction > 0.99)).sum()

n_shows, n_extreme

In [None]:
# Contingency table: binarized ideology (rows) against the approximate
# public label (columns).
pd.crosstab(ci['ideology_bin'], ci['is_public_approx'])

In [None]:
# Summary statistics for the numeric columns of the per-show table.
ci.describe()

In [None]:
# How well the ideology score recovers the approximate public label: AUC uses
# the continuous score, the remaining metrics use its binarized form.
y_true = ci['is_public_approx']
y_pred = ci['ideology_bin']

thresholded_metrics = (
    ('f1', mt.f1_score),
    ('precision', mt.precision_score),
    ('recall', mt.recall_score),
)

dict(
    [('auc', mt.roc_auc_score(y_true, ci['ideology']))]
    + [(name, score(y_true, y_pred)) for name, score in thresholded_metrics]
)

# Community size

In [None]:
# Group-aware shuffle splitter, seeded with the notebook's fixed seed.
# Three splits are configured, but the next cell only consumes the first one.
# train_size=0.75 is a proportion of the groups (per scikit-learn's
# GroupShuffleSplit documentation).
grp = ms.GroupShuffleSplit(n_splits=3, train_size=0.75, random_state=seed)

In [None]:
# Take the first split only, grouping rows by show_id, and materialize the
# train/test frames as independent copies of the selected rows.
split_iter = grp.split(spc, groups=spc['show_id'])
train_inds, test_inds = next(split_iter)

data_train = spc.iloc[train_inds, :].copy()
data_test = spc.iloc[test_inds, :].copy()

In [None]:
# Number of test-set rows in each follow community.
data_test.groupby('follow_community').size()

In [None]:
# Share of test-set rows in each follow community (count / total rows).
community_sizes = data_test.groupby('follow_community').size()
community_sizes / len(data_test)

# Ideology by community

In [None]:
# Mean of the user-level 'dim0' score within each follow community.
ci_user.groupby('follow_community')['dim0'].mean()

# Check modularity

We quote the modularity of these follow communities in the paper, so let's reproduce it here.

In [None]:
# Map each user that appears as a node in the follow graph to its follow
# community: {user_id: follow_community}.
in_graph = ci_user['user_id'].isin(list(fg.nodes))
members = ci_user.loc[in_graph, ['user_id', 'follow_community']]

partition = dict(zip(members['user_id'].tolist(),
                     members['follow_community'].tolist()))

In [None]:
# Modularity of the follow-community partition, evaluated on the undirected
# version of the follow graph.
# NOTE(review): `community` is presumably the python-louvain package, whose
# modularity() expects an undirected graph -- confirm.
community.modularity(partition, fg.to_undirected())

# Number of episodes

In [None]:
# Row count of the show-pair content table, reported under this section's
# heading as the number of episodes.
spc.shape[0]