In [None]:
import os
import pickle
import logging
import itertools as it

import numpy as np
import pandas as pd
import networkx as nx
import scipy.stats as ss

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.multiclass as mc
import sklearn.metrics as mt
import sklearn.pipeline as pp
import sklearn.linear_model as lm
import sklearn.preprocessing as pr
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as te

from nltk.util import ngrams

from IPython.display import display
from tqdm.notebook import tqdm

import utils as ut

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` through ``sqldf`` against this notebook's globals.

    ``globals()`` is handed to ``sqldf`` as its lookup environment, so table
    names used in ``q`` are expected to be names bound at notebook level
    (NOTE(review): per pandasql's documented behaviour -- confirm).

    Written as a ``def`` instead of a lambda bound to a name (PEP 8, E731) so
    the helper shows up under its own name in tracebacks and can carry a
    docstring. The call signature is unchanged.
    """
    return sqldf(q, globals())

In [None]:
# Show floats with three decimals and never truncate the text of wide columns.
pd.options.display.float_format = lambda value: '%.3f' % value
pd.options.display.max_colwidth = None

In [None]:
# Root logging: timestamped INFO-level messages.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

# Raise the threshold for the "gensim" logger so only warnings and above show.
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# Every data path used in this notebook is relative ('data/...'), so switch
# the working directory to the project checkout first.
# NOTE(review): this location is machine-specific; the notebook only runs
# as-is when the repository lives at ~/github/masthesis/.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# Ensure the directories that later cells write CSVs into exist; an already
# existing directory is not an error.
for out_dir in ('data/twitter/', 'data/radio/'):
    os.makedirs(out_dir, exist_ok=True)

# Load common Twitter data

In [None]:
# User-to-show table. Later cells read its user_id, show_id and show_name
# columns (show names lookup, user->show edge lifting, final SQL join).
user_show = pd.read_csv('data/samples/twitter/user-show.csv')

# Quick size check: (rows, columns).
display(user_show.shape)

In [None]:
# Table of users indexed by user_id; it is the driving (left-most) table of
# the user-level SQL join near the end of the notebook.
universe = pd.read_csv('data/samples/twitter/user-data.csv', index_col='user_id')
# Quick size check: (rows, columns).
display(universe.shape)

In [None]:
# One row per distinct (show_id, show_name) pair, indexed by show_id.
show_names = (
    user_show.loc[:, ['show_id', 'show_name']]
    .drop_duplicates()
    .set_index('show_id')
)

## User-level graphs

In [None]:
# Follow edges: the file is read headerless as (source, target, edge_attr);
# the third field is discarded right away.
fg_user_data = (
    pd.read_csv('data/twitter/community/quotient-follow-graph.csv',
                names=['source', 'target', 'edge_attr'])
    .drop(columns='edge_attr')
)

# Directed graph with one edge per (source, target) row, no edge attributes.
fg_user = nx.from_pandas_edgelist(fg_user_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

In [None]:
# Mention edges: the file is read headerless as (source, target, edge_attr);
# the third field is discarded right away.
mg_user_data = (
    pd.read_csv('data/twitter/community/quotient-mention-graph.csv',
                names=['source', 'target', 'edge_attr'])
    .drop(columns='edge_attr')
)

# Directed graph with one edge per (source, target) row, no edge attributes.
mg_user = nx.from_pandas_edgelist(mg_user_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

In [None]:
# Retweet edges: the file is read headerless as (source, target, edge_attr);
# the third field is discarded right away.
rg_user_data = (
    pd.read_csv('data/twitter/community/quotient-retweet-graph.csv',
                names=['source', 'target', 'edge_attr'])
    .drop(columns='edge_attr')
)

# Directed graph with one edge per (source, target) row, no edge attributes.
rg_user = nx.from_pandas_edgelist(rg_user_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

# Twitter graphs on show level

In [None]:
# Lift the user-level follow edges to show level: each endpoint's user_id is
# swapped for the show_id it maps to in user_show. Because both joins are
# inner joins, an edge survives only if both endpoints appear in user_show.
fg_show_data = (
    fg_user_data
    # source user -> source show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='source', right_on='user_id')
    .drop(columns=['source', 'user_id'])
    .rename(columns={'show_id': 'source'})
    # target user -> target show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='target', right_on='user_id')
    .drop(columns=['target', 'user_id'])
    .rename(columns={'show_id': 'target'})
)

# Directed show-to-show graph built from the lifted edge list.
fg_show = nx.from_pandas_edgelist(fg_show_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

In [None]:
# Lift the user-level mention edges to show level: each endpoint's user_id is
# swapped for the show_id it maps to in user_show. Because both joins are
# inner joins, an edge survives only if both endpoints appear in user_show.
mg_show_data = (
    mg_user_data
    # source user -> source show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='source', right_on='user_id')
    .drop(columns=['source', 'user_id'])
    .rename(columns={'show_id': 'source'})
    # target user -> target show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='target', right_on='user_id')
    .drop(columns=['target', 'user_id'])
    .rename(columns={'show_id': 'target'})
)

# Directed show-to-show graph built from the lifted edge list.
mg_show = nx.from_pandas_edgelist(mg_show_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

In [None]:
# Lift the user-level retweet edges to show level: each endpoint's user_id is
# swapped for the show_id it maps to in user_show. Because both joins are
# inner joins, an edge survives only if both endpoints appear in user_show.
rg_show_data = (
    rg_user_data
    # source user -> source show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='source', right_on='user_id')
    .drop(columns=['source', 'user_id'])
    .rename(columns={'show_id': 'source'})
    # target user -> target show
    .merge(user_show[['user_id', 'show_id']], how='inner',
           left_on='target', right_on='user_id')
    .drop(columns=['target', 'user_id'])
    .rename(columns={'show_id': 'target'})
)

# Directed show-to-show graph built from the lifted edge list.
rg_show = nx.from_pandas_edgelist(rg_show_data, 'source', 'target',
                                  edge_attr=None, create_using=nx.DiGraph())

In [None]:
# Persist the show-level follow graph as a comma-delimited edge list.
# NOTE(review): `data` is left at its default, so networkx presumably writes
# an attribute field after source/target -- which would match the three-column
# layout the user-level edge lists are read with above; confirm.
nx.write_edgelist(fg_show, path='data/twitter/community/quotient-follow-graph-by-show.csv', delimiter=',')

In [None]:
# Persist the show-level mention graph as a comma-delimited edge list.
# NOTE(review): `data` is left at its default, so networkx presumably writes
# an attribute field after source/target -- which would match the three-column
# layout the user-level edge lists are read with above; confirm.
nx.write_edgelist(mg_show, path='data/twitter/community/quotient-mention-graph-by-show.csv', delimiter=',')

In [None]:
# Persist the show-level retweet graph as a comma-delimited edge list.
# NOTE(review): `data` is left at its default, so networkx presumably writes
# an attribute field after source/target -- which would match the three-column
# layout the user-level edge lists are read with above; confirm.
nx.write_edgelist(rg_show, path='data/twitter/community/quotient-retweet-graph-by-show.csv', delimiter=',')

# Twitter metrics by show

## Ideology estimates

In [None]:
# Two per-show ideology tables. Each keeps only show_id, dim0 and dim1, and
# the dimension columns get a distinguishing suffix (`_all` / `_hosts`) so the
# two tables can sit side by side once concatenated on show_id.
show_ideology_all = (
    pd.read_csv('data/twitter/ideology-mds/show.csv')
    .loc[:, ['show_id', 'dim0', 'dim1']]
    .rename(columns={'dim0': 'dim0_all', 'dim1': 'dim1_all'})
    .set_index('show_id')
)

show_ideology_hosts = (
    pd.read_csv('data/twitter/ideology-mds/show-hosts.csv')
    .loc[:, ['show_id', 'dim0', 'dim1']]
    .rename(columns={'dim0': 'dim0_hosts', 'dim1': 'dim1_hosts'})
    .set_index('show_id')
)

## Graph community

In [None]:
# Per-show community assignments for each of the three graphs. The generic
# `community` column is renamed per graph so the tables do not collide when
# concatenated, and show_id becomes the index.
show_follow_comm = (
    pd.read_csv('data/twitter/community/quotient-follow-community-by-show.csv')
    .rename(columns={'community': 'follow_community'})
    .set_index('show_id')
)

show_mention_comm = (
    pd.read_csv('data/twitter/community/quotient-mention-community-by-show.csv')
    .rename(columns={'community': 'mention_community'})
    .set_index('show_id')
)

show_retweet_comm = (
    pd.read_csv('data/twitter/community/quotient-retweet-community-by-show.csv')
    .rename(columns={'community': 'retweet_community'})
    .set_index('show_id')
)

## Graph centrality

In [None]:
# Per-show centrality values for each of the three graphs. The generic
# `centrality` column is renamed per graph so the tables do not collide when
# concatenated, and show_id becomes the index.
show_follow_cent = (
    pd.read_csv('data/twitter/community/quotient-follow-centrality-by-show.csv')
    .rename(columns={'centrality': 'follow_centrality'})
    .set_index('show_id')
)

show_mention_cent = (
    pd.read_csv('data/twitter/community/quotient-mention-centrality-by-show.csv')
    .rename(columns={'centrality': 'mention_centrality'})
    .set_index('show_id')
)

show_retweet_cent = (
    pd.read_csv('data/twitter/community/quotient-retweet-centrality-by-show.csv')
    .rename(columns={'centrality': 'retweet_centrality'})
    .set_index('show_id')
)

## Combine

In [None]:
# Column-wise concatenation of every per-show table; rows are aligned on the
# shared show_id index.
show_data = pd.concat(
    [
        show_follow_comm,
        show_follow_cent,
        show_mention_comm,
        show_mention_cent,
        show_retweet_comm,
        show_retweet_cent,
        show_ideology_all,
        show_ideology_hosts,
        show_names,
    ],
    axis='columns',
)

display(show_data.shape)

In [None]:
# Preview the first rows, transposed so each column reads as a row.
show_data.head().transpose()

In [None]:
# Missing-value count per column.
show_data.isnull().sum()

In [None]:
# List, without row truncation, every show lacking a follow-graph community.
with pd.option_context('display.max_rows', None):
    display(show_data[show_data['follow_community'].isna()])

In [None]:
# Persist the combined per-show table; the index is written because it holds
# the show_id key.
show_data.to_csv('data/twitter/community-ideology-by-show.csv', index=True)

## Combine with radio data

### Show pairs content

In [None]:
# Attach the per-show Twitter metrics to the show-pairs radio content. The
# inner join keeps only rows whose show_id is present in show_data.
radio = (
    pd.read_csv('data/samples/radio/show-pairs-content.csv')
    .merge(show_data, on='show_id', how='inner')
)

display(radio.shape)

In [None]:
# Persist the enriched show-pairs content; the row index is not written.
radio.to_csv('data/radio/show-pairs-content-with-twitter-metrics.csv', index=False)

### All local content

In [None]:
# Attach the per-show Twitter metrics to the local radio content. The inner
# join keeps only rows whose show_id is present in show_data.
radio_local = (
    pd.read_csv('data/samples/radio/all-local-content.csv')
    .merge(show_data, on='show_id', how='inner')
)

display(radio_local.shape)

In [None]:
# Persist the enriched local content; the row index is not written.
radio_local.to_csv('data/radio/all-local-content-with-twitter-metrics.csv', index=False)

### All syndicated content

In [None]:
# Attach the per-show Twitter metrics to the syndicated radio content. The
# inner join keeps only rows whose show_id is present in show_data.
radio_synd = (
    pd.read_csv('data/samples/radio/all-syndicated-content.csv')
    .merge(show_data, on='show_id', how='inner')
)

display(radio_synd.shape)

In [None]:
# Persist the enriched syndicated content; the row index is not written.
radio_synd.to_csv('data/radio/all-syndicated-content-with-twitter-metrics.csv', index=False)

# Twitter metrics by user_id

## Ideology estimates

In [None]:
# Per-user ideology table, indexed by user_id so it aligns with the other
# per-user tables when concatenated.
# NOTE(review): presumably the source of the dim0/dim1 columns selected in the
# final user-level query -- confirm against the CSV header.
user_ideology = pd.read_csv('data/twitter/ideology-mds/user.csv', index_col='user_id')

## Graph community

In [None]:
# Per-user community assignments for each of the three graphs. The generic
# `community` column is renamed per graph so the tables do not collide when
# concatenated, and user_id becomes the index.
user_follow_comm = (
    pd.read_csv('data/twitter/community/quotient-follow-community-all.csv')
    .rename(columns={'community': 'follow_community'})
    .set_index('user_id')
)

user_mention_comm = (
    pd.read_csv('data/twitter/community/quotient-mention-community-all.csv')
    .rename(columns={'community': 'mention_community'})
    .set_index('user_id')
)

user_retweet_comm = (
    pd.read_csv('data/twitter/community/quotient-retweet-community-all.csv')
    .rename(columns={'community': 'retweet_community'})
    .set_index('user_id')
)

## Graph centrality

In [None]:
# Per-user centrality values for each of the three graphs. The generic
# `centrality` column is renamed per graph so the tables do not collide when
# concatenated, and user_id becomes the index.
user_follow_cent = (
    pd.read_csv('data/twitter/community/quotient-follow-centrality.csv')
    .rename(columns={'centrality': 'follow_centrality'})
    .set_index('user_id')
)

user_mention_cent = (
    pd.read_csv('data/twitter/community/quotient-mention-centrality.csv')
    .rename(columns={'centrality': 'mention_centrality'})
    .set_index('user_id')
)

user_retweet_cent = (
    pd.read_csv('data/twitter/community/quotient-retweet-centrality.csv')
    .rename(columns={'centrality': 'retweet_centrality'})
    .set_index('user_id')
)

## Combine

In [None]:
# Column-wise concatenation of every per-user table; rows are aligned on the
# shared user_id index.
user_data = pd.concat(
    [
        user_follow_comm,
        user_follow_cent,
        user_mention_comm,
        user_mention_cent,
        user_retweet_comm,
        user_retweet_cent,
        user_ideology,
    ],
    axis='columns',
)

display(user_data.shape)

In [None]:
# Preview the first rows, transposed so each column reads as a row.
user_data.head().transpose()

In [None]:
# Move the index back into a regular column so the query below can join on
# ud.user_id.
user_data = user_data.reset_index()

# pysqldf is handed this notebook's globals(), so the table names in the
# query (universe, user_data, user_show, show_data) are the notebook-level
# DataFrames of the same names. `universe` drives the row set; everything else
# is attached with left joins, so users without metrics are kept with NULLs.
# NOTE(review): the join through user_show would emit several rows for a
# user_id listed under more than one show, leaving duplicate user_id values in
# the resulting index -- confirm user_show has one row per user.
user_data = pysqldf("""
select
    uv.user_id,
    
    -- these are computed on a quotient graph and we need to interpolate,
    -- for people who are not selected representatives of each show, the
    -- community the chosen rep was assigned to
    coalesce(ud.follow_community, sd.follow_community) as follow_community,
    coalesce(ud.follow_centrality, sd.follow_centrality) as follow_centrality,
    coalesce(ud.mention_community, sd.mention_community) as mention_community,
    coalesce(ud.mention_centrality, sd.mention_centrality) as mention_centrality,
    coalesce(ud.retweet_community, sd.retweet_community) as retweet_community,
    coalesce(ud.retweet_centrality, sd.retweet_centrality) as retweet_centrality,
    
    -- these not computed on a quotient graph, we can keep just each user's
    -- computed ideology value
    ud.dim0,
    ud.dim1
from universe uv
    left join user_data ud on ud.user_id = uv.user_id
    left join user_show us on us.user_id = uv.user_id
    left join show_data sd on sd.show_id = us.show_id;
""").set_index('user_id')

# Quick size check: (rows, columns) after the join.
display(user_data.shape)

In [None]:
# Preview the joined table's first rows, transposed so columns read as rows.
user_data.head().transpose()

In [None]:
# Missing-value count per column.
user_data.isnull().sum()

In [None]:
# List, without row truncation, every user lacking a follow-graph community.
with pd.option_context('display.max_rows', None):
    display(user_data[user_data['follow_community'].isna()])

In [None]:
# Persist the combined per-user table; the index is written because it holds
# the user_id key.
user_data.to_csv('data/twitter/community-ideology.csv', index=True)