In [None]:
import os
import pickle
import logging

import psycopg2
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.metrics as mt
import sklearn.preprocessing as pr
import sklearn.manifold as md
import sklearn.decomposition as dc

from IPython.display import display
from tqdm.notebook import tqdm

import utils as ut

In [None]:
# Notebook-wide pandas display settings.
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


_display_options = {
    'display.float_format': _three_decimals,  # floats rendered with 3 decimals
    'display.max_colwidth': None,             # never truncate long cell text
}
for _option, _setting in _display_options.items():
    pd.set_option(_option, _setting)

In [None]:
# Root logging configuration: timestamp, level and message at INFO and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the running module for use in this notebook.
logger = logging.getLogger(__name__)

# Only let warnings and errors through from the "gensim" logger.
# NOTE(review): gensim is not imported in the visible cells — confirm this
# override is still needed.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

In [None]:
# Every relative path used afterwards is resolved against the project checkout
# in the current user's home directory.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

# Folder that receives the CSV files written at the end of the notebook;
# an already existing folder is not an error.
os.makedirs('data/twitter/ideology-mds/', exist_ok=True)

# Load data

In [None]:
# Table linking user accounts to shows (read from the sampled Twitter data).
user_show_path = 'data/samples/twitter/user-show.csv'
user_show = pd.read_csv(user_show_path)

# Quick sanity check: (rows, columns).
display(user_show.shape)

In [None]:
# Per-user table, indexed by `user_id` so rows can be looked up by account id.
user_data_path = 'data/samples/twitter/user-data.csv'
user_data = pd.read_csv(user_data_path, index_col='user_id')

# Quick sanity check: (rows, columns).
display(user_data.shape)

In [None]:
# Lookup table show_id -> show_name: one row per distinct (show_id, show_name)
# pair found in `user_show`, ordered and indexed by show_id.
show_names = (
    user_show.loc[:, ['show_id', 'show_name']]
             .drop_duplicates()
             .sort_values(by='show_id')
             .set_index('show_id')
)

In [None]:
# Edge list of the follow graph (one row per edge).
follow_graph_path = 'data/samples/twitter/follow-graph-multiple-radio.csv'
follow_graph = pd.read_csv(follow_graph_path)

# Quick sanity check: (rows, columns).
display(follow_graph.shape)

In [None]:
# Directed graph built from the edge list: one edge per row, pointing from the
# `source` column to the `target` column, with no edge attributes attached.
fg = nx.from_pandas_edgelist(
    df=follow_graph,
    source='source',
    target='target',
    edge_attr=None,
    create_using=nx.DiGraph(),
)

# Modeling

In [None]:
# Graph nodes as a Series, in the graph's own iteration order; this order also
# fixes the row/column order of the adjacency matrix built next.
nodes = pd.Series(list(fg.nodes))

In [None]:
# True for graph nodes that also occur in user_show.user_id.
mask = nodes.isin(user_show.user_id).to_numpy()

# Transposed adjacency matrix, ordered like `nodes`: entry (i, j) is set when
# the graph has an edge from node j to node i.
adj_transposed = nx.adjacency_matrix(fg, nodelist=nodes.tolist()).T

# Keep the rows of nodes found in user_show and the columns of all remaining
# nodes, then convert the sparse result to a dense array.
adj = adj_transposed[mask, :][:, ~mask].toarray()

display(adj.shape)

In [None]:
# Pairwise cosine similarity between the rows of `adj`.
sim = mt.pairwise.cosine_similarity(X=adj)

# Square matrix: one row and one column per row of `adj`.
display(sim.shape)

In [None]:
# Seed for the random initialisations used by MDS. Without it each run yields a
# differently oriented embedding, so the dimension inspection and the CSV files
# written at the end of the notebook would not be reproducible.
MDS_SEED = 42

# Two-dimensional metric MDS with 50 random restarts of up to 1000 iterations
# each, run on all available cores.
# NOTE(review): no `dissimilarity` argument is passed, so MDS presumably falls
# back to its default and computes Euclidean distances between the rows it is
# fitted on — confirm this is intended for a similarity matrix, as opposed to
# passing a precomputed distance matrix.
scaler = md.MDS(n_components=2, metric=True, n_jobs=-1, n_init=50, max_iter=1000,
                random_state=MDS_SEED)

In [None]:
# Fit the MDS model on the similarity matrix and keep the resulting
# low-dimensional coordinates (one row per row of `sim`).
rsim = scaler.fit_transform(X=sim)

# Prepare datasets

In [None]:
# User-level scores: the two MDS coordinates, indexed by the user ids of the
# graph nodes selected by `mask` (same order as the rows of `rsim`).
masked_user_ids = nodes[mask]
scores = pd.DataFrame(
    rsim,
    columns=['dim0', 'dim1'],
    index=pd.Index(masked_user_ids.tolist(), name='user_id'),
)

# Attach each account's screen name; the assignment aligns on user_id.
screen_names = user_data.loc[masked_user_ids, 'screen_name']
scores['screen_name'] = screen_names

In [None]:
# Show-level scores: average the user-level values over every account attached
# to a show.
#
# `scores` still carries the string column `screen_name` at this point. pandas
# >= 2.0 raises TypeError when asked to average a non-numeric column, whereas
# older releases silently skipped it; `numeric_only=True` keeps the old result
# and works on both.
show_scores = (
    scores.merge(user_show, on='user_id')
          .drop('show_name', axis=1)
          .set_index('user_id')  # keep user_id out of the averaged columns
          .groupby('show_id')
          .mean(numeric_only=True)
)

# Re-attach the show name; the assignment aligns on the show_id index.
show_scores['show_name'] = show_names['show_name']

In [None]:
# Show-level scores restricted to accounts flagged `is_host` or `is_show`.
show_scores_hosts = scores.merge(user_show, on='user_id')

# `scores` still carries the string column `screen_name` at this point. pandas
# >= 2.0 raises TypeError when asked to average a non-numeric column, whereas
# older releases silently skipped it; `numeric_only=True` keeps the old result
# and works on both.
show_scores_hosts = (
    show_scores_hosts.loc[show_scores_hosts.is_host | show_scores_hosts.is_show, :]
                     .drop('show_name', axis=1)
                     .set_index('user_id')  # keep user_id out of the averaged columns
                     .groupby('show_id')
                     .mean(numeric_only=True)
)

# Re-attach the show name; the assignment aligns on the show_id index.
show_scores_hosts['show_name'] = show_names['show_name']

# Analyze

## Which dimension is ideology?

## User-level

In [None]:
# Full user table keyed by screen name, ordered by the first dimension
# (row limit lifted so that every account is shown).
scores_by_screen_name = scores.set_index('screen_name')
with pd.option_context('display.max_rows', None):
    display(scores_by_screen_name.sort_values(by='dim0'))

In [None]:
# Split users at the median of dim0: True means at or above the median.
dim0_median = scores['dim0'].median()
binarized = scores['dim0'] >= dim0_median

# Extremes of each half: the highest values of the upper half, then the
# lowest values of the lower half.
upper_half = scores[binarized]
lower_half = scores[~binarized]
display(upper_half.sort_values(by='dim0', ascending=False).head())
display(lower_half.sort_values(by='dim0', ascending=True).head())

In [None]:
# Full user table keyed by screen name, ordered by the second dimension
# (row limit lifted so that every account is shown).
scores_by_screen_name = scores.set_index('screen_name')
with pd.option_context('display.max_rows', None):
    display(scores_by_screen_name.sort_values(by='dim1'))

In [None]:
# Split users at the median of dim1: True means at or above the median.
dim1_median = scores['dim1'].median()
binarized = scores['dim1'] >= dim1_median

# Extremes of each half: the highest values of the upper half, then the
# lowest values of the lower half.
upper_half = scores[binarized]
lower_half = scores[~binarized]
display(upper_half.sort_values(by='dim1', ascending=False).head())
display(lower_half.sort_values(by='dim1', ascending=True).head())

## Show-level

In [None]:
# Full show table keyed by show name, ordered by the first dimension
# (row limit lifted so that every show is listed).
show_scores_by_name = show_scores.set_index('show_name')
with pd.option_context('display.max_rows', None):
    display(show_scores_by_name.sort_values(by='dim0'))

In [None]:
# Split shows at the median of dim0: True means at or above the median.
dim0_median = show_scores['dim0'].median()
binarized = show_scores['dim0'] >= dim0_median

# Extremes of each half: the highest values of the upper half, then the
# lowest values of the lower half.
upper_half = show_scores[binarized]
lower_half = show_scores[~binarized]
display(upper_half.sort_values(by='dim0', ascending=False).head())
display(lower_half.sort_values(by='dim0', ascending=True).head())

In [None]:
# Full show table keyed by show name, ordered by the second dimension
# (row limit lifted so that every show is listed).
show_scores_by_name = show_scores.set_index('show_name')
with pd.option_context('display.max_rows', None):
    display(show_scores_by_name.sort_values(by='dim1'))

In [None]:
# Split shows at the median of dim1: True means at or above the median.
dim1_median = show_scores['dim1'].median()
binarized = show_scores['dim1'] >= dim1_median

# Extremes of each half: the highest values of the upper half, then the
# lowest values of the lower half.
upper_half = show_scores[binarized]
lower_half = show_scores[~binarized]
display(upper_half.sort_values(by='dim1', ascending=False).head())
display(lower_half.sort_values(by='dim1', ascending=True).head())

In [None]:
# Scatter every account in the two MDS dimensions, labelled by screen name.
fig, ax = plt.subplots(figsize=(30, 30))

ax.scatter(scores['dim0'], scores['dim1'])

# Annotate through `ax` rather than `plt`, so the labels are bound to this axes
# explicitly instead of to whichever axes pyplot currently considers active.
for x, y, name in zip(scores['dim0'], scores['dim1'], scores['screen_name']):
    ax.annotate(name, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')

# Axis labels so the figure can be read on its own.
ax.set_xlabel('dim0')
ax.set_ylabel('dim1')
_ = ax.set_title('Ideology by user, all accounts')

In [None]:
# Scatter every show at its averaged position in the two MDS dimensions,
# labelled by show name.
fig, ax = plt.subplots(figsize=(30, 30))

ax.scatter(show_scores['dim0'], show_scores['dim1'])

# Annotate through `ax` rather than `plt`, so the labels are bound to this axes
# explicitly instead of to whichever axes pyplot currently considers active.
for x, y, name in zip(show_scores['dim0'], show_scores['dim1'], show_scores['show_name']):
    ax.annotate(name, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')

# Axis labels so the figure can be read on its own.
ax.set_xlabel('dim0')
ax.set_ylabel('dim1')
_ = ax.set_title('Ideology by show, all accounts')

In [None]:
# Scatter every show at the averaged position of its host / show accounts,
# labelled by show name.
fig, ax = plt.subplots(figsize=(30, 30))

ax.scatter(show_scores_hosts['dim0'], show_scores_hosts['dim1'])

# Annotate through `ax` rather than `plt`, so the labels are bound to this axes
# explicitly instead of to whichever axes pyplot currently considers active.
for x, y, name in zip(show_scores_hosts['dim0'], show_scores_hosts['dim1'], show_scores_hosts['show_name']):
    ax.annotate(name, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')

# Axis labels so the figure can be read on its own; the title is set on the
# axes (not the figure) to match the other two scatter plots in this notebook.
ax.set_xlabel('dim0')
ax.set_ylabel('dim1')
_ = ax.set_title('Ideology by show, hosts and institutional accounts only')

# Write out scores

In [None]:
# Persist the user-level coordinates (user_id index plus dim0/dim1); the
# screen name is left out of the file.
user_scores_out = scores.drop(columns='screen_name')
user_scores_out.to_csv('data/twitter/ideology-mds/user.csv', index=True)

In [None]:
# Persist the show-level table, keeping the show_id index as the first column.
show_scores.to_csv(
    'data/twitter/ideology-mds/show.csv',
    index=True,
)

In [None]:
# Persist the hosts-only show-level table, keeping the show_id index as the
# first column.
show_scores_hosts.to_csv(
    'data/twitter/ideology-mds/show-hosts.csv',
    index=True,
)