ufes-pgcs
---

Código desenvolvido e utilizado no âmbito da dissertação produzida para o Curso de Mestrado em Ciências Sociais da Universidade Federal do Espírito Santo.

___

## Prerequisites

### Import requirements

> Some graph building and analysis functions require [graphkit](https://github.com/nelsonaloysio/graphkit). Stopwords list used: [stopwords.py](https://gist.github.com/nelsonaloysio/302dbbf3963fababde6e9f97669587df) (plus `nltk.corpus.stopwords`).

In [None]:
import json
import os
import pickle
# import collections
# import io
# import math
# import pkgutil
# import re
# import string
# import urllib
from os import listdir
# from datetime import datetime
# from pprint import pprint

import igraph as ig
import leidenalg as la
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
import plotly.io as pio
import plotly.offline as py
import scattertext as st
import scipy.stats as stats
import seaborn as sns
import spacy
# import networkit as nk
# import numpy as np
# import plotly.express as px
# import plotly.graph_objects as go
from matplotlib.ticker import FuncFormatter, StrMethodFormatter
from urllib.request import urlopen
# from IPython.core.display import HTML, display
# from IPython.display import IFrame
# from matplotlib.ticker import IndexFormatter

from pygraphkit import GraphKit
from notebook_functions import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

### Set up vars and dictionaries

In [None]:
# Global matplotlib defaults used by every figure in the notebook.
matplotlib.rc('mathtext', fontset='stix')
matplotlib.rcParams['font.size'] = 11 # 14

# NOTE: this update also sets 'font.size', so 12 takes precedence over the 11 above.
plt.rcParams.update({'font.family': 'Times New Roman', 'font.size': 12}) # 'font.size': 14
# The initializer has to be *called*; assigning a value to the attribute
# would only shadow the function and leave plotly uninitialized.
py.init_notebook_mode(connected=True)

# Helper objects shared by later cells (classes come from the project imports).
gk = GraphKit()
tokenizer = Tokenizer()
plot = Plot("ggplot2")
pio.templates.default = 'ggplot2' # 'seaborn'

# strptime/strftime pattern, e.g. "Mon Jun 17 20:15:00 +0000 2013".
datetime_format = "%a %b %d %H:%M:%S %z %Y"

# Keyword arguments forwarded to nx_plot() when drawing the small model graphs.
params = dict(
    # colors='#000000',
    font_size=12,
    edge_width=0.5,
    edge_alpha=0.9,
    figsize=(3, 2.5),
    show_labels=False,
    layout='kamada_kawai_layout',
    node_size=15,
)

# display(HTML("<style>.container { width:98% !important; }</style>"))

___

## Data overview

### Counts

In [None]:
# Build a table of tweet/retweet counts from the JSON files in ../data/counts.
# Files whose name starts with "t" feed the "tweets" column and the index,
# files starting with "r" feed the "retweets" column.
d = "../data/counts"
count_files = sorted(os.listdir(d))
tweet_files = [f for f in count_files if f.startswith("t")]
retweet_files = [f for f in count_files if f.startswith("r")]

counts = pd.DataFrame(
    {
        "tweets": flatten([load_json(f"{d}/{f}", "tweet_count") for f in tweet_files]),
        "retweets": flatten([load_json(f"{d}/{f}", "tweet_count") for f in retweet_files]),
    },
    # The index comes from the "start" field of the tweet files.
    index=flatten([load_json(f"{d}/{f}", "start") for f in tweet_files], 10),
)
# counts

### Errors

In [None]:
# Load the error records for tweets ("t*" files) and retweets ("r*" files),
# merge them, and keep one row per resource_id.
d = "../data/errors"
error_files = sorted(os.listdir(d))

tweet_errors = pd.concat(
    [load_json(f"{d}/{name}") for name in error_files if name.startswith("t")]
)
retweet_errors = pd.concat(
    [load_json(f"{d}/{name}") for name in error_files if name.startswith("r")]
)

errors = pd.concat([tweet_errors, retweet_errors]).drop_duplicates("resource_id")
# Renumber rows 0..n-1 after concatenation and de-duplication.
errors.index = range(len(errors))
# errors

#### Suspended users

In [None]:
# Split the error table by resource type.
is_tweet = errors["resource_type"] == "tweet"
is_user = errors["resource_type"] == "user"
t = errors[is_tweet]
u = errors[is_user]

# Among user errors raised for 'in_reply_to_user_id', keep only the rows
# whose detail text mentions a suspension; the displayed shape is their count.
reply_details = u.loc[u["parameter"] == 'in_reply_to_user_id', "detail"]
reply_details.apply(
    lambda x: True if "has been suspended" in x else None).dropna().shape
# u[u["parameter"] == 'entities.mentions.username']
# print(t, u)

### Media

In [None]:
# Load media records for tweets ("t*") and retweets ("r*"), then keep one
# row per media_key.
d = "../data/media"
media_files = sorted(os.listdir(d))

tweet_media = pd.concat(
    [load_json(f"{d}/{name}") for name in media_files if name.startswith("t")]
)
retweet_media = pd.concat(
    [load_json(f"{d}/{name}") for name in media_files if name.startswith("r")]
)

media = pd.concat([tweet_media, retweet_media]).drop_duplicates("media_key")
media.index = range(len(media))  # renumber rows 0..n-1
# media

### Places

In [None]:
# Load every file in ../data/places and keep one row per place id.
d = "../data/places"
place_frames = [load_json(f"{d}/{name}") for name in sorted(os.listdir(d))]
places = pd.concat(place_frames).drop_duplicates("id")
places.index = range(len(places))  # renumber rows 0..n-1
# places

### Polls (2016 onwards)

In [None]:
# Load poll records for tweets ("t*") and retweets ("r*"), then keep one
# row per poll id.
d = "../data/polls"
poll_files = sorted(os.listdir(d))

tweet_polls = pd.concat(
    [load_json(f"{d}/{name}") for name in poll_files if name.startswith("t")]
)
retweet_polls = pd.concat(
    [load_json(f"{d}/{name}") for name in poll_files if name.startswith("r")]
)

polls = pd.concat([tweet_polls, retweet_polls]).drop_duplicates("id")
polls.index = range(len(polls))  # renumber rows 0..n-1
# polls

### Referenced tweets

In [None]:
# Load referenced-tweet records for tweets ("t*") and retweets ("r*"), then
# keep one row per tweet id.
d = "../data/tweets"
ref_files = sorted(os.listdir(d))

tweet_ref = pd.concat(
    [load_json(f"{d}/{name}") for name in ref_files if name.startswith("t")]
)
retweet_ref = pd.concat(
    [load_json(f"{d}/{name}") for name in ref_files if name.startswith("r")]
)

ref = pd.concat([tweet_ref, retweet_ref]).drop_duplicates("id")
ref.index = range(len(ref))  # renumber rows 0..n-1
# ref

### Referenced users

In [None]:
# Load referenced-user records for tweets ("t*") and retweets ("r*"), then
# keep one row per user id.
d = "../data/users"
user_files = sorted(os.listdir(d))

tweet_users = pd.concat(
    [load_json(f"{d}/{name}") for name in user_files if name.startswith("t")]
)
retweet_users = pd.concat(
    [load_json(f"{d}/{name}") for name in user_files if name.startswith("r")]
)

users = pd.concat([tweet_users, retweet_users]).drop_duplicates("id")
users.index = range(len(users))  # renumber rows 0..n-1
# users

___

## General statistics

### Aggregated count of published tweets (2013-2018)

> Search query ([twitter-v2](https://github.com/nelsonaloysio/twitter_v2_search)): `(Manifestação OR Manifestações OR Protesto OR Protestos OR #VemPraRua) (lang:pt OR place:Brazil OR place_country:BR)`

#### Daily

In [None]:
# Daily tweet counts, one subplot per year, aligned on the "MM-DD" of the
# first 365 rows of the series.
_ = pd.read_json("../json/stats_2013-2018.json")
# _['tweet_count'].plot(figsize=(20,4))
_.index = [x.strftime("%Y-%m-%d") for x in _.index]

# Rows are "MM-DD" labels; one column is added per value found in the "y" column.
d = pd.DataFrame(index=[x[5:] for x in _.index[:365]])

for y in _["y"].unique():
    d[y] = _.loc[[f"{y}-{x}" for x in d.index], "tweet_count"].values

colors = ["#3c76d6", "orange", "green", "red", "purple", "brown"]
yticks = [750000, 70000, 450000, 300000, 100000, 175000]
ax = d.plot(figsize=(9.2, 8.7), subplots=True, style='-')

# Positions of the first day of each month, plus the right edge of the axis.
month_starts = [i for i, x in enumerate(d.index) if x.endswith("-01")] + [365]
# Thousands separator rendered with "." instead of ",".
fmtr = FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))

for i, a in enumerate(ax):
    upper = yticks[i]
    a.grid(color="#eee")
    a.set_yticks([0, int(upper/2), upper])
    # a.axhline(0.5, color=colors[i], ls='--', linewidth=1.25, alpha=0.75)
    a.set_ylim([0, upper])
    a.set_xlim([0, 365])
    a.set_xticks(month_starts)
    a.yaxis.set_major_formatter(fmtr)

ax[-1].set_xticklabels(["1/Jan", "Fev", "Mar", "Abr", "Mai", "Jun", "Jul", "Ago", "Set", "Out", "Nov", "Dez", "31/Dez"])

# The trailing spaces in the label shift the text along the shared y axis.
ax[2].set_ylabel("Número diário de publicações no Twitter com um ou mais termos selecionados                                 ",
                 loc="center", fontsize=10)

plt.minorticks_off()
# [d[d.columns[i]].max() for i in range(d.shape[1])]

##### Daily (incl. "`manifestante`", "`manifestantes`")

> Comparatively, in the chart below, there's a lot of added noise in the years 2014 and 2017 due to the inclusion of the keywords above.

In [None]:
# Same daily view for the query that also includes the extra keywords.
d_ = pd.read_json("../json/stats_2013_2018_new.json")
ax = d_.plot(subplots=True, figsize=(9, 8))  # , style='.-')
for a in ax:
    a.set_xlim([0, 365])
    a.grid(color="#eee")
plt.minorticks_off()
# Column totals are the cell's displayed output.
d_.sum()

#### Weekly

In [None]:
# Weekly totals of tweet_count, one subplot per year.
# NOTE(review): `ylim` is not referenced again in this cell.
ylim = [2914867, 176551, 632738, 668321, 186084, 317893]
# Sum the counts per "w" key; the key suffix from character 5 onward is used
# as the within-year label (presumably "YYYY-WW" strings - confirm).
g = _.groupby("w").sum()["tweet_count"]
idx = g.index.str[5:].unique()

d = pd.DataFrame()
for y in _["y"].unique():
    # 2017 skips the first label; every other year takes the first 53 labels.
    rng = idx[1:] if y == 2017 else idx[:53]
    d[y] = g.loc[[f"{y}-{x}" for x in rng]].values

colors = ["#3c76d6", "orange", "green", "red", "purple", "brown"]
# Darker variants used below to colour the middle and top y tick labels.
colors_ = ["#03558d", "#d86500", "green", "#d62728", "purple", "#8c574c"]

ax = d.plot(figsize=(9.2, 8.7), subplots=True, style='.-')

[a.grid(color="#eee") for a in ax]
# [a.axhline(d.iloc[:, i].mean(), color=colors[i], ls='--', linewidth=1.25, alpha=0.75) for i, a in enumerate(ax)]
# ticks = _[_.index.str.endswith("-01")]["w"].str[5:][:12].values.tolist()

# For each year (by position), the integer "w" suffixes of the rows whose
# index ends with "-01" (assumes the index holds "YYYY-MM-DD" strings - confirm).
ticks = {i: [int(x) for x in _[_.index.str.endswith("-01")].query(f"y == {y}")["w"].str[5:].values.tolist()] for i,y in enumerate(_["y"].unique())}
[a.set_xticks(ticks[i]) for i,a in enumerate(ax)]
[ax[-1].set_xticklabels(["Jan", "Fev", "Mar", "Abr", "Mai", "Jun", "Jul", "Ago", "Set", "Out", "Nov", "Dez"])]
[a.set_xlim([0, 52]) for a in ax]
# y ticks at zero, half of the column maximum and the column maximum.
[a.set_yticks([0,  d.iloc[:, i].max()/2,   d.iloc[:, i].max()]) for i, a in enumerate(ax)]
[a.get_yticklabels()[1].set_color(colors_[i]) for i, a in enumerate(ax)]
[a.get_yticklabels()[2].set_color(colors_[i]) for i, a in enumerate(ax)]

# [a.plot(range(56), [d.iloc[:, i].max()/2 for _ in range(56)], linestyle='--', linewidth='1.25', color=colors[i], alpha=0.5) for i, a in enumerate(ax)]
# Thousands separator rendered with "." instead of ",".
fmtr = FuncFormatter(lambda x, p: format(int(x), ',').replace(',','.'))
[a.yaxis.set_major_formatter(fmtr) for a in ax]
# The leading spaces in the label shift the text along the shared y axis.
ax[3].set_ylabel("                           Total de publicações por semana no Twitter com um ou mais termos de pesquisa selecionados",
                 loc="center", fontsize=10)
plt.minorticks_off()
# Displayed output: the maximum weekly total of each year column.
[d[d.columns[i]].max() for i in range(d.shape[1])]

#### Monthly

In [None]:
# Monthly totals of tweet_count, one subplot per year.
g = _.groupby("m").sum()["tweet_count"]
idx = g.index.str[5:].unique()

d = pd.DataFrame()
for y in _["y"].unique():
    rng = idx  # [1:] if y == 2017 else idx[:53]
    d[y] = g.loc[[f"{y}-{x}" for x in rng]].values

colors = ["#3c76d6", "orange", "green", "red", "purple", "brown"]
ax = d.plot(figsize=(9.2, 8.7), subplots=True, style='.-')

# Thousands separator rendered with "." instead of ",".
fmtr = FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
month_labels = ["Jan", "Fev", "Mar", "Abr", "Mai",
                "Jun", "Jul", "Ago", "Set", "Out", "Nov", "Dez"]

for i, a in enumerate(ax):
    peak = d.iloc[:, i].max()
    a.grid(color="#eee")
    # a.axhline(d.iloc[:, i].mean(), color=colors[i], ls='--', linewidth=1.25, alpha=0.75)
    a.set_yticks([0, peak/2, peak])
    a.set_xlim([0, 11])
    a.set_xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
    a.yaxis.set_major_formatter(fmtr)

ax[-1].set_xticklabels(month_labels)
# The trailing spaces in the label shift the text along the shared y axis.
ax[2].set_ylabel("Número mensal de publicações no Twitter com um ou mais termos selecionados                                 ",
                 loc="center", fontsize=10)
plt.minorticks_off()
# Displayed output: the maximum monthly total of each year column.
[d[column].max() for column in d.columns]

#### Yearly

In [None]:
# Descriptive statistics of tweet_count per "y" group, transposed and rendered
# as integer strings.
# NOTE(review): this expression is neither assigned nor the last statement of
# the cell, so its result is not displayed - confirm whether that is intended.
_.groupby("y").describe()["tweet_count"].T.applymap(lambda x: f"{int(x)}")
# Bar chart of the per-"y" column sums.
_.groupby("y").sum().plot(kind="bar")

#### Additional #1: June 2013

> `protesto OR protestos OR dilma OR manifestação OR manifestaçoẽs OR #vemprarua OR impeachment`

In [None]:
# Plot the five columns with the largest totals in the 2013 statistics.
_ = pd.read_json('../json/stats_2013.json')
top_columns = _.sum().sort_values(ascending=False)[:5].index
_.loc[:, top_columns].plot(figsize=(20, 10))

#### Additional #2: 2013-2016

> `impeachment+dilma`

In [None]:
# Load the "impeachment+dilma" series and plot it.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("../json/stats_2013-2016_impeachment+dilma.json") as fh:
    _ = pd.Series(json.load(fh))
_.plot(figsize=(20,4))

### Social network usage (2013-2018)

In [None]:
# Line chart of platform usage with dashed reference lines at two column means.
df = pd.read_json('../json/social_networks.json')
# Keep only the five platforms plotted below.
df = df.filter(['Facebook','YouTube','Twitter','Tumblr','Pinterest'])#,'Instagram'])
df.columns = ['Facebook','YouTube','Twitter','Tumblr','Pinterest']#,'Instagram*']
# df.index = range(df.shape[0])
ax = df.plot(figsize=(7.9,4.2), alpha=1)

# x positions for the dashed lines; the range is longer than the table itself.
x = range(df.shape[0]+1000)
# Dashed horizontal lines at the integer-truncated mean of column 0 (Facebook)
# and of column 2 (Twitter).
plt.plot([x for x in x], [int(df.iloc[:, 0].mean()) for x in x], linestyle='--', linewidth='1.25', color='#3c76d6', alpha=0.5)
plt.plot([x for x in x], [int(df.iloc[:, 2].mean()) for x in x], linestyle='--', linewidth='1.25', color='g', alpha=0.5)
# Hand-picked ticks; 4.24 and 76.41 are the two values highlighted in colour below.
ax.set_yticks([4.24,15,30,45,60,76.41,90])
ax.set_yticklabels(['4.24%','15%', '30%', '45%', '60%','76.41%','90%'])
ax.set_ylim([0,100])
# fmtr = StrMethodFormatter('{x:2.0f}%')
# ax.yaxis.set_major_formatter(fmtr)

plt.grid(color = 'gray', linestyle = '--', linewidth = 0.15)
plt.ylabel('Estimativa de utilização da plataforma entre brasileiros')
plt.legend(fontsize=12, title='Plataforma (>1%)')#, bbox_to_anchor=(1, 1.025), loc='upper left', fontsize=12)
# Per-line opacity: lines 0 and 2 fully opaque, the others at 0.7.
ax.get_lines()[0].set_alpha(1.0)
ax.get_lines()[1].set_alpha(0.7)
ax.get_lines()[2].set_alpha(1)
ax.get_lines()[3].set_alpha(0.7)
ax.get_lines()[4].set_alpha(0.7)
# ax.get_lines()[5].set_alpha(0.2)

# Colour the first ('4.24%') and sixth ('76.41%') y tick labels.
ax.get_yticklabels()[0].set_color("#03558d")
ax.get_yticklabels()[5].set_color("#3c76d6")
plt.minorticks_off()

### Protests (2015-2016)

In [None]:
# Grouped bar chart of estimated protest attendance, with one dashed line per
# column at that column's mean.
df = pd.read_json('../json/time_series_2015_2016.json')
df.columns = ['a) Oposição (pró-impeachment)', 'b) Situação (contra o impeachment)']
ax = df.plot(kind='bar', color=['#3c76d6', 'r'], figsize=(10, 4), rot=0)
# x positions for the dashed lines; shifted by -1 below so they start left of the first bar.
x = range(df.shape[0]+10)
plt.plot([x-1 for x in x], [int(df.iloc[:, 0].mean()) for x in x], linestyle='--', linewidth='1.25', color='#3c76d6', alpha=0.75)
plt.plot([x-1 for x in x], [int(df.iloc[:, 1].mean()) for x in x], linestyle='--', linewidth='1.25', color='r', alpha=0.75)

ax.set_xticklabels(['Março de 2015', 'Abril de 2015', 'Agosto de 2015', 'Dezembro de 2015', 'Março de 2016', 'Abril de 2016', 'Ago./out. de 2016'])
# Tick positions are raw counts; the labels express them in millions.
ax.set_yticks([0,91285,500000,1000000,1146428,1500000,2000000,2500000,3000000,3500000,4000000])
ax.set_yticklabels(['0','0.09', '0.5', '1','1.15','1.5','2','2.5','3','3.5','4'], fontsize=10)
plt.grid(color = 'gray', linestyle = '--', axis='y', linewidth = 0.15)
plt.legend(title='Est. de manifestantes em protestos de 2015-2016', loc='upper left', fontsize=12)
plt.ylabel('Número de manifestantes (em milhões)')
ax.set_ylim([1, 3800000])

# Colour the '0.09' tick red and the '1.15' tick blue, matching the bar colours.
ax.get_yticklabels()[1].set_color("r")
ax.get_yticklabels()[4].set_color("#3c76d6")

### Gini (1960-2012)

In [None]:
# Bar chart of the Gini index with a value label above each bar and a dashed
# line at the mean of the first column.
df = pd.read_json('../json/gini.json')
# Pad the index labels with spaces and use them as categorical x positions.
df.index = [' '+x+' ' for x in df.index.astype(str).values]
deg = df.index.tolist()
# Flatten the table row by row into a single list of values.
cnt = [x for x in df.values.tolist() for x in x]
fig, ax = plt.subplots(figsize=(9.6,3.4),)

plt.bar(deg, cnt, width=0.5, color='#3c76d6')
plt.ylabel("Índice de Gini", fontsize=12)
# NOTE(review): this sets an attribute on the `plt.axes` function object; it
# does not look like it changes any font size - confirm and consider removing.
plt.axes.fontsize = 14
ax.set_xticks([d for d in deg])
ax.set_xticklabels([d for d in deg], fontsize=12)
ax.set(ylim=[0.45, 0.65])
ax.set_yticks([0.45, 0.5, 0.55, 0.5645, 0.6])
ax.set_yticklabels(['0.45', '0.50', '0.55', '0.5645', '0.60'])
plt.grid(color = 'gray', linestyle = '--', linewidth = 0.15, axis='y')
# NOTE(review): plot() is called without data here.
plt.plot()

rects = ax.patches
# Same flattening as `cnt` above; used as the text shown over each bar.
labels = [x for x in df.values.tolist() for x in x]

for rect, label in zip(rects, labels):
    # Centre the text horizontally on the bar and place it just above its top.
    x = rect.get_x() + rect.get_width() / 2
    h = rect.get_height() + 0.001
    ax.text(x, h, f'{label:.4f}', fontsize=12,
            ha='center', va='bottom')
    
x = range(df.shape[0]+10)
ax.set_xlim([-0.5, 7.5])
# Overrides the y limits set earlier in this cell.
ax.set_ylim([0.45, 0.63])
# Colour the '0.5645' tick label.
ax.get_yticklabels()[3].set_color("#3c76d6")
# Dashed horizontal line at the mean of the first column.
plt.plot([x-1 for x in x], [df.iloc[:, 0].mean() for x in x], linestyle='--', linewidth='1.25', color='#3c76d6', alpha=0.75)

### Political parties (2010-2018)

In [None]:
# Line chart with one line per column of the party table, x ticks at every index value.
df = pd.read_json('../json/party_2010_2018.json')
ax = df.plot(kind='line', figsize=(4.2,4.2), style='.-')
ax.set(xticks=df.index)
ax.set(xlim=[df.index[0], df.index[-1]])
# y tick labels formatted with one decimal and a percent sign.
fmtr = StrMethodFormatter('{x:2.1f}%')
ax.yaxis.set_major_formatter(fmtr)
ax.set_xticklabels(df.index, fontsize=10)

# Legend anchored outside the axes, to the right.
plt.legend(title='Partido', bbox_to_anchor=(1, 1.03), loc='upper left', fontsize=12)
# The second grid() call has no `axis` argument, so it applies to both axes.
plt.grid(color = 'gray', linestyle = '--', axis='y', linewidth = 0.15)
plt.grid(color = 'gray', linestyle = '--', linewidth = 0.15)
plt.ylabel("Porcetagem de deputados federais eleitos", fontsize=12)

[x.set_alpha(0.75) for x in ax.get_lines()]

plt.show()

### Engagement/interactions (2013-2018)

In [None]:
# Load the per-period interaction metrics: one DataFrame per integer key.
# The file is only needed while parsing, so it is closed right after.
with open("../json/metrics.json", "r") as j:
    m = {int(k): pd.DataFrame(v) for k, v in json.load(j).items()}

# Ratio between the total of all columns after the first and the total of the first column.
for y in m:
    print(y, m[y].iloc[:, 1:].sum().sum() / m[y].iloc[:, 0].sum().sum())

# Column totals per period, converted to a percentage of each period's total.
df_ = pd.DataFrame({k: v.sum().to_dict() for k, v in m.items()}).T.apply(lambda x: x*100/sum(x), axis=1)
# Raw string: "\i" is not a valid escape sequence in a regular string literal.
df_.index = [r"$\it{T}$$_%s$ (%s)" % (i,k) for i,k in enumerate(df_.index)]
ax = df_.plot(kind="bar", figsize=(7,3.04), stacked=True, width=0.54)
# Integer percentage with "." as thousands separator.
fmtr = FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.')+'%')
ax.yaxis.set_major_formatter(fmtr)
#ax.set_xticklabels(df.index, fontsize=10)
ax.set_ylabel("Total de interações por tipo (%)", fontsize=12)
ax.set_ylim([0, 100])
# plt.legend(title='Interações', title_fontsize=12, 
#             bbox_to_anchor=(1,1.027), 
#             loc='upper left',
#             fontsize=12, ncol=1, columnspacing=1)
# Single-row legend placed below the axes.
plt.legend(ncol=5, bbox_to_anchor=(0.434,-.322), loc='lower center', columnspacing=.1, fontsize=12)
# plt.legend(title_fontsize=12, bbox_to_anchor=(0.5, -0.4), loc='lower center', fontsize=12, columnspacing=1, ncol=2)
#plt.grid(color = 'gray', linestyle = '--', axis='y', linewidth = 0.15)
#plt.grid(color = 'gray', linestyle = '--', linewidth = 0.15)
plt.grid(color='gray', linestyle='--', linewidth=0.35)
plt.xticks(rotation='horizontal')

# Displayed output: the sum of all column totals for each period.
pd.DataFrame({k: v.sum().to_dict() for k, v in m.items()}).sum()


### Activity per period

In [None]:
# Thousands separator rendered with "." instead of ",".
fmtr = FuncFormatter(lambda x, p: format(int(x), ',').replace(',', '.'))
# matplotlib.rc('text', usetex = True)

# One figure per row: (left year, left y-limits), (right year, right y-limits),
# plus extra keyword arguments for the left panel only.
figure_rows = [
    ((2013, [0, 500000]), (2014, [0, 40000]), {}),
    ((2015, [0, 350000]), (2016, [0, 300000]), {}),
    ((2017, [0, 200000]), (2018, [0, 500000]), {"legend": True}),
]

for (left_year, left_ylim), (right_year, right_ylim), left_kwargs in figure_rows:
    fig, ax = plt.subplots(figsize=(10, 2.5))
    plotstats(m, 121, left_year, left_ylim, fmtr, **left_kwargs)
    # The y label is only set after drawing the left-hand panel.
    plt.ylabel('Total de interações')
    plotstats(m, 122, right_year, right_ylim, fmtr)

# plt.legend(title_fontsize=12, bbox_to_anchor=(0.5, -0.4),
#             loc='lower center', fontsize=12, columnspacing=1, ncol=2)


### Tweet source per period

In [None]:
# def plots(d, y=None, key=None, kind="barh"):
#     for y in (y if type(y) == list else [y] if type(y) == str else d.keys()):
#         fig = plt.figure()
#         ax = (d[y][key] if key else d[y]).value_counts(ascending=False)[:15].sort_values().plot(kind=kind)
#         ax.set_title(f"{y}")
#         ax.plot()
# plots(source, [2013,2014,2015,2016,2017,2018], "source")

___

### Comparing artificial network models

Quick comparison between the Erdős–Rényi (random), Watts–Strogatz (small-world) and Barabási–Albert (scale-free) models.

In [None]:
# Random graph with 50 nodes and edge probability 0.1, drawn in red together
# with its histogram.
model_color = 'r'
G = nx.erdos_renyi_graph(n=50, p=0.1, seed=20)
f1 = nx_plot(G, **params, title='a) Erdös-Rényi (p=0.1)', colors=model_color)
graph_histogram(G, color=model_color)


In [None]:
# Watts-Strogatz graph with 50 nodes, k=4 and rewiring probability 0.1,
# drawn in blue together with its histogram.
model_color = '#3c76d6'
G = nx.watts_strogatz_graph(n=50, k=4, p=0.1, seed=85)
f2 = nx_plot(G, **params, title='b) Watts-Strogatz (k=4, p=0.1)', colors=model_color)
graph_histogram(G, color=model_color)

In [None]:
# Barabási-Albert graph with 50 nodes and m=1, drawn in green together with
# its histogram.
model_color = 'g'
G = nx.barabasi_albert_graph(n=50, m=1, seed=333)
f3 = nx_plot(G, **params, title='c) Barabási-Albert (m=1)', colors=model_color)
graph_histogram(G, color=model_color)

___

### Graph building

In [None]:
# Run extract_graph() on every "retweets_*.json" file found in ./full.
for f in os.listdir("full"):
    if not (f.startswith("retweets_") and f.endswith(".json")):
        continue
    extract_graph(f"full/{f}")

#### Build main graphs

In [None]:
# Directory holding the "*_full.csv" edge lists and their "*_nodes.csv" tables.
# Defined here so the cell does not depend on a later cell having run first,
# and so the listing and the reads point at the same directory.
path = "graphs"

# Build one directed, weighted graph per edge list (reverse-sorted file order),
# attach the node attributes and write it out as GEXF.
for f in reversed(sorted(os.listdir(path))):
    if f.endswith("_full.csv"):
        print(f)
        G = gk.graph(f"{path}/{f}", source_attr="source", target_attr="target", directed=True, weights=True)
        # The node table shares the edge list's name with "_full" replaced by "_nodes".
        G = gk.nx_set_node_attrs(G, pd.read_csv(f"{path}/{f}".replace("_full", "_nodes"), index_col="id"))
        # Output is written relative to the working directory, named "<file>.gexf".
        gk.nx_write_graph(G, f"{f}.gexf")

#### Build daily graphs

In [None]:
# for f in reversed(sorted(os.listdir(path))):
#     if f.endswith("_nodes.csv"):
#         year = f[15:19]
#         nodes = pd.read_csv(f"{path}/{f}", index_col="id")
#         edges = pd.read_csv(f"{path}/{f}".replace("_nodes","_edges"))
#         for x in sorted(os.listdir(f"{path}/{year}")):
#             if x.endswith("_nodes.csv"):
#                 print(x)
#                 nodes_daily = pd.read_csv(f"{path}/{year}/{x}", index_col="id")
#                 nodes_daily["leiden"] = nodes.loc[nodes_daily.index, "leiden"].astype(str)
#                 G = gk.graph(f"{path}/{year}/{x}".replace("_nodes","_full"), source_attr="source", target_attr="target", directed=True, weights=True)
#                 G = gk.nx_set_node_attrs(G, nodes_daily)
#                 gk.nx_write_graph(G, f"{path}/{year}/{x}.gml".replace("_nodes.csv", ""))

#### Build filtered graphs (by in-degree)

In [None]:
# for f in reversed(sorted(os.listdir(path))):
#     if f.startswith("graph_"):
#         year = f.split("_")[1].split(".")[0]
#         G = gk.graph(f"graphs/{f}")
#         nodes = gk.nodes(G)
#         in_deg = int(nodes["in_degree"].mean())+1
#         G_ = gk.subgraph(G, nodes[nodes["in_degree"] >= in_deg].index)
#         print(year, G_.order(), G_.size())
#         gk.nx_write_graph(G_, f"graphs/filtered_{year}_indeg={in_deg}.gexf")

___

### Graph overview

#### Load nodes, edges, centrality

In [None]:
# Load, per year, the centrality table and the source/target columns of the
# edge list from the files in `path`.
path = "graphs"

nodes = {} # yearly centrality tables, indexed by node label
nodes_ = {} # yearly empty frames that only carry the node labels as index
source = {} # yearly frames indexed by the edge "source" column
target = {} # yearly frames indexed by the edge "target" column

for f in sorted(os.listdir(path)):
    if not f.startswith(("centrality_", "edges_")):
        continue
    # Parse the year from *this* file name. Sorted order lists every
    # "centrality_*" file before any "edges_*" file, so a year carried over
    # from the centrality branch would file all edge tables under the last year.
    year = int(f.split("_")[1].replace(".csv", ""))
    if f.startswith("centrality_"):
        nodes[year] = pd.read_csv(f"{path}/{f}", index_col="label")
        nodes_[year] = pd.DataFrame(index=nodes[year].index.values.tolist())
        nodes[year].index.name = 'id'
    else:
        edges_file = f"{path}/{f}".replace("_nodes", "_edges")
        source[year] = pd.read_csv(edges_file, index_col="source", usecols=["source"]).dropna()
        target[year] = pd.read_csv(edges_file, index_col="target", usecols=["target"]).dropna()

# Number of yearly tables in which each node label occurs. value_counts()
# already returns a Series, so no column lookup (whose name differs between
# pandas versions) is needed.
all_nodes = pd.concat([pd.Series(x.index) for x in nodes_.values()]).value_counts()

#### Compare source and target nodes

> Ao todo, foram observados 604.540 atores; destes, 549.717 (90,93%) foram constatados compartilhando publicações realizadas por outros perfis na rede, para um total de 140.686 (23,27%) membros sendo republicados em todos os intervalos observados. Apenas 85.863 (14,2%) do total foi observado realizando as duas ações, isto é, tanto compartilhando conteúdo quanto tendo seu conteúdo compartilhado.

In [None]:
def _unique_labels(tables):
    """Return the de-duplicated index labels found across all tables of a mapping."""
    stacked = pd.concat([pd.Series(table.index) for table in tables.values()])
    return pd.Index(stacked).drop_duplicates()


all_nodes = _unique_labels(nodes)
all_sources = _unique_labels(source)
all_targets = _unique_labels(target)

# Displayed output: totals and the size of the source/target overlap.
f"{all_nodes.shape[0]} nodes ({all_sources.shape[0]} [source] => {all_targets.shape[0]} [target]) (source & target amounts to {all_sources.intersection(all_targets).shape[0]} nodes)"

#### Histograms

In [None]:
def hist_gram(nodes=nodes):
    """Plot log-log in-degree and out-degree frequency curves per period.

    Two figures are created. Each one holds three panels in the top row of a
    2x3 subplot grid: 2013-2015 in the first figure, 2016-2018 in the second.
    The first panel of each figure carries the y label and the 2017 panel
    carries the legend.

    Args:
        nodes: Mapping from year (2013 to 2018) to a table with integer
            ``in_degree`` and ``out_degree`` columns. The default is the
            module-level ``nodes`` object as bound when this function is
            defined.
    """
    years = (2013, 2014, 2015, 2016, 2017, 2018)
    legend_year = 2017
    # (column, line style, legend label) for the two curves of every panel.
    curves = (
        ("in_degree", 'b-', 'Grau de entrada'),
        ("out_degree", 'g-', 'Grau de saída'),
    )

    def hist_freq(degrees):
        """Return a list whose entry ``d`` counts the values equal to ``d``."""
        values = degrees.values.tolist()
        freq = [0] * (max(values) + 1)
        for d in values:
            freq[d] += 1
        return freq

    def plot_panel(position, period, year):
        """Draw both degree curves for ``year`` in subplot ``position``."""
        plt.subplot(2, 3, position)
        for column, style, label in curves:
            freq = hist_freq(nodes[year][column])
            plt.loglog(range(len(freq)), freq, style, label=label)
        # Raw string: "\i" is not a valid escape sequence in a regular literal.
        plt.title(r"$\it{T}$$_%s$ (%s)" % (period, year))

    for first_period in (0, 3):
        plt.subplots(figsize=(9.5, 6))
        # plt.figure(figsize=(9.5, 6))
        for offset in range(3):
            period = first_period + offset
            year = years[period]
            plot_panel(offset + 1, period, year)
            if offset == 0:
                plt.ylabel('Frequência (número de nós)')
            if year == legend_year:
                plt.legend(title_fontsize=12, bbox_to_anchor=(0.5, -0.4), loc='lower center', fontsize=12, columnspacing=1, ncol=2,)

hist_gram()


#### Correlation matrix (Jaccard indices)

In [None]:
def gen_user_corr(dct, **kwargs):
    """Plot a heatmap of index overlap between tables and return the raw counts.

    For every pair of entries in ``dct`` the number of distinct index labels
    present in both tables is counted. The heatmap shows those counts with
    each row divided by the maximum of the matching column, rounded to two
    decimals.

    NOTE(review): the notebook section calls these Jaccard indices, but the
    normalisation here divides by a column maximum rather than by the size of
    the union - confirm the intended measure.

    Args:
        dct: Mapping from period key to a table whose index holds node labels.
        **kwargs: Forwarded to ``heatmap``.

    Returns:
        The table of raw overlap counts, cast to strings.
    """
    d1 = pd.DataFrame({
        key: {
            k: df.index.intersection(dct[k].index).drop_duplicates().shape[0] for k in dct}
        for key, df in dct.items()
    })
    # d1.max() is indexed by column key; axis=0 aligns it with the row labels.
    d2 = d1.divide(d1.max(), axis=0)
    sns.set(font_scale=1.0)
    sns.set(font="Times New Roman")
    # Sample figsize in inches
    fig, ax = plt.subplots(figsize=(5.3, 4.3))
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        item.set_fontsize(12)
    # NOTE(review): `heatmap` is not defined in this cell; presumably it comes
    # from the star import of notebook_functions - confirm.
    heatmap(d2.round(2), annot=True, center=.5,  linewidths=0, ax=ax, **kwargs)
    # Raw strings: "\i" is not a valid escape sequence in a regular literal.
    ax.set_xticklabels([r"$\it{T}$$_%s$" % i for i in range(6)], fontsize=14)
    ax.set_yticklabels([r"$\it{T}$$_%s$" % i for i in range(6)], fontsize=14)
    plt.show()

    # .applymap(lambda x: x) + d2.multiply(100).round(2).applymap(lambda x: f' ({x}%)'.replace('.0%','%').replace(".",",")).astype(str)
    return d1.astype(str)


gen_user_corr(nodes)


#### General statistics

In [None]:
# Load every "graph_*" file in `path` and keep, per year, the graph, its node
# table and its "leiden" column re-indexed by the graph's node ids.
graphs, leiden, nodes = {}, {}, {}

for f in sorted(os.listdir(path)):
    p = f"{path}/{f}"
    if not f.startswith("graph_"):
        continue
    year = int(f.split("_")[1].split(".")[0])
    G = gk.graph(p)
    node_table = gk.nodes(G)
    graphs[year] = G
    nodes[year] = node_table
    leiden[year] = node_table["leiden"]
    leiden[year].index = G.nodes()

# "mean ± std" summary string for every node attribute, one column per year.
# NOTE: the name `stats` also refers to scipy.stats in the import cell; this
# assignment rebinds it to the summary table.
summary = {}
for k in sorted(graphs.keys()):
    described = gk.nodes(graphs[k]).describe().round(2).loc[['mean','std']].astype(str)
    summary[k] = {
        c: f"{described.loc['mean',c]} ± {described.loc['std',c]}" for c in described.columns
    }
stats = pd.DataFrame(summary)
# Displayed output: the summary without the community-label rows.
stats.drop(["leiden", "louvain"])

##### Density

In [None]:
# Density of each yearly graph, printed in ascending year order.
density = {}
for year, G in sorted(graphs.items()):
    value = nx.density(G)
    density[year] = value
    dens = str(value)
    # Print only the first four and the last four characters of the number's text.
    print(year, dens[:4] + dens[-4:])

##### Coreness / k-cores

In [None]:
def remove_self_loops(G):
    """Strip every self-loop edge from ``G`` in place and return ``G``."""
    loops = list(nx.selfloop_edges(G))
    G.remove_edges_from(loops)
    return G
# Each `stats` cell is a "mean ± std" string. Parse the whole mean before
# flooring: indexing the first character (`x[0]`) turns e.g. "12.4 ± 3.1"
# into 1 instead of 12.
k_cores = stats.loc["degree"].apply(lambda cell: int(float(cell.split()[0])) + 1).to_dict()
# k-core of each yearly graph with k = floor(mean degree) + 1.
# `remove_self_loops` edits the graphs stored in `graphs` in place.
k_graphs = {y: nx.k_core(remove_self_loops(graphs[y]), k) for y, k in k_cores.items()}
print("order:", [x.order() for x in k_graphs.values()])
k_cores

___

### Graph communities

In [None]:
path = "graphs"

# Year-keyed graph, node table and Leiden labels, filled from `path`.
graphs, leiden, nodes = {}, {}, {}

for entry in sorted(os.listdir(path)):
    if entry.startswith("graph_"):
        # "graph_<year>.<ext>" -> <year>
        year = int(entry.split("_")[1].split(".")[0])
        G = gk.graph(f"{path}/{entry}")
        node_table = gk.nodes(G)
        graphs[year] = G
        nodes[year] = node_table
        leiden[year] = node_table["leiden"]
        # Re-label the membership series with the graph's own node ids.
        leiden[year].index = G.nodes()

#### Leiden modules/communities

In [None]:
# import leidenalg as la

# modules = {}
# results = {}

# for y in sorted(graphs):
#     iG = gk.nx2ig(graphs[y])
#     mod = la.find_partition(iG, la.ModularityVertexPartition, seed=0, n_iterations=10)
    
#     communities = max(mod.membership)+1 if mod.membership else 0
#     modularity = mod.quality()

#     print(y)
#     print(f'Communities (Leiden): {communities} (m={modularity:.3f})')

#     modules[y] = pd.Series(
#         pd.to_numeric(mod.membership, downcast='integer'),
#         name='leiden_partition',
#     )
#     results[y] = mod

#### Leiden temporal communities

> Instead of computing the communities considering each temporal graph as above, let's consider all periods at once with an `interslice_weight` of `1.0`.

In [None]:
# One igraph graph per year, mirroring the graphs held in `graphs`.
igraphs = {}

for y in sorted(graphs):
    G = graphs[y]
    n_ = gk.nodes(G)
    e_ = gk.edges(G)
    vertex_ids = n_.index.tolist()

    iG = ig.Graph(directed=G.is_directed())
    # Vertices are created from the node-table index and also carry it as the
    # "id" attribute.
    iG.add_vertices(vertex_ids)
    iG.vs['id'] = vertex_ids

    edge_pairs = e_[["source", "target"]].values.tolist()
    iG.add_edges(edge_pairs)
    iG.es["weight"] = e_["weight"]

    igraphs[y] = iG

In [None]:
# All yearly slices are partitioned jointly, in the order `igraphs` holds them.
temporal_slices = list(igraphs.values())
membership, improvement = la.find_partition_temporal(
    temporal_slices,
    la.ModularityVertexPartition,
    interslice_weight=1,
    n_iterations=10,
    seed=0,
)

In [None]:
# `membership[i]` belongs to the i-th slice handed to the temporal partition,
# i.e. the i-th entry of `igraphs`. Enumerate `igraphs` (not `graphs`) so each
# membership list is paired with the year it was computed for, even if the two
# dicts were filled in a different order.
# NOTE(review): the index assumes `graphs[year].nodes()` iterates in the same
# order as the igraph vertices (built from `gk.nodes(G).index`) -- confirm.
leiden = {
    year: pd.Series(
        membership[i],
        index=graphs[year].nodes(),
        name='leiden',
    ).astype(int)
    for i, year in enumerate(igraphs)
}

#### Plot top communities over time (order)

In [None]:
# leiden_ = pd.DataFrame(leiden)
# leiden_ = {y: leiden[y].value_counts().divide(leiden[y].value_counts().sum()) for y in leiden}

In [None]:
# Share of profiles per community and year, drawn as a stacked bar chart.
leiden_ = {}
# Union of every year's five largest communities, and the largest label seen.
top_leiden = pd.concat(pd.Series(df.value_counts().index[:5]) for df in leiden.values()).unique().tolist()
max_leiden = pd.concat(pd.Series(df.unique()) for df in leiden.values()).unique().max()

for y, m in leiden.items():
    # Count once per year instead of re-running value_counts() for every term.
    counts = m.value_counts()
    total = counts.sum()
    # errors="ignore": a top community may have no members in a given year.
    others = counts.drop(top_leiden, errors="ignore")
    # Raw strings keep the mathtext `\it` without an invalid Python escape.
    # Community 38 is displayed as C7; every other label is shifted by one.
    shares = {
        r"$\it{C}$$_{%s}$" % (c + 1 if c != 38 else 7): counts.loc[c] / total
        for c in counts.index.intersection(top_leiden)
    }
    # Remaining communities are pooled: the next 18 by size, then the rest.
    shares[r"$\it{C}$$_{8}$ - $\it{C}$$_{25}$"] = others.iloc[:18].sum() / total
    shares[r"$\it{C}$$_{26}$ - $\it{C}$$_{n}$"] = others.iloc[18:].sum() / total
    leiden_[y] = shares
leiden_ = pd.DataFrame({r"$\it{T}$$_{%s}$ (%s)" % (i, k): leiden_[k] for i, k in enumerate(leiden_)})

# DataFrame.plot opens its own figure, so no separate plt.figure() is needed
# (it only left an empty figure behind).
ax = leiden_.T.plot(kind="bar", stacked=True, figsize=(7.6, 3.6), width=0.56, fontsize=12)
# Pin limits and tick positions before labelling so the six percentage labels
# cannot drift away from the ticks they describe.
ax.set_ylim([0, 1])
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
legend = plt.legend(ncol=1, loc='upper right', title=r'Comunidade ($\it{C}$)', bbox_to_anchor=(1.275, 1.04), columnspacing=1.4, fontsize=13, title_fontsize=12)
plt.grid(color='gray', linestyle='--', linewidth=0.35)
plt.xticks(rotation='horizontal')
# ax.tick_params(axis='y', colors='blue')

# Grey out the two pooled groups, both in the bars and in the legend.
for bar in ax.containers[-2]:
    bar.set_facecolor("#bbb")
for bar in ax.containers[-1]:
    bar.set_facecolor("#ddd")
# `legendHandles` was renamed `legend_handles` in newer Matplotlib releases.
handles = legend.legend_handles if hasattr(legend, "legend_handles") else legend.legendHandles
handles[-2].set_facecolor('#bbb')
handles[-1].set_facecolor('#ddd')
ax.set_ylabel("Total de perfis por comunidade (%)", fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=12)

# for _, __ in leiden.items():
#     print(_, [x+1 if x != 38 else 7 for x in __.value_counts().index[:5]])
leiden_

#### List top communities' most relevant nodes over time

In [None]:
# For every year and top community: member count, summed degree and the ten
# highest-degree members.
for year in leiden:
    print(year)
    G = graphs[year]
    df = leiden[year]
    cent = gk.compute(G, "degree")
    for c in top_leiden:
        c_ = cent.loc[df[df == c].index]
        # Same display rule as the community plot: 38 is shown as C7, anything
        # else as label + 1. The former str.replace("38", "6") trick also
        # rewrote unrelated labels that merely contain "38" (e.g. 138, 380).
        shown = c + 1 if c != 38 else 7
        print(shown, c_.shape[0], 'nós', '\t', c_.sum().values[0], 'arestas',
              c_["degree"].sort_values(ascending=False).index[:10].tolist())
    print()

#### Compare Louvain and Leiden communities

In [None]:
# for y in nodes:
#     fig = plt.figure()
#     leiden = nodes[y]['leiden'].value_counts().values
#     louvain = nodes[y]['louvain'].value_counts().values
#     pd.DataFrame({"leiden": leiden[:100], "louvain": louvain[:100]}).plot(figsize=(14,6))

#### Community significance with [qstest](https://github.com/skojaku/qstest)

> Kojaku, S. and Masuda, N. "A generalised significance test for individual communities in networks". Sci. Rep. 8, 7351 (2018)

In [None]:
# qsresults = qstest(network, communities, qfunc=, sfunc=, cdalgorithm=, num_of_rand_net=500, alpha=0.05, num_of_thread=4)

##### Assign node colors (per community)

In [None]:
# Write one "id,color" CSV per year; labels missing from COLORS_COMMUNITIES
# fall back to grey.
for y in range(2013, 2019):
    filtered_nodes = gk.nodes(gk.graph(f"graphs/f/filtered_{y}.gexf"))
    c = filtered_nodes["leiden"].apply(lambda x: COLORS_COMMUNITIES.get(int(x), "#aaaaaa"))
    c = c.rename("color").rename_axis("id")
    c.to_csv(f"graphs/colors_{y}.csv")

___

### Plot graphs from C(n), n=7 communities

> Restrict temporal graph plotting to the nodes within top 7 Leiden communities previously identified, with a minimum node in-degree equivalent to the mean.

In [None]:
path = "graphs"

# Year-keyed filtered graph, in-degree threshold (read from the file name)
# and node positions.
graphs, indeg, pos = {}, {}, {}

for f in sorted(listdir(path)):
    if not (f.startswith("filtered_") and f.endswith(".gexf")):
        continue
    year = int(f.split("_")[1].split(".")[0])
    graphs[year] = gk.graph(f"{path}/{f}")
    # The threshold is whatever follows "=" in the file name, up to the next dot.
    indeg[year] = int(f.split("=")[1].split(".")[0])

# Positions are read from JSON files in the working directory and transposed.
for k in range(2013, 2019):
    with open(f"positions_{k}.json", "r") as j:
        layout = json.load(j)
    pos[k] = pd.DataFrame(layout).T

#### Using DataShader (graph drawing), KDEEB (kernel-density edge estimation bundling), ForceAtlas2 (node positioning)

In [None]:
# Render one bundled-edge figure per yearly graph, then lay them out together.
dsplot = DataShaderPlot()

# Figures returned by `ds_plot`, in `graphs` iteration order.
figs = []

# Node and edge counts of each plotted subgraph, keyed by year.
order = {}
size = {}

for year, G in graphs.items():
    # Keep only the nodes that have a stored position for this year.
    G = gk.subgraph(G, pos[year].index)
    # G = gk.subgraph(G, list(nx.connected_components(G.to_undirected()))[0])
    # NOTE(review): this rebinds the notebook-level name `nodes`, which other
    # cells use as a year -> node-table dict -- confirm that is intended.
    nodes = gk.nodes(G)
    # The Leiden label is stored on each node as attribute "cat", the
    # attribute name passed to `ds_plot` below.
    cat = nodes["leiden"].astype("category")
    nx.set_node_attributes(G, cat, 'cat')
    
    # G = gk.subgraph(G, list(nx.connected_components(G.to_undirected()))[0]) # <-- first component only
    # nodes = gk.nodes(G)
    # `idx` is only consumed by the commented-out layout code below.
    idx = nodes.sort_values(["leiden", "page_rank"], ascending=[True, False]).index

    # pos = gk.circular_layout(idx)
    # pos = gk.forceatlas2_layout(G, pos=pos, iterations=100, linlog=False, nohubs=False, seed=0)
    # pos = {node:[pos['x'][i],pos['y'][i]] for i,node in enumerate(idx)}
    # Label: year, node count, edge count and the in-degree threshold.
    name = f"{year}, n={G.order()}, E={G.size()}, d={indeg[year]}"
    
    order[year] = G.order()
    size[year] = G.size()
    
    print("Plotting", name)
    # NOTE(review): `output` presumably makes `ds_plot` write "fig_<year>.png",
    # which the titling cell reads back -- confirm against DataShaderPlot.
    fig = dsplot.ds_plot(
        G,
        pos=pos[year],
        cat="cat",
        method="bundle",
        bw=.05,
        name=name,
        output=f"fig_{year}",
        kwargs=dict(plot_height=768, plot_width=768),
    )
    figs.append(fig)
    
# NOTE(review): the second argument is presumably the number of columns -- confirm.
dsplot.tf_plot(figs, 3)

#### Export figures to images (again)

In [None]:
# g = [2013, 2014, 2015, 2016, 2017, 2018]
# for i, graph in enumerate(figs):
#     ds.utils.export_image(img=graph, filename=f"fig_{g[i]}", fmt='.png', background='white')

#### Add title to figures using Matplotlib

In [None]:
# Hard-coded pair of numbers per year shown in each figure title.
# NOTE(review): presumably [node count, edge count] of the plotted graph -- confirm.
titles = {
    2013: [12736, 73688],
    2014: [1347, 4401],
    2015: [4525, 40369],
    2016: [4281, 34384],
    2017: [1651, 6574],
    2018: [1496, 7217],
}

axs = []

# Re-open each exported image, add a mathtext title and save a larger copy.
for i, year in enumerate(titles):
    n_nodes, n_edges = titles[year]
    fig, ax = plt.subplots(figsize=(7, 7)) # 5, 4
    # fig.subplots_adjust(top=0.85, bottom=0.15, left=0.2, right=0.5, hspace=0.8)
    img = mpimg.imread(f'fig_{year}.png')
    ax.imshow(img, interpolation='none', vmin=0, vmax=1, aspect='equal')
    ax.axis('off')
    # `grid(b=None)` toggled the grid and its `b` keyword no longer exists in
    # newer Matplotlib; switch the grid off explicitly instead.
    ax.grid(False)
    # fig.patch.set_linewidth(0.1)
    # fig.patch.set_edgecolor('black')
    # The values were previously shifted by one slot: E received the node
    # count (`order[year]`) and d the edge count (`size[year]`). E now uses the
    # second entry of `titles` and d the in-degree threshold, matching the
    # "n=, E=, d=" label built when the figures were plotted.
    ax.set_title(
        r"$\it{T}$$_%s$ (%s): $\it{n}$=%s, $\it{E}$=%s, $\it{d}$=%s"
        % (i, year, n_nodes, n_edges, indeg[year]),
        fontdict={'fontsize': 9},
    )
    fig.set_facecolor("w")
    fig.savefig(f"graph_{year}_large.png", bbox_inches="tight")
    # plt.show()

___

### Text extraction and cleanup

#### Extract and load data from text + users

In [None]:
# path = "../data/full"
# columns = ["id_str", "user.screen_name", "retweeted_status.user.screen_name", "retweeted_status.full_text", "retweeted_status.id_str", "user.verified", "retweeted_status.user.verified"]
# new_columns = ["id", "source", "target", "text", "rt_id", "source_verified", "target_verified"]

# for x in reversed(sorted(os.listdir(path))):
#     if x.startswith("retweets_"):
#         year = x.split("_")[1].split("-")[0]
#         with open(f"{path}/{x}", "r") as f:
#             df = pd.DataFrame([json.loads(x) for x in f.readlines()])
#             df["user.screen_name"]                  = df["user"].apply(lambda x: x["screen_name"])
#             df["user.verified"]                     = df["user"].apply(lambda x: x["verified"])

#             df["retweeted_status.id_str"]           = df["retweeted_status"].apply(lambda x: x["id_str"])
#             df["retweeted_status.full_text"]        = df["retweeted_status"].apply(lambda x: x["full_text"])
#             df["retweeted_status.user.screen_name"] = df["retweeted_status"].apply(lambda x: x["user"]["screen_name"])
#             df["retweeted_status.user.verified"]    = df["retweeted_status"].apply(lambda x: x["user"]["verified"])

#             df = df[columns]
#             df.columns = new_columns
#             df.to_csv("text/text_2013.csv", index=False)

#### ...or load already extracted text data from files

In [None]:
# Load every extracted text table from "text/", keyed by the year parsed from
# its file name ("<prefix>_<year>.<ext>"). All columns are read as strings.
dfs = {}
for x in sorted(os.listdir("text")):
    year = int(x.split("_")[1].split(".")[0])
    # os.listdir yields bare file names, so the directory has to be joined
    # back on; reading `x` alone looks in the working directory instead.
    dfs[year] = pd.read_csv(os.path.join("text", x), index_col="id", dtype=str)

#### Extract tokens from text

In [None]:
# Tokenize every tweet text and store the space-joined tokens per year.
# Make sure the output directory exists before writing into it.
os.makedirs("tokens", exist_ok=True)

# Build the tokenizer (and load the stopword list) once rather than once per
# row. Assumes `tokenize` keeps no state between calls -- TODO confirm.
pt_tokenizer = Tokenizer(stop_words=nltk.corpus.stopwords.words('portuguese'))

tokens = {}
for year in dfs:
    # Line breaks are flattened to spaces before tokenizing.
    tokens[year] = dfs[year]["text"].astype(str).apply(lambda x: pt_tokenizer.tokenize(x.replace("\n", " ")))
    tokens[year].apply(" ".join).to_csv(f"tokens/tokens_{year}.csv")

#### Compute tf-idf (`term frequency * inverse document frequency`)

In [None]:
# `reduce` is not a builtin in Python 3 and is not imported by name elsewhere.
from functools import reduce

# Keywords excluded from the feature tables below.
search_keywords = [
    "manifestacao",
    "manifestacoes",
    "manifestante",
    "manifestantes",
    "protesto",
    "protestos",
]


def _normalise(scores):
    """Divide scores by their maximum and discard zero and missing entries."""
    return scores.divide(scores.max()).apply(lambda x: None if x == 0 else x).dropna()


# tf-idf scores per year. errors="ignore" keeps a year that lacks one of the
# keywords from raising, as the per-community drop already allowed.
features = {
    year: get_features(df["tokens"], tfidf=True).drop(search_keywords, errors="ignore")
    for year, df in dfs.items()
}

# Scores summed over all years (computed before the per-year normalisation),
# then scaled by the maximum and sorted.
all_features = reduce(lambda x, y: x.add(y, fill_value=0), features.values())
all_features = all_features.divide(all_features.max()).sort_values(ascending=False)

features = {year: _normalise(scores) for year, scores in features.items()}

# Same per Leiden community within each year. The inner loop variable has its
# own name so it no longer shadows the year's frame.
community_features = {
    year: {
        community: scores.drop(search_keywords, errors="ignore")
        for community, scores in
            get_features(df["tokens"], groupby=df["leiden"], tfidf=True).items()
    }
    for year, df in dfs.items()
}
community_features = {
    year: {community: _normalise(scores) for community, scores in groups.items()}
    for year, groups in community_features.items()
}

# Raw token counts per year; the table is cut to the 100 most frequent tokens
# among those that occur in every year, and the first 30 rows are displayed.
tf = pd.DataFrame({k: d["tokens"].str.split().explode().value_counts() for k, d in dfs.items()})
tf = tf.loc[tf.sum(axis=1).sort_values(ascending=False).index]
tf = tf.loc[tf.dropna().sum(axis=1).sort_values(ascending=False).index[:100]]
tf[:30]

##### Plot years correlation

In [None]:
# N = 100

# for Y in range(2013,2019):
#     py.iplot(
#         go.Figure(
#             data=go.Scatter(
#                 mode="markers",
#                 x=all_features.loc[features[Y].index].values.tolist()[:N],
#                 y=features[Y].values.tolist()[:N],
#                 text=features[Y].index.tolist()[:N],
#             ),
#             layout=go.Layout(    
#                 title=f"{Y}",
#             )
#         )
#     )

#### Plot communities correlation

In [None]:
# Y = 2013
# N = 100

# for C in range(0,7):
#     py.iplot(
#         go.Figure(
#             data=go.Scatter(
#                 mode="markers",
#                 x=features[Y].loc[community_features[Y][C].index].values.tolist()[:N],
#                 y=community_features[Y][C].values.tolist()[:N],
#                 text=community_features[Y][C].index.tolist()[:N],
#             ),
#             layout=go.Layout(    
#                 title=f"{Y}: Comunidade {C}",
#             )
#         )
#     )

#### Tweets from target nodes (retweeted users) within top Leiden communities (Cn, n<7)

Consider only the top 7 communities previously identified by the Leiden algorithm.

In [None]:
# Leiden label per node id, restricted to labels 0-6, read from the yearly
# "centrality_*" tables.
leiden = {}
for x in os.listdir("graphs"):
    if not x.startswith("centrality_"):
        continue
    year = int(x.split("_")[1].split(".")[0])
    labels = pd.read_csv(f"graphs/{x}", index_col="id")["leiden"]
    leiden[year] = labels[labels.isin([0, 1, 2, 3, 4, 5, 6])]

# Tag each tweet with the label of its target; targets without a retained
# label get None.
for year, frame in dfs.items():
    lookup = leiden[year]
    frame["leiden"] = frame["target"].apply(lambda user: lookup.get(user, None))
    tagged = frame["leiden"].dropna().shape[0]
    print(year, "=>", frame.shape[0], "tweets =>", tagged, "within top 7 communities")

#### Check verified vs. non-verified tweets & users

In [None]:
# Per year: distinct retweeted tweets, then distinct target users, split by
# the target's verified flag. A blank line separates the two listings.
for position, (column, label) in enumerate((
    ("rt_id", "non-verified unique tweets"),
    ("target", "non-verified unique users"),
)):
    if position:
        print()
    for year in range(2013, 2019):
        verified = dfs[year].query("target_verified == 'True'")[column].unique()
        unverified = dfs[year].query("target_verified == 'False'")[column].unique()
        print(year, verified.shape[0], "verified vs.", unverified.shape[0], label)

##### ...same thing, now per community

In [None]:
# All years stacked into one frame.
df = pd.concat(dfs.values())

# Same verified / non-verified split, now per community label 0-6 (shown as
# C1-C7): tweets first, a blank line, then users.
for column, label in (
    ("rt_id", "non-verified unique tweets"),
    ("target", "non-verified unique users"),
):
    if column == "target":
        print()
    for n in range(7):
        members = df[df["leiden"] == n]
        n_verified = members.query("target_verified == 'True'")[column].unique().shape[0]
        n_unverified = members.query("target_verified == 'False'")[column].unique().shape[0]
        print(f"C{n+1}", n_verified, "verified vs.", n_unverified, label)


___

### Text and hyperlinks (unshortened)

#### Tweets and links published from top communities

In [None]:
!mkdir -p retweets urls

# Yearly retweet tables, and the retweets of each of the seven communities.
retweets = {}
leiden_rts = {}

for x in range(2013, 2019):
    retweets[x] = pd.read_csv(f"../retweets/retweets_{x}.csv", dtype=str)
    # Everything is read as str; turn labels such as "3.0" into ints and leave
    # non-string cells (missing values) untouched.
    retweets[x]["leiden"] = retweets[x]["leiden"].apply(lambda v: int(float(v)) if isinstance(v, str) else v)

for c in range(7):
    leiden_rts[c] = pd.concat([_[_["leiden"] == c] for _ in retweets.values()])
# From here on `retweets` is a single frame covering all years.
retweets = pd.concat(retweets.values())

# links = {}
# links_ = {}
# for c in range(7):
#     links[c] = {}
#     links_[c] = pd.concat([_[_["leiden"] == c] for _ in urls.values()])["full_url"].explode().value_counts().to_dict()
#     for y in [2013, 2014, 2015, 2016, 2017, 2018]:
#         links[c][y] = urls[y][urls[y]["leiden"] == c]["full_url"].explode().value_counts().to_dict()

# with open("../json/links.json", "w") as j:
#     json.dump(links, j)
# JSON object keys are strings; restore the integer community keys.
with open("../json/links.json", "r") as j:
    links = {int(k): v for k, v in json.load(j).items()}

# with open("../json/links_.json", "w") as j:
#     json.dump(links_, j)
with open("../json/links_.json", "r") as j:
    links_ = {int(k): v for k, v in json.load(j).items()}

# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the one-column
# frame afterwards gives the same Series on old and new versions.
urls = pd.read_csv(
    "../urls/urls_UNSHORTENED.tab",
    delimiter="\t",
    index_col="url",
    usecols=["full_url", "url"],
).squeeze("columns").explode()
# Invert the Series: full_url -> url.
urls = {v: k for k, v in urls.items()}

#### URLs published from top communities

In [None]:
# Table of URLs and what they resolve to, indexed by the original URL.
d = pd.read_csv("../urls/urls_UNSHORTENED.tab", index_col="url", delimiter="\t")
# url -> full_url for rows flagged as short. Values that do not start with
# "http" are decoded as JSON.
dct = (
    d[d["is_short"] == True]["full_url"]
    .dropna()
    .apply(lambda x: json.loads(x) if isinstance(x, str) and not x.startswith("http") else x)
    .to_dict()
)

for y in [2013, 2014, 2015, 2016, 2017, 2018]:
    leiden[y] = gk.nodes(gk.graph(f"graphs/graph_{y}.gexf"))["leiden"]
    urls[y] = pd.read_csv(f"urls/urls_{y}.csv")
    urls[y]["leiden"] = urls[y]["from_user"].apply(lambda x: leiden[y][x])
    # Normalise the scheme and strip surrounding double quotes.
    urls[y]["url"] = urls[y]["urls"].apply(lambda x: x.replace("https:", "http:").strip('"'))
    # Use the unshortened target when there is a non-empty one, else keep the
    # URL as is. The former guard, `d.get(x, x) != ""`, queried the DataFrame's
    # *columns* and therefore never tested the resolved value.
    urls[y]["full_url"] = urls[y]["url"].apply(lambda x: dct.get(x) or x)

> ...after loading the data, the analysis may then be done by using the produced dictionary of unshortened links per community: `links`.

___

### Prepare text analysis with [Scattertext](https://github.com/JasonKessler/scattertext) (@jasonkessler)

> Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

> https://arxiv.org/abs/1703.00565

#### Load portuguese spaCy model

In [None]:
# Load the spaCy pipeline named "pt_core_news_sm"; the commented-out parsing
# cells below apply `nlp` to every text.
nlp = spacy.load('pt_core_news_sm')

##### Optionally use Stanza (2020) - requires some code adaptation

In [None]:
# import stanza
# #stanza.download("pt")
# nlp = stanza.Pipeline('pt')

#### Load data sets

Let's first grab the 2012 political convention data set used as an example in Scattertext and preview it, then replicate its structure with our tweets.

In [None]:
# Fetch Scattertext's bundled 2012 convention sample and show its first row;
# the commented-out cells below rename our columns to party / speaker / text.
# NOTE: this rebinds the notebook-level name `df`.
df = st.SampleCorpora.ConventionData2012.get_data()
df.head(1)

In [None]:
# dfs = {}
# for x in sorted(os.listdir(".")):
#     if x.startswith("scatter_2"):
#         year = int(x.split("_")[1].split(".")[0])
#         dfs[year] = pd.read_csv(f"{x}", dtype=str, index_col="id").astype(str)
#         dfs[year]["leiden"] = dfs[year]["leiden"].apply(lambda x: f"C{int(float(x))+1}" if x != "nan" else "Cn")
#         dfs[year] = dfs[year][["leiden", "source", "text"]] # "text"
#         dfs[year].columns = ["party", "speaker", "text"]
# df_source = pd.concat([_ for _ in dfs.values()]).drop_duplicates()

# dfs = {}
# for x in sorted(os.listdir(".")):
#     if x.startswith("scatter_2"):
#         year = int(x.split("_")[1].split(".")[0])
#         dfs[year] = pd.read_csv(f"{x}", dtype=str, index_col="id").astype(str)
#         dfs[year]["leiden"] = dfs[year]["leiden"].apply(lambda x: f"C{int(float(x))+1}" if x != "nan" else "Cn")
#         dfs[year] = dfs[year][["leiden", "target", "text"]] # "text"
#         dfs[year].columns = ["party", "speaker", "text"]
# df_target = pd.concat([_ for _ in dfs.values()]).drop_duplicates()

#### Preview stats from our dataset

In [None]:
# df = df_source
# print("Document Count")
# print(df.groupby('party')['text'].count())
# print("Word Count")
# print(df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum()))
# print("total:", df.text.apply(lambda x: len(x.split())).sum())
# print()

In [None]:
# df = df_target
# print("Document Count")
# print(df.groupby('party')['text'].count())
# print("Word Count")
# print(df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum()))
# print("total:", df.text.apply(lambda x: len(x.split())).sum())
# print()

#### Parse full corpus from data set

##### Using target node information (retweeted users)

In [None]:
# %time df_target["parsed"] = df_target.text.apply(nlp)

In [None]:
# %time corpus_t = st.CorpusFromParsedDocuments(df_target, category_col='party', parsed_col='parsed').build()

In [None]:
# with open("corpus/corpus_target.pickle", "wb") as p:
#     pickle.dump(corpus_t, p, protocol=-1)

##### Using source node information (retweeting users)

In [None]:
# %time df_source["parsed"] = df_source.text.apply(nlp)

In [None]:
# %time corpus_s = st.CorpusFromParsedDocuments(df_source, category_col='party', parsed_col='parsed').build()

In [None]:
# with open("corpus/corpus_source.pickle", "wb") as p:
#     pickle.dump(corpus_s, p, protocol=-1)

#### Parse data considering time intervals (optional)

##### Preview stats

In [None]:
# for year in range(2013,2019):
#     print(year)
#     df = dfs[year]
#     print("Document Count")
#     print(df.groupby('party')['text'].count())
#     print("Word Count")
#     print(df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum()))
#     print("total:", df.text.apply(lambda x: len(x.split())).sum())
#     print()

##### Turn into Scattertext corpora and parse with spaCy (NLP)

In [None]:
# corpora = {}
# for year, df in dfs.items():
#     df["parsed"] = df.text.apply(nlp)
#     corpora[year] = st.CorpusFromParsedDocuments(df, category_col='party', parsed_col='parsed').build()
#     with open(f"corpora/corpus_{year}.pickle", "wb") as p:
#         pickle.dump(corpus, p, protocol=-1)

#### Load full corpus from parsed documents

In [None]:
# %time with open("../corpus/corpus_target.pickle", "rb") as p:\
#     corpus = pickle.load(p)

In [None]:
# print(f"Loaded corpus with:\n\
# {corpus.get_num_categories()} categories\n\
# {corpus.get_num_docs()} documents\n\
# {corpus.get_num_terms()} terms\n")

##### Alternatively consider lemmatized words

> e.g. (`vem pra rua` => `ir pra rua`)

In [None]:
# with open("../corpus/corpus_target.pickle", "rb") as p:
#     %time corpus = st.CorpusFromParsedDocuments(\
#         pickle.load(p).get_df(),\
#         category_col='party',\
#         parsed_col='parsed',\
#         feats_from_spacy_doc=st.FeatsFromSpacyDoc(use_lemmas=True),\
#     ).build()

In [None]:
# print(f"Loaded corpus with:\n\
# {corpus.get_num_categories()} categories\n\
# {corpus.get_num_docs()} documents\n\
# {corpus.get_num_terms()} terms\n")

In [None]:
# print(f"Lemmatization resulted in {1370600 - 1183422} less n-grams.")

##### Alternatively consider bigrams only 

In [None]:
# corpus = corpus.remove_terms([x for x in corpus.get_terms() if " " not in x])

In [None]:
# print(f"Filtering for bigrams only resulted in {1183422 - 926533} removed unigrams.")

##### Select stopwords to remove from corpus and cleanup

In [None]:
# search_terms = ["protesto", "protestos", "vemprarua", "manifestação", "manifestações", "manifestante", "manifestantes", "manifestacao", "manifestacoes"]
# extra_stopwords = ["ai","av","ces","ir","ne","pra","pros","pq","rt","ta","tao","to","vai","vc","vcs","sent","zs","zc","ne","rs","ver"]
# nltk_stopwords = list(nltk.corpus.stopwords.words("portuguese"))
# allow_stopwords = ['ela','elas','ele','eles','não']
# consider_stopwords = set([x for x in nltk_stopwords+extra_stopwords])
# [consider_stopwords.remove(x) for x in allow_stopwords]
# remove_terms = [x for x in corpus.get_terms() if not is_clean(x, consider_stopwords)]

In [None]:
# before = corpus.get_num_terms()
# corpus = corpus.remove_terms(remove_terms)
# print(f"Removed {before-corpus.get_num_terms()} terms considering the {len(consider_stopwords)} stopwords selected.")

In [None]:
# min_num_docs = int(corpus.get_num_docs()/10000)+1
# print(f"Removing n-grams observed in >{min_num_docs} docs (0.01%).")
# corpus = corpus.remove_terms_used_in_less_than_num_docs(min_num_docs)
# print("Finished cleaning, resulted in %s n-grams (n = {1,2})." % corpus.get_num_terms())

In [None]:
# corpus.get_term_count_df().sort_values("corpus", ascending=False).to_csv("term_count.csv")

#### Save cleaned corpus as pickle

In [None]:
# with open("/media/dislocker/nvme0n1p5/corpus_cleaned.pickle", "wb") as p:
#     %time pickle.dump(corpus, p, protocol=-1)

___

### Generate scattertext plots from cleaned corpus

In [None]:
# Load the cleaned corpus from a machine-specific absolute path.
# NOTE: unpickling executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open("/media/dislocker/nvme0n1p5/corpus_cleaned.pickle", "rb") as p:
    corpus = pickle.load(p) # protocol=-1

In [None]:
# Write one Scattertext explorer page per community (C1-C7), each contrasting
# that community with the other six.
cat_range = list(range(1,8))
num_terms = 2000
# corpus = corpus.get_unigram_corpus()
# corpus = corpus.compact(st.AssociationCompactor(num_terms))

# The output directory must exist before the HTML files are written.
os.makedirs("scatter", exist_ok=True)

# Compact once: the result does not depend on the category, yet it used to be
# recomputed three times per iteration (corpus, metadata and scores).
compact_corpus = corpus.compact(st.AssociationCompactor(num_terms))
speakers = compact_corpus.get_df()['speaker']

for cat in cat_range:
    print(cat)
#     if cat != 1: continue
    html = st.produce_scattertext_explorer(
        compact_corpus,
        category=f'C{cat}',
        category_name=f'Comunidade {cat} (C{cat})',
        metadata=speakers,
        not_category_name='Outras comunidades (Cn: {n ∈ ℕ | 0 < n < 8; n≠%s})'%cat,
        not_categories=[f"C{x}" for x in cat_range if x != cat],
        minimum_term_frequency=0,
        pmi_threshold_coefficient=0,
        width_in_pixels=700,
        height_in_pixels=400,
        max_overlapping=1,
        scores=compact_corpus.get_scaled_f_scores(f'C{cat}', beta=0.5),
        transform=st.Scalers.dense_rank,
        sort_by_dist=False,
#         max_terms=200,
    #     transform=st.Scalers.scale,
#         transform=st.Scalers.percentile, # also try with "jitter"
    #     jitter=0.1,
#         transform=st.Scalers.log_scale_standardize,
    #     term_significance = st.LogOddsRatioUninformativeDirichletPrior(),
    #     x_coords=frequencies_scaled,
    #     y_coords=corpus.get_scaled_f_scores('democrat', beta=0.5),
    #     x_label='Log Frequency',
    #     y_label='Scaled F-Score')
        top_terms_length=21,
    )
    file_name = f'scatter/scatter_cat_{cat}.html'
    with open(file_name, 'wb') as f:
        f.write(html.encode('utf-8'))
    # IFrame(src=file_name, width = 1200, height=700)

___

In [None]:
# cmaps = ['Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r']