In [1]:
import numpy as np
import pandas as pd
from netpixi.integration.gt import *
from regression.integration.gt import *
import math
from graph_tool import centrality, clustering
import regression as reg

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)
  mpl_cm.register_cmap(_name, _cmap)
  mpl_cm.register_cmap(_name + "_r", _cmap_r)
  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


In [2]:
# Folder holding the raw inputs, and the two CSV files inside it.
dir_ = 'votacao/'
data_file_name = 'camara_municipal_sp_sessoes_12_20.csv'
name_file_name = 'nomes.csv'

# Relative paths (folder + file name) for the voting records and the name table.
data_file_dir = f"{dir_}{data_file_name}"
name_file_dir = f"{dir_}{name_file_name}"

In [3]:
# Load both inputs in one pass: `df` gets the voting records, `nm` the name table.
df, nm = (pd.read_csv(csv_path) for csv_path in (data_file_dir, name_file_dir))

In [4]:
# Folder where the generated network files are written and read back.
dir_save = 'networks/'

# Base name of the dataset, used as the prefix of every generated file.
file_save_name = 'vereadores'

# Extension of the saved network files.
file_save_extension = '.net.gz'

# Folder for the tabular (CSV) data derived from the networks.
data_dir = 'data/'

In [5]:
class Model:
    """Pipeline that turns roll-call votes into a co-voting network and a
    per-councillor table of network and descriptive variables.

    The steps are meant to run in the order used by `do_all`; each one reads
    state left on the instance by the previous ones.

    Gotcha: `cut_point`, `bet`, `clus`, `rep`, `suc`, `gen` and `var` store
    their result on the instance under their own name, which shadows the
    method. Each of them can therefore be called only once per instance, and
    afterwards e.g. `model.var` is the resulting DataFrame. This is kept
    because `network_create` and `var` read those attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Voting records. The methods read the columns 'vereador',
        'id_parlamentar', 'voto', 'id_votacao', 'resultado', 'data' and
        'partido'.
    nm : pandas.DataFrame
        Name table with columns 'first_name' and 'classification'.
    y1, y2 : int
        First and last year (both inclusive) of the time window.
    """

    def __init__(self, df, nm, y1, y2):
        self.df = df
        self.nm = nm
        self.y1 = y1
        self.y2 = y2
        # Window tag such as "2013_2016", embedded in every output file name.
        self.frame = str(self.y1) + "_" + str(self.y2)
        # Output locations are built from the notebook-level configuration
        # (dir_save, file_save_name, file_save_extension, data_dir).
        self.path_save = dir_save + file_save_name + "_" + self.frame + file_save_extension
        self.betweenness_file = dir_save + file_save_name + "_betweenness" + "_" + self.frame + file_save_extension
        self.clustering_file = dir_save + file_save_name + "_clustering" + "_" + self.frame + file_save_extension
        self.data_networks_path = data_dir + file_save_name + "_" + self.frame + ".csv"

    def drop_nulls(self):
        """Keep only rows where every column the pipeline depends on is present."""
        required = ['vereador', 'id_parlamentar', 'voto', 'id_votacao', 'resultado']
        # Explicit copy: later steps assign into self.df, and the frame passed
        # to the constructor may be shared with other Model instances.
        self.df = self.df.dropna(subset=required).copy()

    def weight_votes(self):
        """Recode 'voto' to -1 (no), 0 (abstention), 1 (yes); drop other rows.

        Some rows carry a candidate name instead of Sim/Não; the names listed
        below are mapped to the yes/no codes exactly as in the original
        mapping. Rows whose vote is not recognised (anything that is not a
        digit string after recoding) are discarded.

        Returns the filtered DataFrame (also stored in `self.df`).
        """
        codes = {
            'Sim': '2',
            'Marco A Cunha (PSD)': '2',
            'José Américo (PT)': '2',
            'Não': '0',
            'Souza Santos (PSD)': '0',
            'Antônio Vespoli (PSOL)': '0',
            'Abstenção': '1',
        }
        # Build the recoded column on the side so the incoming frame is never
        # modified in place.
        recoded = self.df['voto'].map(lambda v: codes.get(v, v))
        keep = [x.isdigit() for x in recoded]
        # Shift the 0/1/2 codes to -1/0/1.
        self.df = self.df[keep].assign(voto=recoded[keep].astype('int64') - 1)

        return self.df

    def weight_results(self):
        """Drop undecided sessions and recode 'resultado' to 1 / -1.

        Approved/elected outcomes become 1, 'Reprovado'/'Rejeitado' become -1.
        Any other label is left untouched.
        NOTE(review): upper-case variants of the rejected labels are not
        mapped; such rows can never count as a success. Confirm against the
        data whether they occur.

        Returns the filtered DataFrame (also stored in `self.df`).
        """
        not_results = [
            "Pendente de Votação",
            "Falta de Quórum",
            "PREJUDICADO POR FALTA DE QUÓRU",
            "PENDENTE DE VOTAÇÃO",
            "Prejudicado- Falta de Quórum",
            "Prejudicado - falta de quorum",
            "PREJUDICADO POR FALTA QUÓRUM",
            "Retirado",
        ]
        approved = [
            'Aprovado',
            'Eleito',
            'APROVADO',
            'Eleito José Américo (PT)',
            'Eleito Marco A Cunha (PSD)',
        ]
        rejected = ['Reprovado', 'Rejeitado']

        decided = self.df[~self.df['resultado'].isin(not_results)].copy()
        decided.loc[decided['resultado'].isin(approved), 'resultado'] = 1
        decided.loc[decided['resultado'].isin(rejected), 'resultado'] = -1
        self.df = decided

        return self.df

    def framing(self):
        """Add an 'ano' column and keep only rows with y1 <= ano <= y2.

        The year is taken from the last four characters of the 'data' string.
        Returns the filtered DataFrame (also stored in `self.df`).
        """
        years = self.df['data'].map(lambda d: int(d[-4:]))
        framed = self.df.assign(ano=years)
        in_window = (framed['ano'] >= self.y1) & (framed['ano'] <= self.y2)
        self.df = framed[in_window].copy()
        return self.df

    def convergent_ids(self):
        """Shift every 'id_votacao' by a constant offset of 100000.

        NOTE(review): presumably this keeps session ids from colliding with
        councillor ids; the code itself does not state the reason.
        """
        cte = int(1e5)
        self.df = self.df.assign(id_votacao=self.df['id_votacao'] + cte)

    def parlament_votes(self):
        """Split the votes per councillor and flag votes that matched the outcome.

        Sets `self.vereadores` (sorted unique ids), `self.n_vereadores` and
        `self.vereadores_votos`, a list aligned with `self.vereadores` whose
        items are DataFrames with a 0/1 'success' column.
        """
        self.vereadores = np.unique(self.df['id_parlamentar'])
        self.n_vereadores = len(self.vereadores)
        columns = ['id_parlamentar', 'id_votacao', 'voto', 'resultado']
        self.vereadores_votos = []
        for vereador_id in self.vereadores:
            # Copy so that adding 'success' does not write into a slice of self.df.
            votes = self.df.loc[self.df['id_parlamentar'] == vereador_id, columns].copy()
            votes['success'] = votes.apply(lambda row: 1 if row['voto'] == row['resultado'] else 0, axis=1)
            self.vereadores_votos.append(votes)

    def cut_point(self):
        """Correlate the votes of every pair of councillors and return the median.

        Fills the upper triangle of `self.vereadores_corr` with the correlation
        of the two councillors' votes over the sessions both took part in
        (undefined correlations count as 0). The median over all pairs is
        stored in `self.cut_point` (shadowing this method) and returned.
        """
        n = self.n_vereadores
        self.vereadores_corr = np.zeros(shape=(n, n))
        list_vereadores_corr = []

        for i in range(n):
            v1 = self.vereadores_votos[i]
            # j runs up to and including the last councillor. The previous
            # bound (n - 1) silently skipped every pair involving the last one.
            for j in range(i + 1, n):
                v2 = self.vereadores_votos[j]
                shared = v1.merge(v2, how='inner', on='id_votacao')[['voto_x', 'voto_y']]
                corr = shared.corr(numeric_only=False)['voto_x']['voto_y']
                # NaN appears when the pair shares too few sessions or one of
                # them always voted the same way.
                if math.isnan(corr):
                    corr = 0
                self.vereadores_corr[i][j] = corr
                list_vereadores_corr.append(corr)

        self.cut_point = np.median(list_vereadores_corr)
        return self.cut_point

    def network_create(self):
        """Build and save the undirected network of above-median correlations.

        One vertex per councillor id; an edge joins two councillors whose vote
        correlation is strictly greater than `self.cut_point`. The graph is
        written to `self.path_save`. Requires `cut_point()` to have run.
        """
        self.g = Graph(directed=False)
        n = self.n_vereadores

        for i in range(n):
            self.g.add_vertex(int(self.vereadores[i]))

        for i in range(n):
            # Same pair enumeration as cut_point, including the last councillor.
            for j in range(i + 1, n):
                if self.vereadores_corr[i][j] > self.cut_point:
                    self.g.add_edge(
                        int(self.vereadores[i]),
                        int(self.vereadores[j])
                    )

        gt_save(self.g, self.path_save)

    def bet(self):
        """Compute vertex betweenness on the saved network.

        Reloads the graph from `self.path_save`, attaches a 'betweenness'
        vertex property, saves it to `self.betweenness_file` and stores the
        `gt_data` table in `self.bet` (shadowing this method). Returns it.
        """
        g_betweenness = gt_load(self.path_save)
        # centrality.betweenness returns (vertex map, edge map); only the
        # vertex values are kept.
        bc, _ = centrality.betweenness(g_betweenness)
        g_betweenness.add_vp('betweenness', bc)
        gt_save(g_betweenness, self.betweenness_file)
        self.bet = gt_data(g_betweenness)
        return self.bet

    def clus(self):
        """Compute local clustering on the saved network.

        Reloads the graph from `self.path_save`, attaches a 'clustering'
        vertex property, saves it to `self.clustering_file` and stores the
        `gt_data` table in `self.clus` (shadowing this method). Returns it.
        """
        g_cluster = gt_load(self.path_save)
        lc = clustering.local_clustering(g_cluster)
        g_cluster.add_vp('clustering', lc)
        gt_save(g_cluster, self.clustering_file)
        self.clus = gt_data(g_cluster)
        return self.clus

    def rep(self):
        """Attach to each councillor the share of councillors in their party.

        The party of a councillor is the one on their first row in `self.df`.
        Stores a DataFrame with columns 'id' and 'parlamentares' in `self.rep`
        (shadowing this method) and returns it.
        """
        df_parlamentar = self.df.drop_duplicates(subset="id_parlamentar")
        party_sizes = df_parlamentar.groupby("partido")["id_parlamentar"].count()
        df_tam = (
            (party_sizes / party_sizes.sum())
            .rename("parlamentares")
            .reset_index()
        )

        self.rep = (
            self.df.merge(df_tam, on="partido", how="inner")[["id_parlamentar", "parlamentares"]]
            .drop_duplicates("id_parlamentar")
            .rename(columns={"id_parlamentar": "id"})
        )

        return self.rep

    def suc(self):
        """Compute, per councillor, the fraction of votes that matched the outcome.

        Stores a DataFrame with columns 'id' and 'success' in `self.suc`
        (shadowing this method) and returns it. Requires `parlament_votes()`.
        """
        success_list = [
            (votes['id_parlamentar'].iloc[0],
             votes['success'].sum() / votes['success'].count())
            for votes in self.vereadores_votos
        ]
        self.suc = pd.DataFrame(success_list, columns=['id', 'success'])
        return self.suc

    def gen(self):
        """Infer a gender flag (0 = 'F', 1 = anything else) from the first name.

        The first token of 'vereador', with a few accented capitals stripped,
        is looked up in `self.nm`; names not found default to 1. Stores a
        DataFrame with columns 'id' and 'gender' in `self.gen` (shadowing this
        method) and returns it.
        """
        unaccent = str.maketrans('ÁÉÍÃÂ', 'AEIAA')
        first_names = self.df['vereador'].map(lambda name: name.split(' ')[0].translate(unaccent))
        # One row per first name, keeping the first classification, so the
        # left merge cannot multiply voting rows when nm repeats a name.
        lookup = self.nm[['first_name', 'classification']].drop_duplicates(subset='first_name')
        merged = (
            self.df.assign(first_name=first_names)
            .merge(lookup, on='first_name', how='left')
            .rename(columns={"classification": "gender"})
        )
        # Manual override for a name the lookup table does not resolve.
        merged.loc[merged['first_name'] == "SONINHA", 'gender'] = 'F'
        # Missing or non-'F' classifications all map to 1.
        merged['gender'] = (merged['gender'] != 'F').astype('int64')
        self.df = merged
        self.gen = (self.df.drop_duplicates(subset="id_parlamentar")
                    [['id_parlamentar', 'gender']]
                    .rename(columns={"id_parlamentar": "id"}))
        return self.gen

    def var(self):
        """Join all per-councillor tables on 'id' and write them to CSV.

        Inner-joins the results of bet, clus, rep, suc and gen, writes the
        table to `self.data_networks_path`, stores it in `self.var` (shadowing
        this method) and returns it. The index is written too, so the file
        gains a leading unnamed column when read back.
        """
        self.var = (
            self.bet
            .merge(self.clus, on='id', how='inner')
            .merge(self.rep, on="id", how="inner")
            .merge(self.suc, on="id", how="inner")
            .merge(self.gen, on="id", how="inner")
        )
        self.var.to_csv(self.data_networks_path)
        return self.var

    def do_all(self):
        """Run the whole pipeline once, in dependency order."""
        self.drop_nulls()
        self.weight_votes()
        self.weight_results()
        self.framing()
        self.convergent_ids()
        self.parlament_votes()
        self.cut_point()
        self.network_create()
        self.bet()
        self.clus()
        self.rep()
        self.suc()
        self.gen()
        self.var()

In [6]:
## Uncomment only if Model changes: this re-runs the full pipeline and rewrites the cached CSVs under data_dir
# x = Model(df, nm, 2013, 2016)
# x.do_all()
# y = Model(df, nm, 2017, 2020)
# y.do_all()

In [7]:
# Read the per-councillor tables for the two time windows from data_dir.
rec_13_16 = pd.read_csv(f"{data_dir}{file_save_name}_{2013}_{2016}.csv")
rec_17_20 = pd.read_csv(f"{data_dir}{file_save_name}_{2017}_{2020}.csv")

In [8]:
# Show the 2013-2016 table without the index column that the CSV round-trip added.
rec_13_16.drop(columns=['Unnamed: 0'])

Unnamed: 0,id,betweenness,clustering,parlamentares,success,gender
0,132,0.002700,0.901596,0.027027,0.902174,1
1,155,0.005679,0.850679,0.175676,0.896406,1
2,176,0.000516,0.896667,0.094595,0.873239,1
3,179,0.002017,0.926263,0.027027,0.914062,1
4,187,0.004122,0.911111,0.148649,0.497908,1
...,...,...,...,...,...,...
69,2108,0.017473,0.855180,0.094595,0.921569,1
70,2127,0.004801,0.873227,0.081081,0.906367,1
71,2129,0.003053,0.896277,0.027027,0.906977,1
72,2131,0.003702,0.886179,0.027027,0.815789,1


In [9]:
# Show the 2017-2020 table without the index column that the CSV round-trip added.
rec_17_20.drop(columns=['Unnamed: 0'])

Unnamed: 0,id,betweenness,clustering,parlamentares,success,gender
0,155,0.000073,0.981818,0.147059,0.588235,1
1,160,0.002306,0.858974,0.147059,0.555556,1
2,187,0.003156,0.904348,0.191176,0.940845,1
3,220,0.005550,0.875116,0.058824,0.924188,1
4,224,0.002885,0.914141,0.191176,0.915942,1
...,...,...,...,...,...,...
63,2174,0.052702,0.560440,0.029412,0.717949,1
64,2180,0.000000,1.000000,0.147059,0.333333,1
65,2185,0.051025,0.772727,0.058824,0.532787,1
66,2186,0.003584,0.915854,0.191176,0.982143,1


In [10]:
# Linear models for the 2013-2016 window.
# Betweenness explained by party share, by success rate, and by both.
reg_bet__rep_2013_2016 = reg.linear(formula='betweenness ~ parlamentares', data=rec_13_16)
reg_bet__suc_2013_2016 = reg.linear(formula='betweenness ~ success', data=rec_13_16)
reg_bet__rep_suc_2013_2016 = reg.linear(formula='betweenness ~ parlamentares + success', data=rec_13_16)
# Clustering explained by party share and gender (the variable name mentions only "rep").
reg_clu__rep_2013_2016 = reg.linear(formula='clustering ~ parlamentares + gender', data=rec_13_16)

# One summary table per model, in the order the models were fitted.
display(
    reg_bet__rep_2013_2016.micro_summary(),
    reg_bet__suc_2013_2016.micro_summary(),
    reg_bet__rep_suc_2013_2016.micro_summary(),
    reg_clu__rep_2013_2016.micro_summary(),
)

variable,coefficient,std error,p-value
parlamentares,0.073364,0.082388,0.37618


variable,coefficient,std error,p-value
success,-0.041288,0.027518,0.137883


variable,coefficient,std error,p-value
parlamentares,0.044865,0.084837,0.598569
success,-0.037398,0.028619,0.195511


variable,coefficient,std error,p-value
parlamentares,0.703954,0.421104,0.098988
gender,-0.059151,0.076098,0.439562


In [11]:
# Linear models for the 2017-2020 window.
# Betweenness explained by party share, by success rate, and by both.
reg_bet__rep_2017_2020 = reg.linear(formula='betweenness ~ parlamentares', data=rec_17_20)
reg_bet__suc_2017_2020 = reg.linear(formula='betweenness ~ success', data=rec_17_20)
reg_bet__rep_suc_2017_2020 = reg.linear(formula='betweenness ~ parlamentares + success', data=rec_17_20)
# Clustering explained by party share and gender (the variable name mentions only "rep").
reg_clu__rep_2017_2020 = reg.linear(formula='clustering ~ parlamentares + gender', data=rec_17_20)

# One summary table per model, in the order the models were fitted.
display(
    reg_bet__rep_2017_2020.micro_summary(),
    reg_bet__suc_2017_2020.micro_summary(),
    reg_bet__rep_suc_2017_2020.micro_summary(),
    reg_clu__rep_2017_2020.micro_summary(),
)

variable,coefficient,std error,p-value
parlamentares,-0.019842,0.048902,0.686237


variable,coefficient,std error,p-value
success,-0.01938,0.016262,0.237628


variable,coefficient,std error,p-value
parlamentares,-0.024642,0.048878,0.615865
success,-0.020045,0.016408,0.22624


variable,coefficient,std error,p-value
parlamentares,0.580538,0.336259,0.089015
gender,-0.046385,0.055608,0.407257
