In [162]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from networkx.classes.function import path_weight
from tqdm import tqdm


In [163]:

def create_graph(transports):
    graph = {}

    # Creating the graph with the list of transports
    for i,transport in tqdm(transports.iterrows()):
        id_emp_orig = transport['node_src']
        id_emp_dest = transport['node_dest']

        volume = transport['vol']

        # Graph is a dict (source) -> (destination, weight)
        if graph.get(id_emp_orig) is None:
            # if source is not in the graph we need to map it as source to the destination
            graph[id_emp_orig] = {id_emp_dest: {'weight': volume }}
        else:
            # if source is already in the graph
            # 1. new destination from that source: create the edge
            # 2. source already mapped to destination: increase the volume in that edge
            if graph[id_emp_orig].get(id_emp_dest) is None:
                graph[id_emp_orig][id_emp_dest] = {'weight': volume }
            else:
                graph[id_emp_orig][id_emp_dest]['weight'] += volume

    # In this context we want to maximize paths over the volume
    # Since most functions minimizes over the weight of the edges,
    # we need to invert the relation
    for source, targets in graph.items():
        for target, volume in targets.items():
            graph[source][target]['weight'] = -1 * volume['weight']

    return nx.DiGraph(graph)


In [164]:
# transportes_junho = pd.read_csv('../data/df_06.csv')
transportes_julho = pd.read_csv('../data/df_07.csv')
transportes_agosto = pd.read_csv('../data/df_08.csv')
transportes_setembro = pd.read_csv('../data/df_09.csv')
transportes_outubro= pd.read_csv('../data/df_10.csv')
transportes_novembro = pd.read_csv('../data/df_11.csv')
transportes_dezembro = pd.read_csv('../data/df_12.csv')

# transporte_primeiro_semestre= pd.concat([transportes_julho, transportes_agosto, transportes_setembro, transportes_outubro, transportes_novembro, transportes_dezembro], ignore_index=True)


  transportes_julho = pd.read_csv('../data/df_07.csv')
  transportes_agosto = pd.read_csv('../data/df_08.csv')
  transportes_setembro = pd.read_csv('../data/df_09.csv')
  transportes_outubro= pd.read_csv('../data/df_10.csv')
  transportes_novembro = pd.read_csv('../data/df_11.csv')
  transportes_dezembro = pd.read_csv('../data/df_12.csv')


In [165]:

transporte_primeiro_semestre= pd.concat([transportes_julho, transportes_agosto, transportes_setembro, transportes_outubro, transportes_novembro, transportes_dezembro], ignore_index=True)
# transporte_primeiro_semestre = pd.read_csv('../data/df_01.csv')
rem_df = transporte_primeiro_semestre[['CPF_CNPJ_Rem', 'TpRem']].rename(columns={'CPF_CNPJ_Rem': 'CNPJ_CPF', 'TpRem': 'type'})
rem_df['source'] = 'Remetente'

# Criar DataFrame para CPF_CNPJ_Des e TpDes
des_df = transporte_primeiro_semestre[['CPF_CNPJ_Des', 'TpDes']].rename(columns={'CPF_CNPJ_Des': 'CNPJ_CPF', 'TpDes': 'type'})
des_df['source'] = 'Destinatário'

# Concatenar ambos os DataFrames
node_df= pd.concat([rem_df, des_df], ignore_index=True)
node_df= node_df.drop_duplicates('CNPJ_CPF')
node_df


Unnamed: 0,CNPJ_CPF,type,source
0,78952082000161,PTO_IBAMA,Remetente
1,9402361000139,PTO_IBAMA,Remetente
3,470994000100,PTO_IBAMA,Remetente
4,75218883000100,PTO_IBAMA,Remetente
5,5422729000170,PTO_IBAMA,Remetente
...,...,...,...
300472,28638514034,FINAL,Destinatário
300473,58371249004,FINAL,Destinatário
300474,50911406034,FINAL,Destinatário
300476,83227113015,FINAL,Destinatário


In [166]:

emp_type = {}

for i,node in node_df.iterrows():
  emp_type[node['CNPJ_CPF']] = node['type']


In [167]:
arestas = transporte_primeiro_semestre[['CPF_CNPJ_Rem', 'CPF_CNPJ_Des', 'Volume']].rename(columns={'CPF_CNPJ_Rem': 'node_src', 'CPF_CNPJ_Des': 'node_dest', 'Volume': 'vol'})

#  Removendo loops
arestas_1 = arestas[arestas['node_dest']!=arestas['node_src']]
arestas_1

Unnamed: 0,node_src,node_dest,vol
0,78952082000161,23342420987,0.4843
1,9402361000139,67972640904,0.2062
2,9402361000139,67972640904,0.4420
3,470994000100,369058950,0.8769
4,75218883000100,35649917,0.7840
...,...,...,...
150234,5148772000199,58371249004,0.3610
150235,91692392000152,50911406034,0.5330
150236,9153048000104,28726995034,0.3768
150237,92080878000100,83227113015,0.1050


In [168]:
graph = create_graph(arestas_1)

148517it [00:10, 14190.80it/s]


In [169]:
def get_concessions(list_nodes,emp_type): 
  # As concessões são todas emps marcadas como MANEJO,1
  # (fonte legal e extratores de madeira)

  count = 0
  for node in list_nodes:
    if emp_type[node] == 'MANEJO':
      count+=1

  return count

In [170]:

G_orig_weight = graph.copy()

In [171]:

def print_graph_metrics(graph, emp_type):
  # Print overall graph metrics

  g_aux = G_orig_weight.to_undirected()

  # Number of connected components in the graph
  components_len = []
  for item in nx.connected_components(g_aux):
    components_len.append((len(item),item))

  print(f'Total Graph: {graph}')
  print(f'Number of components: {len(components_len)}')
  print(f'Number of concessions: {get_concessions(graph.nodes(),emp_type)}')

  print()
  print()
  print('Components with more than 1000 nodes')
  for c in components_len:
    if c[0] > 1000:
      subg = graph.subgraph(c[1])

      print(f'Subgraph: {subg}')
      print(f'Number of concessions: {get_concessions(subg.nodes(),emp_type)}')
      print()
      print()


In [172]:
print_graph_metrics(G_orig_weight, emp_type)

Total Graph: DiGraph with 72094 nodes and 74552 edges
Number of components: 849
Number of concessions: 214


Components with more than 1000 nodes
Subgraph: DiGraph with 66172 nodes and 69472 edges
Number of concessions: 180




In [173]:
# G_orig_weight.nodes()

In [174]:
node_df['is_bridge_linkage'] =False
node_df

Unnamed: 0,CNPJ_CPF,type,source,is_bridge_linkage
0,78952082000161,PTO_IBAMA,Remetente,False
1,9402361000139,PTO_IBAMA,Remetente,False
3,470994000100,PTO_IBAMA,Remetente,False
4,75218883000100,PTO_IBAMA,Remetente,False
5,5422729000170,PTO_IBAMA,Remetente,False
...,...,...,...,...
300472,28638514034,FINAL,Destinatário,False
300473,58371249004,FINAL,Destinatário,False
300474,50911406034,FINAL,Destinatário,False
300476,83227113015,FINAL,Destinatário,False


In [175]:
node_df['degree']= 0
node_df

Unnamed: 0,CNPJ_CPF,type,source,is_bridge_linkage,degree
0,78952082000161,PTO_IBAMA,Remetente,False,0
1,9402361000139,PTO_IBAMA,Remetente,False,0
3,470994000100,PTO_IBAMA,Remetente,False,0
4,75218883000100,PTO_IBAMA,Remetente,False,0
5,5422729000170,PTO_IBAMA,Remetente,False,0
...,...,...,...,...,...
300472,28638514034,FINAL,Destinatário,False,0
300473,58371249004,FINAL,Destinatário,False,0
300474,50911406034,FINAL,Destinatário,False,0
300476,83227113015,FINAL,Destinatário,False,0


In [176]:
node_manejo = node_df[node_df['type']=="MANEJO"]
node_manejo

Unnamed: 0,CNPJ_CPF,type,source,is_bridge_linkage,degree
2587,36711381149,MANEJO,Remetente,False,0
2589,47445033815,MANEJO,Remetente,False,0
2590,254100163,MANEJO,Remetente,False,0
2598,63601769000185,MANEJO,Remetente,False,0
2599,65596838268,MANEJO,Remetente,False,0
...,...,...,...,...,...
147812,5494941973,MANEJO,Remetente,False,0
149962,61362573272,MANEJO,Remetente,False,0
149985,50979671272,MANEJO,Remetente,False,0
150009,9373192957,MANEJO,Remetente,False,0


# Walleria Way

In [177]:
df_tran = transporte_primeiro_semestre[['CPF_CNPJ_Rem', 'TpRem', 'CPF_CNPJ_Des', 'TpDes', 'Volume']]
df_tran = df_tran.groupby(['CPF_CNPJ_Rem', 'TpRem', 'CPF_CNPJ_Des', 'TpDes'])['Volume'].sum().reset_index()


In [178]:
df_tran

Unnamed: 0,CPF_CNPJ_Rem,TpRem,CPF_CNPJ_Des,TpDes,Volume
0,253169151,MANEJO,11602746000191,PTO_IBAMA,23.7704
1,254100163,MANEJO,4642220000170,PTO_IBAMA,27.5790
2,657233170,MANEJO,75558817000189,PTO_IBAMA,106.1130
3,964767708,MANEJO,16716007000144,PTO_IBAMA,70.0000
4,964767708,MANEJO,24090101000169,PTO_IBAMA,53.3850
...,...,...,...,...,...
74688,97520331000275,PTO_IBAMA,7933304249,FINAL,0.1680
74689,97520331000275,PTO_IBAMA,90658620215,FINAL,0.3150
74690,97529804000114,PTO_IBAMA,20982589000188,PTO_IBAMA,72.1580
74691,97538233000184,PTO_IBAMA,5219072927,FINAL,2.7500


In [179]:
df_tran['CPF_CNPJ_Rem'] = df_tran['CPF_CNPJ_Rem'].astype(str)
df_tran['CPF_CNPJ_Des'] = df_tran['CPF_CNPJ_Des'].astype(str)

In [180]:
nodes_new = set(df_tran['CPF_CNPJ_Rem']).union(set(df_tran['CPF_CNPJ_Des']))

In [181]:
nodes_new

{'28252931804',
 '9198382861',
 '36264884898',
 '5652692405',
 '6633179839',
 '1830994867',
 '5158634599',
 '34605528830',
 '73390429620',
 '548372861',
 '44161980868',
 '8204074000151',
 '3161208803',
 '10689089716',
 '38021919868',
 '46372121972',
 '64729931815',
 '80512763968',
 '4595204825',
 '27308795870',
 '83041648334',
 '4364534870',
 '31089337876',
 '7795987842',
 '36800614807',
 '1116918854',
 '85870480825',
 '41297504534',
 '66190562000153',
 '42125983885',
 '44156359034',
 '47046848904',
 '8455267828',
 '40227030915',
 '33408143846',
 '11682406890',
 '15518119879',
 '95233059800',
 '3991542137',
 '19420544087',
 '94691614834',
 '43789307734',
 '3181048810',
 '47869534900',
 '2624665110',
 '68826044872',
 '3718502909',
 '25900776883',
 '92018130900',
 '8866165816',
 '96041897353',
 '9986088402',
 '23397489091',
 '1079790802',
 '10769760856',
 '10791997642',
 '4447676575',
 '27869509884',
 '67407110891',
 '7493143897',
 '66578060510',
 '39906288876',
 '1202297811',
 '43171528

In [182]:
G = nx.DiGraph()
G.add_nodes_from(nodes_new)

In [183]:

# Cria as arestas com base nas transações e com peso = volume
edges = []

for row in df_tran.iterrows():
    # Ignora laços
    if str(row[1]['CPF_CNPJ_Rem']) != str(row[1]['CPF_CNPJ_Des']):
        edges.append((str(row[1]['CPF_CNPJ_Rem']), str(row[1]['CPF_CNPJ_Des']), {'Volume': row[1]['Volume']}))
G.add_edges_from(edges)

In [184]:
df_tran[df_tran['CPF_CNPJ_Des']=='356********']

Unnamed: 0,CPF_CNPJ_Rem,TpRem,CPF_CNPJ_Des,TpDes,Volume
44041,15721184000156,PTO_IBAMA,356********,PTO_IBAMA,2.789
44803,16904840000119,PTO_IBAMA,356********,PTO_IBAMA,3.048


In [185]:
df_pto = df_tran[(df_tran['TpRem'] == 'PTO_IBAMA') & (df_tran['TpDes'] == 'PTO_IBAMA')]
df_pto = df_pto.groupby('CPF_CNPJ_Rem')['Volume'].sum().reset_index()

In [186]:
df_pto

Unnamed: 0,CPF_CNPJ_Rem,Volume
0,1011441000151,20.0000
1,1011587000105,0.1130
2,10173601000150,8.5200
3,10194607000103,182.9440
4,10198258000106,29.5341
...,...,...
2411,9813168000190,0.5000
2412,981704000192,9.3027
2413,99007000102,17.9054
2414,9942474000127,7.2464


In [187]:
contagens = df_pto['CPF_CNPJ_Rem'].value_counts()
print(contagens[contagens > 1])  # 

Series([], Name: count, dtype: int64)


In [188]:
nodes_pto = set(df_pto['CPF_CNPJ_Rem'])
nodes_pto

{'22856324000169',
 '2442355000166',
 '4223482000108',
 '53314274000111',
 '83491191000169',
 '5672477000137',
 '8275731000151',
 '48508402000551',
 '5992508000137',
 '11923271000135',
 '7605761000116',
 '26051691000146',
 '22935116000155',
 '8204074000151',
 '62334867000130',
 '21918398000110',
 '8785064000157',
 '5661374000171',
 '8712155000162',
 '28110901000155',
 '84637073000189',
 '3502540000161',
 '3887313000100',
 '5090712000162',
 '965036000100',
 '17755846000134',
 '3597926000102',
 '12537411000108',
 '5461797000148',
 '24682889000100',
 '22034016000158',
 '4547560000111',
 '26126242000110',
 '6089024000145',
 '15625377000103',
 '4653785000152',
 '27623890000144',
 '23937838000101',
 '13188713000172',
 '22582628000185',
 '549861000123',
 '18208019000193',
 '1968569000108',
 '44633071000288',
 '56147184000171',
 '25204205000110',
 '4689550000110',
 '55903280000130',
 '14815822000135',
 '20825514000193',
 '21867928000229',
 '7623050000174',
 '21045233000180',
 '10598764000184',

In [189]:
emp_pto_degree = {}

for node in nodes_pto:
    emp_pto_degree[node]= G.degree(node)


emp_pto_degree

{'22856324000169': 30,
 '2442355000166': 24,
 '4223482000108': 493,
 '53314274000111': 184,
 '83491191000169': 11,
 '5672477000137': 5,
 '8275731000151': 3,
 '48508402000551': 153,
 '5992508000137': 6,
 '11923271000135': 23,
 '7605761000116': 1,
 '26051691000146': 7,
 '22935116000155': 23,
 '8204074000151': 2,
 '62334867000130': 7,
 '21918398000110': 10,
 '8785064000157': 5,
 '5661374000171': 60,
 '8712155000162': 36,
 '28110901000155': 44,
 '84637073000189': 2,
 '3502540000161': 2,
 '3887313000100': 3,
 '5090712000162': 11,
 '965036000100': 22,
 '17755846000134': 6,
 '3597926000102': 11,
 '12537411000108': 16,
 '5461797000148': 14,
 '24682889000100': 1,
 '22034016000158': 14,
 '4547560000111': 18,
 '26126242000110': 5,
 '6089024000145': 3,
 '15625377000103': 1,
 '4653785000152': 9,
 '27623890000144': 7,
 '23937838000101': 5,
 '13188713000172': 16,
 '22582628000185': 21,
 '549861000123': 67,
 '18208019000193': 161,
 '1968569000108': 3,
 '44633071000288': 629,
 '56147184000171': 127,
 '

In [190]:
graus = list(emp_pto_degree.values())
graus

[30,
 24,
 493,
 184,
 11,
 5,
 3,
 153,
 6,
 23,
 1,
 7,
 23,
 2,
 7,
 10,
 5,
 60,
 36,
 44,
 2,
 2,
 3,
 11,
 22,
 6,
 11,
 16,
 14,
 1,
 14,
 18,
 5,
 3,
 1,
 9,
 7,
 5,
 16,
 21,
 67,
 161,
 3,
 629,
 127,
 21,
 5,
 99,
 3,
 12,
 13,
 6,
 10,
 1,
 7,
 1,
 44,
 10,
 14,
 15,
 3,
 3,
 36,
 5,
 27,
 7,
 57,
 1,
 1,
 6,
 1,
 9,
 11,
 3,
 3,
 447,
 1,
 8,
 73,
 2,
 7,
 34,
 14,
 32,
 26,
 1,
 22,
 4,
 16,
 31,
 15,
 2,
 40,
 23,
 11,
 2,
 5,
 2,
 28,
 9,
 32,
 7,
 28,
 7,
 5,
 4,
 4,
 9,
 1,
 22,
 1,
 8,
 2,
 62,
 3,
 24,
 9,
 12,
 6,
 72,
 26,
 45,
 6,
 6,
 5,
 2,
 1,
 44,
 7,
 2,
 1,
 1,
 5,
 39,
 2,
 32,
 3,
 13,
 1,
 3,
 18,
 400,
 14,
 7,
 4,
 16,
 265,
 3,
 25,
 4,
 115,
 10,
 14,
 3,
 2,
 13,
 3,
 18,
 11,
 5,
 4,
 1,
 4,
 27,
 132,
 53,
 68,
 176,
 20,
 17,
 131,
 2,
 25,
 1,
 3,
 10,
 1,
 8,
 68,
 121,
 8,
 64,
 11,
 93,
 153,
 9,
 8,
 3,
 34,
 87,
 2,
 6,
 6,
 1,
 2,
 2,
 2,
 11,
 7,
 13,
 1,
 1,
 75,
 17,
 4,
 17,
 11,
 4,
 2,
 34,
 14,
 10,
 33,
 2,
 5,
 12,
 1,
 136,
 4,
 

In [191]:
print(f"Grau:    {graus}")
q1 = np.quantile(graus, 0.25)  # Primeiro quartil (25%)
q2 = np.quantile(graus, 0.5)   # Segundo quartil (50%, ou mediana)
q3 = np.quantile(graus, 0.75)  # Terceiro quartil (75%)
interquartil = q3-q1
limite_superior = q3 + 1.5*interquartil
print(f"Q1 (25%): {q1}")
print(f"Q2 (50% - Mediana): {q2}")
print(f"Q3 (75%): {q3}")
print(f"Limite superior: {limite_superior}")
print(f"Máximo: {max(graus)}")

Grau:    [30, 24, 493, 184, 11, 5, 3, 153, 6, 23, 1, 7, 23, 2, 7, 10, 5, 60, 36, 44, 2, 2, 3, 11, 22, 6, 11, 16, 14, 1, 14, 18, 5, 3, 1, 9, 7, 5, 16, 21, 67, 161, 3, 629, 127, 21, 5, 99, 3, 12, 13, 6, 10, 1, 7, 1, 44, 10, 14, 15, 3, 3, 36, 5, 27, 7, 57, 1, 1, 6, 1, 9, 11, 3, 3, 447, 1, 8, 73, 2, 7, 34, 14, 32, 26, 1, 22, 4, 16, 31, 15, 2, 40, 23, 11, 2, 5, 2, 28, 9, 32, 7, 28, 7, 5, 4, 4, 9, 1, 22, 1, 8, 2, 62, 3, 24, 9, 12, 6, 72, 26, 45, 6, 6, 5, 2, 1, 44, 7, 2, 1, 1, 5, 39, 2, 32, 3, 13, 1, 3, 18, 400, 14, 7, 4, 16, 265, 3, 25, 4, 115, 10, 14, 3, 2, 13, 3, 18, 11, 5, 4, 1, 4, 27, 132, 53, 68, 176, 20, 17, 131, 2, 25, 1, 3, 10, 1, 8, 68, 121, 8, 64, 11, 93, 153, 9, 8, 3, 34, 87, 2, 6, 6, 1, 2, 2, 2, 11, 7, 13, 1, 1, 75, 17, 4, 17, 11, 4, 2, 34, 14, 10, 33, 2, 5, 12, 1, 136, 4, 5, 8, 1, 15, 129, 30, 50, 3, 72, 4, 13, 1, 20, 10, 9, 4, 4, 4, 83, 2, 6, 938, 10, 53, 85, 6, 26, 313, 1, 5, 8, 24, 65, 19, 2, 51, 27, 10, 139, 6, 55, 5, 30, 3, 10, 40, 31, 21, 22, 9, 10, 422, 69, 11, 2, 5, 4, 4

In [192]:
manejos_outliers = 0 

for emp in graus:
  if emp>= limite_superior:
    manejos_outliers+=1

print(f"Numero de Empresas Patio outliers (importantes quanto grau): {manejos_outliers} ")



Numero de Empresas Patio outliers (importantes quanto grau): 318 


In [193]:
print(nx.number_weakly_connected_components(G))

818


In [194]:
df_pto['Grau'] = df_pto['CPF_CNPJ_Rem'].apply(lambda x: G.degree(x))
df_pto

Unnamed: 0,CPF_CNPJ_Rem,Volume,Grau
0,1011441000151,20.0000,3
1,1011587000105,0.1130,4
2,10173601000150,8.5200,22
3,10194607000103,182.9440,8
4,10198258000106,29.5341,202
...,...,...,...
2411,9813168000190,0.5000,1
2412,981704000192,9.3027,3
2413,99007000102,17.9054,9
2414,9942474000127,7.2464,247


In [195]:
df_pto_outliers = df_pto[df_pto['Grau']>= limite_superior]
df_pto_outliers

Unnamed: 0,CPF_CNPJ_Rem,Volume,Grau
4,10198258000106,29.5341,202
8,10256272000100,639.8343,53
11,10310351000152,14.6111,87
17,10373419000142,5.9944,111
25,10435661000101,9.5043,84
...,...,...,...
2382,95808259000170,69.1650,87
2384,95817664000154,123.5790,141
2386,9583332000110,133.6660,57
2409,976181000196,14.8830,118


In [196]:
df_pto_outliers.loc[:, 'is_bridge_linkage'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pto_outliers.loc[:, 'is_bridge_linkage'] = False


In [197]:
df_pto_outliers

Unnamed: 0,CPF_CNPJ_Rem,Volume,Grau,is_bridge_linkage
4,10198258000106,29.5341,202,False
8,10256272000100,639.8343,53,False
11,10310351000152,14.6111,87,False
17,10373419000142,5.9944,111,False
25,10435661000101,9.5043,84,False
...,...,...,...,...
2382,95808259000170,69.1650,87,False
2384,95817664000154,123.5790,141,False
2386,9583332000110,133.6660,57,False
2409,976181000196,14.8830,118,False


In [198]:
print("Número de components conexas:")
numero_original_de_componentes = nx.number_weakly_connected_components(G) 
print(nx.number_weakly_connected_components(G))

Número de components conexas:
818


In [199]:
components = []
for node in list(df_pto_outliers['CPF_CNPJ_Rem']):
    SG = nx.subgraph_view(G, filter_node= lambda x: x != node)
    components.append(nx.number_weakly_connected_components(SG))



In [200]:
components

[1013,
 849,
 900,
 924,
 900,
 899,
 871,
 947,
 894,
 1450,
 953,
 879,
 1013,
 867,
 1019,
 1054,
 867,
 893,
 979,
 1844,
 1230,
 1694,
 902,
 941,
 867,
 874,
 877,
 859,
 1527,
 885,
 941,
 914,
 883,
 948,
 985,
 1205,
 884,
 886,
 999,
 975,
 881,
 1191,
 1054,
 935,
 1260,
 872,
 929,
 935,
 889,
 854,
 923,
 887,
 874,
 863,
 927,
 892,
 1492,
 913,
 978,
 919,
 867,
 1090,
 928,
 905,
 875,
 1020,
 946,
 885,
 871,
 870,
 927,
 961,
 1008,
 884,
 824,
 877,
 988,
 970,
 897,
 910,
 883,
 1011,
 986,
 862,
 995,
 874,
 871,
 913,
 877,
 864,
 877,
 1108,
 862,
 889,
 1010,
 883,
 973,
 919,
 923,
 886,
 910,
 879,
 870,
 921,
 896,
 824,
 911,
 1086,
 963,
 821,
 976,
 901,
 931,
 880,
 936,
 887,
 872,
 914,
 1026,
 879,
 888,
 916,
 903,
 878,
 945,
 875,
 876,
 1110,
 899,
 929,
 875,
 955,
 980,
 949,
 878,
 908,
 913,
 910,
 996,
 903,
 880,
 873,
 873,
 895,
 1127,
 877,
 918,
 1305,
 876,
 895,
 875,
 868,
 988,
 1198,
 874,
 897,
 878,
 1438,
 975,
 898,
 874,
 907,
 

In [201]:
df_pto_outliers['Conected_Components']= components

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pto_outliers['Conected_Components']= components


In [202]:
df_pto_outliers.loc[df_pto_outliers['Conected_Components'] > numero_original_de_componentes, 'is_bridge_linkage'] = True
df_pto_outliers

Unnamed: 0,CPF_CNPJ_Rem,Volume,Grau,is_bridge_linkage,Conected_Components
4,10198258000106,29.5341,202,True,1013
8,10256272000100,639.8343,53,True,849
11,10310351000152,14.6111,87,True,900
17,10373419000142,5.9944,111,True,924
25,10435661000101,9.5043,84,True,900
...,...,...,...,...,...
2382,95808259000170,69.1650,87,True,898
2384,95817664000154,123.5790,141,True,954
2386,9583332000110,133.6660,57,True,839
2409,976181000196,14.8830,118,True,934


In [203]:
qtd_de_pontes_de_articulacao =  df_pto_outliers['is_bridge_linkage'].sum()
print(f"Das empresas importantes, quantas são ponts de  de articulação: {qtd_de_pontes_de_articulacao}")

Das empresas importantes, quantas são ponts de  de articulação: 318


## Encontrando fontes e sumidouros

In [204]:
emp_type= {}
df_tran

Unnamed: 0,CPF_CNPJ_Rem,TpRem,CPF_CNPJ_Des,TpDes,Volume
0,253169151,MANEJO,11602746000191,PTO_IBAMA,23.7704
1,254100163,MANEJO,4642220000170,PTO_IBAMA,27.5790
2,657233170,MANEJO,75558817000189,PTO_IBAMA,106.1130
3,964767708,MANEJO,16716007000144,PTO_IBAMA,70.0000
4,964767708,MANEJO,24090101000169,PTO_IBAMA,53.3850
...,...,...,...,...,...
74688,97520331000275,PTO_IBAMA,7933304249,FINAL,0.1680
74689,97520331000275,PTO_IBAMA,90658620215,FINAL,0.3150
74690,97529804000114,PTO_IBAMA,20982589000188,PTO_IBAMA,72.1580
74691,97538233000184,PTO_IBAMA,5219072927,FINAL,2.7500


In [205]:
origem = transporte_primeiro_semestre[["CPF_CNPJ_Rem", "LatOrigem", "LongOrigem", "TpRem"]].rename(
    columns={
    'CPF_CNPJ_Rem': 'CPF_CNPJ', 
    'LatOrigem': 'Latitude', 
    'LongOrigem': 'Longitude', 
    'TpRem': 'Tipo'
})
destino = transporte_primeiro_semestre[["CPF_CNPJ_Des", "LatDestino", "LongDestino", "TpDes"]].rename(
    columns={
    'CPF_CNPJ_Des': 'CPF_CNPJ', 
    'LatDestino': 'Latitude', 
    'LongDestino': 'Longitude', 
    'TpDes': 'Tipo'
})

nodes = pd.concat([origem, destino], ignore_index=True)
nodes.drop_duplicates("CPF_CNPJ", inplace=True)
nodes["Coordenada"] = nodes.apply(lambda row: (row['Latitude'], row['Longitude']), axis=1)
nodes.drop(["Latitude", "Longitude"], axis=1, inplace=True)
nodes.to_csv("../data/nodes.csv", index=False)

In [206]:
edges = transporte_primeiro_semestre[["CPF_CNPJ_Rem", "CPF_CNPJ_Des", "Produto", "Volume", "DtEmissao"]]
edges = edges[edges['CPF_CNPJ_Rem'] != edges['CPF_CNPJ_Des']]
# edges["DtEmissao"] = pd.to_datetime(edges["DtEmissao"], yearfirst=True)
edges.sort_values(['DtEmissao'], ascending=True, inplace=True)
edges.to_csv("../data/edges.csv", index=False)

In [207]:
nodes

Unnamed: 0,CPF_CNPJ,Tipo,Coordenada
0,78952082000161,PTO_IBAMA,"(-25.291667, -49.224167)"
1,9402361000139,PTO_IBAMA,"(-25.427778, -49.273056)"
3,470994000100,PTO_IBAMA,"(-25.615833, -49.344222)"
4,75218883000100,PTO_IBAMA,"(-23.279972, -51.069444)"
5,5422729000170,PTO_IBAMA,"(-24.98075, -53.472722)"
...,...,...,...
300472,28638514034,FINAL,"(nan, nan)"
300473,58371249004,FINAL,"(nan, nan)"
300474,50911406034,FINAL,"(nan, nan)"
300476,83227113015,FINAL,"(nan, nan)"


In [208]:
edges

Unnamed: 0,CPF_CNPJ_Rem,CPF_CNPJ_Des,Produto,Volume,DtEmissao
19218,5583309000175,18996153000104,Madeira serrada (prancha),7.6230,01/07/2017
1564,2411968000136,45036047915,Madeira serrada (vigota),0.1575,01/07/2017
16041,5669339000107,26441400804,Madeira serrada (tábua),0.0276,01/07/2017
5465,5669339000107,80205062849,Madeira serrada (tábua),0.0069,01/07/2017
16222,55844229000366,3750275807,Madeira serrada (caibro),0.0204,01/07/2017
...,...,...,...,...,...
147539,10315322268,3957205000158,Tora,4.6030,31/12/2017
148621,10315322268,3957205000158,Tora,2.1020,31/12/2017
147834,27176479291,14743232000144,Tora,35.2216,31/12/2017
148348,2495933190,34740696000104,Tora,4.0210,31/12/2017


In [209]:
def get_concessions(list_nodes,emp_type):
  # Concessions are all emps which are marked as MANEJO,
  # (legal source and extractors of timber)

  count = 0
  for node in list_nodes:
    if emp_type[node] == 'MANEJO':
      count+=1

  return count

In [210]:
emps = pd.read_csv('../data/nodes.csv')

emp_type = {}

for i,node in emps.iterrows():
  emp_type[node['CPF_CNPJ']] = node['Tipo']
  
emp_type

{'78952082000161': 'PTO_IBAMA',
 '9402361000139': 'PTO_IBAMA',
 '470994000100': 'PTO_IBAMA',
 '75218883000100': 'PTO_IBAMA',
 '5422729000170': 'PTO_IBAMA',
 '76803972000186': 'PTO_IBAMA',
 '7027349000166': 'PTO_IBAMA',
 '8408127000156': 'PTO_IBAMA',
 '3281325000188': 'PTO_IBAMA',
 '82191941000114': 'PTO_IBAMA',
 '1699020000165': 'PTO_IBAMA',
 '73304982000189': 'PTO_IBAMA',
 '4385294000259': 'PTO_IBAMA',
 '8849597000155': 'PTO_IBAMA',
 '82222290000182': 'PTO_IBAMA',
 '79109088000134': 'PTO_IBAMA',
 '24619519000110': 'PTO_IBAMA',
 '1473384000122': 'PTO_IBAMA',
 '85042737000120': 'PTO_IBAMA',
 '77540078000123': 'PTO_IBAMA',
 '20416094000191': 'PTO_IBAMA',
 '147816000142': 'PTO_IBAMA',
 '2164300000131': 'PTO_IBAMA',
 '24146864000184': 'PTO_IBAMA',
 '81120255000190': 'PTO_IBAMA',
 '3411505000137': 'PTO_IBAMA',
 '23672158000103': 'PTO_IBAMA',
 '4966423000111': 'PTO_IBAMA',
 '8113423000120': 'PTO_IBAMA',
 '82273939000194': 'PTO_IBAMA',
 '7716364000111': 'PTO_IBAMA',
 '78943925000163': 'PTO_IB

In [211]:
def get_concessions(list_nodes,emp_type):
  # Concessions are all emps which are marked as MANEJO,
  # (legal source and extractors of timber)

  count = 0
  for node in list_nodes:
    if emp_type[node] == 'MANEJO':
      count+=1

  return count

In [212]:
def get_sink_nodes(graph, emp_type):
  # If node is marked as FINAL, he is a sink
  # It is the final destination of the timber chain

  nodes = {}
  for node in graph.nodes():
    if emp_type[node] == 'FINAL':
      nodes[node] = 1
      continue

    # we consider sink nodes as final nodes or nodes that only transports to other final nodes
    not_sink = False
    for edge in graph.edges(node):
      if emp_type[edge[1]] != 'FINAL':
        not_sink=True

    if not not_sink:
      nodes[node] = 1

  return nodes

In [213]:
def get_timberflow(graph,emp_type):
  sink_nodes = get_sink_nodes(graph, emp_type)

  total_in = 0
  total_out = 0

  # For all edges in the graph
  # 1. sum the volume of input (from MANEJO types)
  # 2. sum the volume of output (from FINAL types)
  for edge in graph.edges():
    if emp_type[edge[0]] == 'MANEJO':
      total_in += path_weight(graph, [edge[0],edge[1]], weight='Volume')
    elif edge[1] in sink_nodes:
      total_out += path_weight(graph, [edge[0],edge[1]], weight='Volume')

  print(f'Inflow Vol(m3): {total_in} \nOut Vol(m3): {total_out} \nProportion: {total_out/total_in}')

In [214]:
get_timberflow(G, emp_type)

Inflow Vol(m3): 48886.93629999996 
Out Vol(m3): 123529.36519999472 
Proportion: 2.526837935638745


In [215]:
# TODO
# Para  cada componente conexa, cria uma rede de fluxo se tive manejo e final
# Se não tiver um dos dois, não faça 

