# Centralities Computation

In [1]:
import os 
import json
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
from collections import defaultdict, OrderedDict
from shapely.geometry import LineString, Point
from geopy.distance import geodesic
from tqdm import tqdm
tqdm.pandas()

## Cоздание матриц корреспонденции между коммунами

In [2]:
os.chdir('D:/bachelors/datasets/')

In [3]:
datasets_list = os.listdir('translated_data_v2/')

In [4]:
consumer = ['Покупки', 'Услуги', 'Туризм и рекреация', 'Другие виды досуга']
commuter = ['Работа', 'Рабочая встреча']

In [5]:
with open('Variables/mappers/communes_names_map.json') as infile:
    names_map = json.load(infile)
    
all_names = np.unique(list(names_map.values()))

In [6]:
matricies = defaultdict(dict)

In [19]:
def get_corrMatrix(df_current):
    """Get correspondence matrix from datatable"""
    df_current['route'] = df_current.apply(lambda row: tuple(sorted([row.geo_departure, row.geo_arrival])), axis = 1)
    matrix = df_current.groupby('route')['uid_person'].nunique()
    matrix.index = pd.MultiIndex.from_tuples(matrix.index)
    matrix.index.set_names(['local_1', 'local_2'], inplace = True)
    matrix.rename('flow', inplace = True)
#     matrix.reset_index(inplace = True)
    return matrix.reset_index()

In [20]:
for dataset in datasets_list:
    year = dataset.split('_')[1].split('.')[0]
    df_current = pd.read_csv(f'translated_data_v2/{dataset}')
    df_current['geo_departure'], df_current['geo_arrival'] = df_current['geo_departure'].map(names_map), df_current['geo_arrival'].map(names_map)
    df_current.dropna(subset = ['geo_departure', 'geo_arrival'], inplace = True)
    df_current = df_current[df_current.geo_departure != df_current.geo_arrival]
    
    matricies[year] = {
        'total': get_corrMatrix(df_current),
        'consumers' : get_corrMatrix(df_current[df_current.motiv_motivation.isin(consumer)]), 
        'commuters' : get_corrMatrix(df_current[df_current.motiv_motivation.isin(commuter)]),
        'others' : get_corrMatrix(df_current[~df_current.motiv_motivation.isin(consumer + commuter)])
    }

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
from itertools import combinations as combo

In [52]:
def count_trips(current_data):
    edges = defaultdict(dict)
    indexes = current_data.index.to_flat_index()
    for pair in tqdm(combo(all_names, 2)):
        pair_r = pair[::-1]
        if (pair in indexes) or (pair_r in indexes):
            flow = current_data.loc[pair] + current_data.loc[pair[::-1]]
            edges[(pair[0],pair[1])] = flow
    edge_list = pd.DataFrame(edges, index = ['flow']).transpose().dropna()
    edge_list.index.set_names(['local_1', 'local_2'], inplace = True)
    edge_list.index.set_names(['local_1', 'local_2'], inplace = True)
    edge_list.reset_index(inplace = True)
    return edge_list

In [48]:
def count_trips(current_data):
    edges = defaultdict(dict)
    indexes = current_data.index.to_flat_index()
    for pair in tqdm(combo(all_names, 2)):
        try:
            f1 = current_data.loc[pair]
        except:
            f1 = 0
        try:
            f2 = current_data.loc[pair[::-1]]
        except:
            f2 = 0
        flow = f1 + f2
        edges[(pair[0],pair[1])] = flow
    edge_list = pd.DataFrame(edges, index = ['flow']).transpose().dropna()
    edge_list.index.set_names(['local_1', 'local_2'], inplace = True)
    edge_list.index.set_names(['local_1', 'local_2'], inplace = True)
    edge_list.reset_index(inplace = True)
    return edge_list

In [21]:
with open('Variables/mappers/coordinates_map_final.json') as infile:
    coordinates_map = json.load(infile)

In [22]:
def compute_metrics(edge_list):   
    edge_list = edge_list.assign(
        lat_start = edge_list.local_1.map(coordinates_map).str['lat'],
        lon_start = edge_list.local_1.map(coordinates_map).str['lon'],
        lat_finish = edge_list.local_2.map(coordinates_map).str['lat'],
        lon_finish = edge_list.local_2.map(coordinates_map).str['lon'],
        distance = lambda frame: frame.apply(
            lambda row: geodesic((row.lat_start, row.lon_start), (row.lat_finish, row.lon_finish)).kilometers,
            axis = 1
        ),
        flow_weighted = lambda frame: frame.flow / frame.distance
    ).assign(
        flow = lambda frame: frame.flow / frame.flow.max(),
        flow_weighted = lambda frame: frame.flow_weighted / frame.flow_weighted.max()
    )
    return edge_list

In [23]:
def get_geoLines(edge_list):
    return gpd.GeoDataFrame(
        edge_list,
        geometry = [
            LineString([(lon_start, lat_start),
                       (lon_finish, lat_finish)]) for lat_start, lon_start, lat_finish, lon_finish in zip(edge_list.lat_start,
                                                                                       edge_list.lon_start,
                                                                                       edge_list.lat_finish,
                                                                                       edge_list.lon_finish)],
        crs = {'init' : 'epsg:4326'}
    )

def get_geoPoints(node_list):
    return gpd.GeoDataFrame(
        node_list,
        geometry = [Point(xy) for xy in zip(node_list.lon, node_list.lat)],
        crs = {'init' : 'epsg:4326'}
    )

In [24]:
def centrality_edges(graph):
    edge_centrality = pd.DataFrame({
        'flow_cent': nx.centrality.edge_current_flow_betweenness_centrality(
            graph, weight='flow', normalized = False
        ),
        'flow_cent_weighted' : nx.centrality.edge_current_flow_betweenness_centrality(
            graph, weight='flow_weighted', normalized = False
        )
    })
    edge_centrality.index.set_names(['local_1', 'local_2'], inplace = True)
    edge_centrality.reset_index(inplace = True)
    return edge_centrality

def centrality_nodes(graph):
    node_centrality = pd.DataFrame({
        'flow_cent': nx.centrality.current_flow_betweenness_centrality(
            graph, weight='flow', normalized = False
        ),
        'flow_cent_weighted' : nx.centrality.current_flow_betweenness_centrality(
            graph, weight='flow_weighted', normalized = False
        )
    })
    node_centrality.index.set_names(['commune'], inplace = True)
    node_centrality.reset_index(inplace = True)
    return node_centrality

In [25]:
def perform_fuctions(year, tp):
    total_system = matricies[year][tp]
    edge_list = compute_metrics(total_system)
    
    graph = nx.from_pandas_edgelist(edge_list, 'local_1', 'local_2', ['flow', 'flow_weighted'])
    edge_centrality = centrality_edges(graph)
    node_centrality = centrality_nodes(graph).assign(
        lat = lambda frame: frame.commune.map(coordinates_map).str['lat'],
        lon = lambda frame: frame.commune.map(coordinates_map).str['lon']
    )
    edge_list = edge_list.merge(edge_centrality, on = ['local_1', 'local_2'])
    
#     os.makedirs(f'analysis/yearly_graphs/{tp}/')
    get_geoLines(edge_list).to_file(
        f'analysis/yearly_graphs/{tp}/edges_{year}.gpkg', driver = 'GPKG'
    )
    get_geoPoints(node_centrality).to_file(
        f'analysis/yearly_graphs/{tp}/nodes_{year}.gpkg', driver = 'GPKG'
    )

In [None]:
for year in matricies:
    for tp in ['total', 'commuters', 'consumers', 'others']:
        perform_fuctions(year, tp)