In [None]:
%load_ext autoreload
%autoreload 2

import hydra
import os
import datetime
from pathlib import Path

# Initialize hydra and move to the root of the repository.
# GlobalHydra is queried explicitly (rather than wrapping everything in a bare
# `except:`) so that re-running this cell is still a no-op, while genuine
# failures such as a missing or malformed config are raised instead of being
# reported as "already initialized".
from hydra.core.global_hydra import GlobalHydra

if GlobalHydra.instance().is_initialized():
    print('Hydra already initalized!')
else:
    try:
        hydra.initialize(version_base=None, config_path="../config/")
        CONFIG = hydra.compose(config_name="main.yaml")
    except Exception:
        # Roll back the partial initialization so that re-running the cell
        # after fixing the problem performs the full setup below instead of
        # taking the "already initialized" branch.
        GlobalHydra.instance().clear()
        raise
    print('Initializing hydra')

    # Create an output folder in the root of the repository.
    # The folder name is the string form of the current timestamp.
    os.chdir('..')
    OUTPUT_FOLDER = Path('output/{0}'.format(datetime.datetime.now()))
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import networkx as nx
import numpy as np
from src.utils.styling import hide_and_move_axis
from itertools import combinations

In [None]:
# Colour list of matplotlib's active property cycle; indexed by community
# number in the figures below.
_prop_cycle = plt.rcParams['axes.prop_cycle']
colors = _prop_cycle.by_key()['color']

In [None]:
# Raw node and edge tables (paths are relative to the repository root).
nodes = pd.read_csv('data/01_raw/city_nodes_annual_all.csv')
edges = pd.read_csv('data/01_raw/city_edges_annual_all.csv')

# Map every PlaceID to its (XCOORD, YCOORD) tuple for the network drawings.
coordinates = (
    nodes[['PlaceID', 'XCOORD', 'YCOORD']]
    .drop_duplicates()
    .set_index('PlaceID')
    .to_dict('index')
)
pos = {place: (xy['XCOORD'], xy['YCOORD']) for place, xy in coordinates.items()}

# Distinct years (ascending) and distinct place ids (order of first appearance).
years = np.sort(edges.Year.unique())
places = nodes.PlaceID.unique()

In [None]:
def weighted_network():
    """Build an undirected graph weighted by how often each edge is recorded.

    Reads the module-level ``edges`` table, counts the rows per
    ``(from, to)`` pair and divides the counts by the largest count, so the
    most frequent edge has weight 1.

    NOTE(review): ``(a, b)`` and ``(b, a)`` are counted as separate groups;
    in an undirected ``nx.Graph`` the one added later replaces the weight of
    the earlier one. Confirm the edge list stores each pair in one orientation.

    Returns:
        networkx.Graph: graph with a ``weight`` attribute on every edge.
    """
    counts = (
        edges.groupby(['from', 'to'])
        .Year.count()
        .reset_index()
        .rename(columns={'Year': 'weight'})
    )
    counts['weight'] = counts['weight'] / counts['weight'].max()

    # Node ids are cast to plain ints so the graph's nodes are Python ints.
    weighted_edges = [
        (int(source), int(target), weight)
        for source, target, weight in zip(counts['from'], counts['to'], counts['weight'])
    ]

    graph = nx.Graph()
    graph.add_weighted_edges_from(weighted_edges)
    return graph

def singular_network(year, edges):
    """Return the edge list of a single year.

    Args:
        year: value matched against the ``Year`` column.
        edges: DataFrame with at least ``Year``, ``from`` and ``to`` columns.

    Returns:
        list[list]: one ``[from, to]`` pair per row recorded in ``year``.
    """
    in_year = edges.Year == year
    return edges.loc[in_year, ['from', 'to']].values.tolist()

def compute_coincidences(node_table=None, place_ids=None):
    """Count, for every pair of places, the years in which both show a +1 step in ``Juden``.

    For each place the ``Juden`` series is sorted by year and differenced;
    only rows whose predecessor is exactly one year earlier are kept. A
    "coincidence" is a year, present for both places, in which the
    year-over-year change of ``Juden`` equals 1 for both.

    The per-place preparation is done once up front instead of once per pair.

    Args:
        node_table: DataFrame with ``PlaceID``, ``Year`` and ``Juden``
            columns. Defaults to the module-level ``nodes``.
        place_ids: iterable of place ids to pair up. Defaults to the
            module-level ``places``.

    Returns:
        list[list]: one ``[i, j, count, norm, rate, years]`` entry per pair,
        where ``count`` is the number of coincidences, ``norm`` the larger of
        the two places' own +1 step counts within the shared years, ``rate``
        is ``count / norm`` (0 when there is no coincidence) and ``years`` is
        the array of coinciding years.
    """
    if node_table is None:
        node_table = nodes
    if place_ids is None:
        place_ids = places

    def yearly_steps(place):
        # Year-over-year differences of one place, restricted to consecutive years.
        history = node_table[node_table.PlaceID == place][['Year', 'Juden']].sort_values(by='Year')
        history = history.assign(dYear=history.Year.diff(), dJews=history.Juden.diff())
        return history[history.dYear == 1]

    # Loop-invariant work: prepare every place exactly once. A positional list
    # (not a dict keyed by id) is used so unusual ids such as NaN cannot
    # break the lookup.
    steps = [yearly_steps(place) for place in place_ids]

    results = []

    for (a, i), (b, j) in combinations(enumerate(place_ids), 2):
        scalar = pd.merge(steps[a], steps[b], on='Year')
        rose_i = scalar.dJews_x == 1
        rose_j = scalar.dJews_y == 1

        norm = np.max([rose_i.sum(), rose_j.sum()])
        coincidence = rose_i & rose_j
        count = coincidence.sum()
        shared_years = scalar[coincidence].Year.values

        if count != 0:
            # Progress output, overwritten in place by the carriage return.
            print(i, j, count, norm, shared_years, end='\r')
            rate = count / norm
        else:
            rate = 0
        results.append([i, j, count, norm, rate, shared_years])

    return results


def _pairwise_year_share(combine, node_table=None, place_ids=None):
    """Shared machinery of the two similarity measures.

    For every pair of places the two ``Year``/``Juden`` series are merged on
    ``Year``; ``combine(Juden_x, Juden_y)`` is summed over the shared years
    and divided by the number of shared years.

    Args:
        combine: callable taking the two merged ``Juden`` columns and
            returning one value per shared year.
        node_table: DataFrame with ``PlaceID``, ``Year`` and ``Juden``
            columns. Defaults to the module-level ``nodes``.
        place_ids: iterable of place ids. Defaults to the module-level ``places``.

    Returns:
        list[list]: one ``[i, j, share]`` entry per pair; ``share`` is NaN
        when the two places have no year in common.
    """
    if node_table is None:
        node_table = nodes
    if place_ids is None:
        place_ids = places
    place_ids = np.asarray(place_ids)

    # Loop-invariant work hoisted out of the pair loop: the progress bound and
    # each place's own rows are computed once instead of once per pair.
    last_place = place_ids.max() if len(place_ids) else None
    per_place = [
        node_table[node_table.PlaceID == place][['Year', 'Juden']]
        for place in place_ids
    ]

    results = []

    for (a, i), (b, j) in combinations(enumerate(place_ids), 2):

        # Progress output, overwritten in place by the carriage return.
        print(i, last_place, end='\r')

        shared = pd.merge(per_place[a], per_place[b], on='Year')
        per_year = combine(shared.Juden_x, shared.Juden_y)
        # No shared year means 0/0: report NaN explicitly rather than relying
        # on a numpy division warning.
        share = per_year.sum() / len(per_year) if len(per_year) else float('nan')
        results.append([i, j, share])

    return results


def compute_similarity_absence_presence(node_table=None, place_ids=None):
    """Share of shared years in which both places have the same ``Juden`` value.

    Args:
        node_table: DataFrame with ``PlaceID``, ``Year`` and ``Juden``
            columns. Defaults to the module-level ``nodes``.
        place_ids: iterable of place ids. Defaults to the module-level ``places``.

    Returns:
        list[list]: one ``[i, j, share]`` entry per pair of places.
    """
    return _pairwise_year_share(lambda x, y: x == y, node_table, place_ids)


def compute_similarity_presence(node_table=None, place_ids=None):
    """Sum of ``Juden_x * Juden_y`` over the shared years, divided by their number.

    Presumably ``Juden`` is a 0/1 indicator, in which case this is the share
    of shared years with a value of 1 in both places - verify against the data.

    Args:
        node_table: DataFrame with ``PlaceID``, ``Year`` and ``Juden``
            columns. Defaults to the module-level ``nodes``.
        place_ids: iterable of place ids. Defaults to the module-level ``places``.

    Returns:
        list[list]: one ``[i, j, share]`` entry per pair of places.
    """
    return _pairwise_year_share(lambda x, y: x * y, node_table, place_ids)

In [None]:
# Number of edges recorded in each year.
n_edges = [len(singular_network(year, edges)) for year in years]

# Two series on a shared time axis: the yearly sum of `Juden` (left axis) and
# the yearly number of edges (right axis, black). Both axes are labelled so
# the figure can be read on its own.
f, ax0 = plt.subplots()
nodes.groupby('Year').Juden.sum().plot(ax=ax0)
ax0.set_xlabel('Year')
ax0.set_ylabel('Sum of Juden over all places')
ax1 = ax0.twinx()
ax1.plot(years, n_edges, c='k')
ax1.set_ylabel('Number of edges')

In [None]:
# Number of communities shown in the figures below (they use a 2 x 5 grid).
N_COMMUNITIES = 10
# Louvain is a randomized algorithm; fixing the seed makes the communities
# (and every figure derived from them) reproducible across kernel restarts.
LOUVAIN_SEED = 0

G = weighted_network()
comms = nx.community.louvain_communities(G, seed=LOUVAIN_SEED)

# Keep the N_COMMUNITIES largest communities in the order Louvain returned them.
# Ranking indices (instead of thresholding on the 10th-largest size) never
# selects more than N_COMMUNITIES when sizes are tied, and does not fail when
# fewer than N_COMMUNITIES communities exist.
largest = sorted(range(len(comms)), key=lambda k: len(comms[k]), reverse=True)[:N_COMMUNITIES]
comms = [comms[k] for k in sorted(largest)]
assert len(comms) <= N_COMMUNITIES

In [None]:
# Weight of every edge in the aggregated network.
weights = [weight for _, _, weight in G.edges(data='weight')]

f, ax = plt.subplots()
# Bin edges run from 0 to 1 inclusive. np.arange(0, 1, 0.05) stops at 0.95 and
# would silently drop every weight above that, including the strongest edge,
# whose normalised weight is exactly 1.
ax.hist(weights, bins=np.linspace(0, 1, 21), width=0.04)

ax.set_xlabel('Edge weight')
ax.set_ylabel('Count')

hide_and_move_axis(ax)
plt.savefig(OUTPUT_FOLDER / 'edge_weight_histrogram.jpg')

In [None]:
# Base layer: every node as a small black dot.
nx.draw_networkx_nodes(G, pos, node_size=12, node_color='k')

# Overlay: members of each selected community, slightly larger and drawn in
# that community's colour.
for idx, members in enumerate(comms):
    nx.draw_networkx_nodes(G, pos, nodelist=members, node_color=colors[idx], node_size=15)

# Edges are drawn almost transparent so dense regions remain readable.
nx.draw_networkx_edges(G, pos, alpha=0.025)
plt.savefig(OUTPUT_FOLDER / 'communities.jpg')

In [None]:
def _load_or_compute(cache_file, compute, columns):
    """Return the pair table cached at ``cache_file``, computing it if absent.

    Only a missing cache file triggers the computation. Any other problem
    (unreadable file, missing feather backend, interrupt) is raised instead of
    silently starting a long recomputation.

    Args:
        cache_file: path of the feather file.
        compute: zero-argument callable returning the rows of the table.
        columns: column names for the rows; must include ``id1`` and ``id2``,
            which become the index.

    Returns:
        pandas.DataFrame: the table as read from disk or as freshly computed.
    """
    cache_file = Path(cache_file)
    if cache_file.exists():
        frame = pd.read_feather(cache_file)
        print('Read data from disk!')
        return frame

    frame = pd.DataFrame(compute(), columns=columns).set_index(['id1', 'id2'])
    # Make sure the cache directory exists so the result of the expensive
    # computation is not lost to a missing folder.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    frame.to_feather(cache_file)
    return frame


coincidences_df = _load_or_compute(
    'computations/coincidences.feather',
    compute_coincidences,
    ['id1', 'id2', 'count', 'norm', 'rate', 'years'],
)

similarity_absence_presence_df = _load_or_compute(
    'computations/similarity_absence_presence.feather',
    compute_similarity_absence_presence,
    ['id1', 'id2', 'rate'],
)

similarity_presence_df = _load_or_compute(
    'computations/similarity_presence.feather',
    compute_similarity_presence,
    ['id1', 'id2', 'rate'],
)

In [None]:
def get_coefficients(nodes, similarity, key='count'):
    """Select the pairwise values of ``similarity`` for pairs inside a node set.

    The row mask is built from ``similarity``'s own index (not from another
    table), and a pair is matched regardless of the order in which its two ids
    are stored in the index.

    Args:
        nodes: iterable of node ids (e.g. a community or a random sample).
        similarity: DataFrame indexed by a two-level ``(id1, id2)`` MultiIndex.
        key: name of the column to return.

    Returns:
        pandas.Series: ``similarity[key]`` for every row whose two ids are
        different and both contained in ``nodes``; empty when ``nodes`` has
        fewer than two members.
    """
    members = list(nodes)
    first = similarity.index.get_level_values(0)
    second = similarity.index.get_level_values(1)
    # Both endpoints inside the node set; self-pairs are excluded because a
    # pair is a combination of two different nodes.
    within = first.isin(members) & second.isin(members) & (first != second)
    return similarity.loc[within, key]

def cumulative(results):
    """Return the coordinates of an empirical cumulative distribution curve.

    Args:
        results: sequence of values.

    Returns:
        tuple: ``(x, y)`` where ``x`` holds the values in ascending order and
        ``y[k] = k / N`` for ``N = len(results)`` (so ``y`` starts at 0 and
        ends at ``(N - 1) / N``).
    """
    ordered = np.sort(results)
    count = len(results)
    fractions = np.arange(count) / float(count)
    return ordered, fractions

In [None]:
# Number of random node sets drawn per community as the baseline.
N_RANDOM_DRAWS = 200
# Seeded generator so the random baseline is identical on every run.
rng = np.random.default_rng(0)

f, axarr = plt.subplots(2, 5, figsize=(10, 5), sharex=True, sharey=True)
flatax = axarr.flatten()

# Which similarity measure to plot: 'absencepresence' or 'presence'.
which = 'presence'

if which == 'absencepresence':
    SIMILARITY = similarity_absence_presence_df
    KEY = 'rate'
    axarr[1, 2].set_xlabel('Share of shared years with Jews present OR absent')
elif which == 'presence':
    SIMILARITY = similarity_presence_df
    KEY = 'rate'
    axarr[1, 2].set_xlabel('Share of shared years with Jews present')
else:
    # An explicit error instead of `assert False`: it carries a message and is
    # not removed when Python runs with assertions disabled.
    raise ValueError(f"Unknown similarity measure {which!r}; expected 'absencepresence' or 'presence'")

# One panel per community; iterating over the communities themselves (rather
# than a fixed range) also works when fewer communities than panels exist.
for i, (community, panel) in enumerate(zip(comms, flatax)):
    print(i, end='\r')

    # Baseline: distributions for random node sets of the same size (thin black lines).
    for _ in range(N_RANDOM_DRAWS):
        sample = rng.choice(places, len(community), replace=False)
        results = get_coefficients(sample, SIMILARITY, KEY)
        x, y = cumulative(results)
        panel.plot(x, y, alpha=0.1, c='k')

    # Observed distribution inside the community (thick coloured line).
    results = get_coefficients(community, SIMILARITY, KEY)
    x, y = cumulative(results)

    panel.plot(x, y, lw=3, color=colors[i])

for ax in axarr[:, 0]:
    ax.set_ylabel('Cumulative distribution function')

plt.tight_layout()
plt.savefig(OUTPUT_FOLDER / f'cumulative_distributions_{which}.jpg')

In [None]:
SIMILARITY = coincidences_df
KEY = 'count'

# Number of random node sets drawn per community as the baseline.
N_RANDOM_DRAWS = 500
# Seeded generator so the baseline and the printed tail fractions are
# identical on every run.
rng = np.random.default_rng(0)

f, axarr = plt.subplots(2, 5, figsize=(8, 4), sharex=True, sharey=True)
flatax = axarr.flatten()

# One panel per community; iterating over the communities themselves (rather
# than a fixed range) also works when fewer communities than panels exist.
for i, (community, panel) in enumerate(zip(comms, flatax)):

    # Baseline: share of pairs with at least one coincidence in random node
    # sets of the same size as the community.
    frequencies = []
    for _ in range(N_RANDOM_DRAWS):
        sample = rng.choice(places, len(community), replace=False)
        results = get_coefficients(sample, SIMILARITY, KEY)
        frequencies.append((results > 0).mean())
    frequencies = np.array(frequencies)
    panel.hist(frequencies, color='k', bins=np.arange(0, 0.15, 0.005), alpha=0.25)

    # Observed share inside the community.
    results = get_coefficients(community, SIMILARITY, KEY)
    results = (results > 0).mean()
    panel.axvline(results, color=colors[i], lw=3)

    # Fraction of random draws with a strictly larger share than observed.
    tail_fraction = (frequencies > results).mean()
    print(i, tail_fraction)

    # Mark panels where fewer than 5% of the random draws exceed the observed share.
    if tail_fraction < 0.05:
        panel.text(0.13, 2, "*", size=20)

for ax in axarr[:, 0]:
    ax.set_ylabel('Count')

for ax in axarr[1]:
    # Label spelling corrected ("co-explusion" -> "co-expulsion"); it is
    # rendered into the saved figure.
    ax.set_xlabel('Share of\nco-expulsion events')

plt.tight_layout()
plt.savefig(OUTPUT_FOLDER / 'coexpulsion_frequencies.jpg')