In [1]:
import pandas as pd
import os
import geopandas as gpd
from scipy.stats import linregress as lr

In [2]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
import seaborn as sns

In [3]:
# Roboto typefaces for figure text: plot_reg uses the medium weight for axis
# labels and the regular weight for tick labels.
medium_font = fm.FontProperties(
    fname=r"D:/bachelors/datasets/viz/Roboto-Medium.ttf",
    size=12,
)
# NOTE: the misspelled name `regluar_font` is referenced by plot_reg, so it is kept.
regluar_font = fm.FontProperties(
    fname=r"D:\bachelors\datasets\viz\Roboto-Regular.ttf",
    size=12,
)

# Named hex colours shared by the plots.
colors = dict(
    peach='#FCB886',
    blue='#6FC7F7',
    green='#78AF34',
    red='#D25B5B',
    grey='#8B8B8B',
)

In [4]:
# Every relative path used below (analysis/..., territory_bases_v3/..., *.xlsx)
# is resolved against this working directory.
os.chdir('D:/bachelors/datasets/')

## Import nodes

In [12]:
# Preview a single yearly node table to inspect its columns.
# The year is written out instead of reading a `year` variable, which no
# earlier cell defines on a fresh kernel. 2017 is used because the rows shown
# in this cell's output match the 2017 entries of the combined `nodes` series.
pd.read_csv(
    'analysis/yearly_graphs_v4/total/nodes_total_2017.csv'
)

Unnamed: 0,commune,flow_cent,flow_cent_weighted,lat,lon
0,'s-Hertogenbosch,4044.998939,6425.120416,51.716578,5.354612
1,Aalburg,228.430508,451.462882,51.745721,5.068460
2,Almelo,1239.923576,2002.925994,52.347764,6.666955
3,Almere,2495.102412,3865.450762,52.366282,5.249946
4,Alphen aan den Rijn,1540.276304,2225.913278,52.112423,4.641881
...,...,...,...,...,...
373,Valkenburg aan de Geul,516.785481,577.022303,50.862246,5.831572
374,Gulpen-Wittem,634.298116,624.548588,50.803984,5.906796
375,Ten Boer,258.001323,342.204732,53.277802,6.699298
376,Zuidhorn,364.151314,606.290369,53.271983,6.380709


In [13]:
def _read_nodes_for(year):
    """Load the node table of one year and tag every row with 1 January of that year."""
    frame = pd.read_csv(
        f'analysis/yearly_graphs_v4/total/nodes_total_{year}.csv'
    )
    return frame.assign(year = pd.Timestamp(f'{year}-01-01'))


# Stack the yearly node tables (2004-2017) and keep only `flow_cent`,
# indexed by (year, commune).
_yearly_nodes = [_read_nodes_for(y) for y in range(2004, 2018)]
nodes = pd.concat(_yearly_nodes).set_index(['year', 'commune'])['flow_cent']

In [14]:
# Display the combined `flow_cent` series indexed by (year, commune).
nodes

year        commune               
2004-01-01  's-Hertogenbosch          3613.827049
            Aalburg                    427.784161
            Alkmaar                   1658.776524
            Almere                    2943.174698
            Amersfoort                2809.391779
                                         ...     
2017-01-01  Valkenburg aan de Geul     516.785481
            Gulpen-Wittem              634.298116
            Ten Boer                   258.001323
            Zuidhorn                   364.151314
            Schiermonnikoog             22.599883
Name: flow_cent, Length: 5310, dtype: float64

## Import edges

In [6]:
def stats(communes_list, graph):
    """Mean flow of the edges touching each commune, scaled by the largest mean.

    Parameters
    ----------
    communes_list : iterable
        Commune names; one output row is produced per name, in iteration order.
    graph : pandas.DataFrame
        Edge table with the columns ``local_1``, ``local_2`` and ``flow``.

    Returns
    -------
    pandas.DataFrame
        Indexed by commune name, with a single ``flow`` column holding the mean
        ``flow`` over all edges that have the commune at either endpoint,
        divided by the maximum of those means.
    """
    def mean_incident_flow(name):
        # An edge belongs to the commune when it appears at either endpoint.
        touches = (graph.local_1 == name) | (graph.local_2 == name)
        return graph[touches].agg({'flow' : 'mean'}).rename(name)

    typology = pd.DataFrame([mean_incident_flow(name) for name in communes_list])
    scaled_flow = typology.flow / typology.flow.max()
    return typology.assign(flow = scaled_flow)

In [21]:
# Build one (year, commune) -> normalised mean edge flow series per year.
dflist = []
for year in range(2004,2018):
    graph = pd.read_csv(f'analysis/yearly_graphs_v4/total/edges_total_{year}.csv')

    # Every commune that occurs at either end of an edge.
    communes_list = set(graph.local_1) | set(graph.local_2)

    typology = stats(communes_list, graph)
    typology.index.name = 'commune'
    typology = (
        typology
        .assign(year = pd.Timestamp(f'{year}-01-01'))
        .reset_index()
        .set_index(['year', 'commune'])['flow']
    )
    dflist.append(typology)

In [23]:
# Stack the yearly series into one `flow` series indexed by (year, commune).
edges = pd.concat(dflist)

## Import data

In [24]:
# Human-readable (Russian) dataset titles mapped to the CSV files stored in
# territory_bases_v3/datatables/.
# The pairs are spelled out instead of zipping the titles with os.listdir():
# the listing order is arbitrary, so a positional pairing can silently attach
# a title to the wrong file (and zip() would hide extra or missing files).
dataset_paths = {
    'Структура предприятий по специализации': 'business_shares.csv',
    'Число предприятий по специализации': 'business_totals.csv',
    'Коэффициент обеспеченности услугами': 'coeff_func_def.csv',
    'Расстояние до основных объектов инфраструктуры': 'distances_to_objects.csv',
    'Число рабочих мест и структура занятости': 'employment.csv',
    'Доходы населения и домохозяйств': 'incomes.csv',
    'Другие показатели': 'other_variables.csv',
    'Индекс коммерческой активности': 'zent_idx.csv',
}

In [25]:
# Check which file each dataset title was paired with.
dataset_paths

{'Структура предприятий по специализации': 'business_shares.csv',
 'Число предприятий по специализации': 'business_totals.csv',
 'Коэффициент обеспеченности услугами': 'coeff_func_def.csv',
 'Расстояние до основных объектов инфраструктуры': 'distances_to_objects.csv',
 'Число рабочих мест и структура занятости': 'employment.csv',
 'Доходы населения и домохозяйств': 'incomes.csv',
 'Другие показатели': 'other_variables.csv',
 'Индекс коммерческой активности': 'zent_idx.csv'}

In [108]:
# Collect the header of every data table: one list of column names per file.
# nrows=0 parses only the header row instead of the whole file, and sorted()
# makes the file order deterministic (os.listdir() returns arbitrary order).
columns = [
    list(pd.read_csv(f'territory_bases_v3/datatables/{file_name}', nrows = 0).columns)
    for file_name in sorted(os.listdir('territory_bases_v3/datatables/'))
]

In [111]:
from functools import reduce

# Flatten the per-file header lists into one list of column names.
cols = reduce(lambda collected, names: collected + names, columns)

# One-column table of the original names, written to ColNames.xlsx.
colNames = pd.DataFrame({'name_eng': cols})
colNames.to_excel('ColNames.xlsx', index = None)

In [26]:
# Column-name lookup table. read_data() uses its `dataset`, `name_eng` and
# `name_rus` columns. Note that this is ColNames_edited.xlsx, not the
# ColNames.xlsx written above — presumably a hand-edited copy; confirm.
colNames_dict = pd.read_excel('ColNames_edited.xlsx')

In [27]:
def read_data(name):
    """Load one data table by its title and index it by (year, commune).

    Parameters
    ----------
    name : str
        Key of ``dataset_paths`` selecting the CSV file; also used to filter
        ``colNames_dict`` on its ``dataset`` column.

    Returns
    -------
    pandas.DataFrame
        The table with ``Perioden`` parsed as datetime and renamed to ``year``,
        ``commune_ok`` renamed to ``commune``, both moved into the index, and
        the remaining columns renamed from ``name_eng`` to ``name_rus``.
    """
    table = pd.read_csv(f"territory_bases_v3/datatables/{dataset_paths[name]}")
    table['Perioden'] = pd.to_datetime(table['Perioden'])

    # Translation map restricted to the columns registered for this dataset.
    rows_for_dataset = colNames_dict[colNames_dict.dataset == name]
    translations = rows_for_dataset.set_index('name_eng')['name_rus'].to_dict()

    return (
        table
        .rename(columns = {'commune_ok' : 'commune', 'Perioden' : 'year'})
        .set_index(['year', 'commune'])
        .rename(columns = translations)
    )

In [286]:
def plot_reg(df, linreg, year = None, out_path = None):
    """Draw a scatter plot with a regression line for one indicator and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Two columns: column 0 is plotted on the x axis (its name becomes the
        x label), column 1 on the y axis. Both axes are limited to [0, 1].
    linreg : object
        Regression result exposing ``slope`` and ``intercept``; used only for
        the equation printed in the top-left corner.
    year : optional
        Label printed in the top-right corner. When omitted, the function falls
        back to the global ``timestamp`` (``timestamp.year``), as it did before
        this parameter existed.
    out_path : str, optional
        Target image file. When omitted, a file name inside ``analysis/`` is
        derived from the x-column name and the year.

    Returns
    -------
    None
    """
    if year is None:
        # Backward compatibility: previously the year was always read from the
        # notebook-level `timestamp` set by the driver loop.
        year = timestamp.year

    x_values = df.iloc[:, 0]
    y_values = df.iloc[:, 1]

    plt.figure(figsize = (7,7), facecolor = 'w')

    # x and y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    sns.regplot(x = x_values, y = y_values,
                scatter_kws = {'s' : 3, 'color' : colors['peach']},
                line_kws = {'color' : colors['blue']})

    plt.grid()
    plt.xlim(-.01, 1.01)
    plt.ylim(-.01, 1.01)
    plt.yticks(fontproperties = regluar_font)
    plt.ylabel('Индекс центральности', fontproperties = medium_font)

    plt.xticks(fontproperties = regluar_font)
    plt.xlabel(x_values.name, fontproperties = medium_font)

    slope = round(linreg.slope, 2)
    intersept = round(linreg.intercept, 2)
    # A negative intercept already carries its own minus sign.
    if intersept >= 0:
        plt.text(.01, .95, f'${slope}x + {intersept}$'.replace(',','.'), fontsize = 15)
    else:
        plt.text(.01, .95, f'${slope}x {intersept}$'.replace(',','.'), fontsize = 15)
    plt.text(.99, .95, year, fontproperties = medium_font, fontsize = 18, ha = 'right')

    if out_path is None:
        # The previous target was the bare directory 'analysis/', which is not
        # a writable file. Build a file name from the indicator and the year,
        # replacing characters that are unsafe in file names.
        safe_name = ''.join(ch if ch.isalnum() else '_' for ch in str(x_values.name))
        out_path = f'analysis/regplot_{safe_name}_{year}.png'
    plt.savefig(out_path)

In [28]:
def correalte(column, nodes_one_year, year):
    """Fit a linear regression of the flow measure on one max-normalised indicator.

    Parameters
    ----------
    column : pandas.Series
        Indicator values; divided by their maximum before fitting.
    nodes_one_year : pandas.Series
        Dependent variable, aligned with ``column`` on the index.
    year : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    scipy.stats linregress result
        Fit of ``nodes_one_year`` (y) on the normalised ``column`` (x) over the
        rows where both values are present.
    """
    df = pd.concat([column / column.max(), nodes_one_year], axis = 1).dropna()
    # Pass x and y explicitly. The single-argument form linregress(df) is
    # deprecated in SciPy, and for a 2x2 input (exactly two observations) it
    # split the array by rows, i.e. regressed the wrong pairs of values.
    linreg = lr(df.iloc[:, 0].values, df.iloc[:, 1].values)
    return linreg

# Compute a regression for each indicator and year

In [29]:
# NOTE(review): this rebinds `nodes` — until now the `flow_cent` series loaded
# from the node tables — to the mean edge flow series `edges`. Every cell below
# that reads `nodes` therefore regresses against the edge-based measure.
nodes = edges

In [30]:
# For every dataset and every year 2004-2017, regress the flow measure on each
# indicator column and collect the fitted models in long format.
dfs = []
for ds_name in dataset_paths:
    data = read_data(ds_name)
    # `df` starts as a list of per-year results and is rebound to a DataFrame below.
    df = []
    for year in range(2004, 2018):
        # `timestamp` is left as a notebook-level variable; plot_reg reads it.
        timestamp = pd.Timestamp(f'{year}-01-01')
        one_year = data.loc[timestamp]
        # Scale the flow measure of this year by its own maximum.
        nodes_one_year = nodes.loc[timestamp] / nodes.loc[timestamp].max()

        # One regression per indicator column; the resulting series is named after the year.
        # NOTE(review): this relies on DataFrame.apply keeping each regression
        # result as a single object per column — confirm on the pandas/scipy versions in use.
        df.append(one_year.apply(lambda column: correalte(column, nodes_one_year, year)).rename(year))
    # Years as rows -> transpose -> stack: one row per (indicator, year) pair.
    df = pd.DataFrame(df).transpose().stack().reset_index()
    df.columns = ['Показатель', 'Год', 'model']
    df = df.assign(data_group = ds_name)
    
    dfs.append(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [31]:
# Stack the per-dataset tables and index them by (dataset group, indicator, year).
models = (
    pd.concat(dfs)
    .rename(columns = {'data_group' : 'Группа показателей'})
    .set_index(['Группа показателей', 'Показатель', 'Год'])
)

In [32]:
def form_equation(x):
    """Render a fitted line as text, e.g. ``0,46x+0,1``, with decimal commas.

    Parameters
    ----------
    x : object
        Regression result exposing ``slope`` and ``intercept``.

    Returns
    -------
    str
        ``<slope>x<sign><intercept>`` with both numbers rounded to two
        decimals and every ``.`` replaced by ``,``.
    """
    # A negative intercept brings its own minus sign, so only a non-negative
    # one needs an explicit plus.
    joiner = "+" if x.intercept >= 0 else ""
    equation = f"{round(x.slope, 2)}x{joiner}{round(x.intercept,2)}"
    return equation.replace('.', ',')

In [299]:
# NOTE(review): `lin_reg` is not defined in any cell shown in this notebook, so
# this inspection cell depends on leftover kernel state and will raise a
# NameError on a fresh kernel.
lin_reg.intercept

0.10140873445753898

In [33]:
# Derive the reported statistics from each fitted model object.
models['Уравнение'] = models.model.apply(form_equation)
# `rvalue` is Pearson's correlation coefficient r. The column is labelled R2,
# so store the coefficient of determination r**2 rather than r itself.
models['R2'] = models.model.apply(lambda x: round(x.rvalue ** 2, 2))
models['p'] = models.model.apply(lambda x: round(x.pvalue, 4))
models['Стандартная ошибка'] = models.model.apply(lambda x: round(x.stderr, 4))

In [246]:
# Keep only the models whose rounded p-value is below 0.001.
# `models_True` is not referenced again in the cells shown here.
models_True = models[models.p < .001]

In [36]:
# Drop the raw model objects, move the statistic names into the row index and
# spread the years across the columns.
# This cell overwrites `models`, so it cannot be run twice without rebuilding
# `models` first (the `model` column no longer exists afterwards).
models = (
    models
    .drop(columns = 'model')
    .stack()
    .unstack('Год')
)

In [39]:
# Sort the rows by their index labels before exporting.
models = models.sort_index()

In [40]:
# Write the final table to an Excel workbook (path relative to the working directory).
models.to_excel('analysis/correlations_flow_mean.xlsx')