In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import euclidean_distances

### Matriz de dados $\boldsymbol{X}_{n \times p}$

Seja $\boldsymbol{X}_{4 \times 3} =
        \left[
        \begin{array}{rrr}
        7 & 3 & 9\\
        4 & 6 & 11 \\
        4 & 2 & 5 \\
        5 & 5 & 7\\
        \end{array}
        \right].$

In [2]:
# Build the 4 x 3 data matrix: one row per observation, one column per variable.
X = np.array([
    [7, 3, 9],
    [4, 6, 11],
    [4, 2, 5],
    [5, 5, 7],
])
print(X)

[[ 7  3  9]
 [ 4  6 11]
 [ 4  2  5]
 [ 5  5  7]]


Obter o vetor de médias: $\boldsymbol{\bar{X}}_{p \times 1}$

In [3]:
# Column means stored as a p x 1 column vector. reshape(-1, 1) lets NumPy
# infer p from X instead of hard-coding 3 variables.
Xb = np.mean(X, axis=0).reshape(-1, 1)
print(Xb)

[[ 5.]
 [ 4.]
 [ 8.]]


Matriz de covariâncias amostrais: $\boldsymbol{S}_{p \times p}$

In [4]:
# Sample covariance matrix (p x p). rowvar=False tells np.cov that the
# variables are in the columns, so X does not need to be transposed first.
S = np.cov(X, rowvar=False)
print(S)

[[ 2.         -0.66666667  0.66666667]
 [-0.66666667  3.33333333  3.33333333]
 [ 0.66666667  3.33333333  6.66666667]]


### Distâncias entre as observações

#### Distância euclidiana

$d_{ij} = \sqrt{\sum_{k=1}^{p} (X_{ik} - X_{jk})^2}$

In [5]:
# Pairwise Euclidean distances with SciPy (pdist and squareform are imported
# in the notebook's first cell). squareform expands pdist's condensed output
# into the full square matrix.
squareform(pdist(X, metric='euclidean'))

array([[ 0.        ,  4.69041576,  5.09901951,  3.46410162],
       [ 4.69041576,  0.        ,  7.21110255,  4.24264069],
       [ 5.09901951,  7.21110255,  0.        ,  3.74165739],
       [ 3.46410162,  4.24264069,  3.74165739,  0.        ]])

In [6]:
# Same matrix via scikit-learn (euclidean_distances is imported in the
# notebook's first cell). SciPy remains this notebook's default for distances.
D = euclidean_distances(X, X)
D

array([[ 0.        ,  4.69041576,  5.09901951,  3.46410162],
       [ 4.69041576,  0.        ,  7.21110255,  4.24264069],
       [ 5.09901951,  7.21110255,  0.        ,  3.74165739],
       [ 3.46410162,  4.24264069,  3.74165739,  0.        ]])

Matricialmente:

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$

Obter os vetores $\boldsymbol{X}_{1\cdot}$, $\boldsymbol{X}_{2\cdot}$, $\cdots$

In [7]:
# Split the first four rows of X into p x 1 column vectors so the matrix
# formula (Xi - Xj)^T (Xi - Xj) can be applied directly.
# reshape(-1, 1) infers p from the row length instead of hard-coding 3.
X1p, X2p, X3p, X4p = (row.reshape(-1, 1) for row in X[:4])

In [8]:
# distância entre as observações 1 e 2 


In [9]:
# obter as outras: d13, d14, d23, d24, d34




#### Distância euclidiana padronizada

$d_{ij} = \sqrt{\sum_{k=1}^{p}\dfrac{(X_{ik} - X_{jk})^2}{S_{kk}}}$

In [10]:
# Standardized Euclidean distances with SciPy, expanded by squareform
# into a square matrix.
squareform(pdist(X, metric='seuclidean'))

array([[ 0.        ,  2.79284801,  2.68328157,  1.94935887],
       [ 2.79284801,  0.        ,  3.19374388,  1.78885438],
       [ 2.68328157,  3.19374388,  0.        ,  1.94935887],
       [ 1.94935887,  1.78885438,  1.94935887,  0.        ]])

Matricialmente:

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T \boldsymbol{D}^{-1} (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$,

em que $\boldsymbol{D}^{-1} = \mathrm{diag}(1/S_{kk})$, $k = 1, \ldots, p$.

In [11]:
# D**{-1}


In [12]:
# distância entre as observações 1 e 2 



In [14]:
# as outras distâncias


#### Distância de Mahalanobis

$d^2_{ij} = (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})^T \boldsymbol{S}^{-1} (\boldsymbol{X_{i \cdot}} - \boldsymbol{X_{j \cdot}})$,

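em que $\boldsymbol{S}^{-1}$ é a inversa de $\boldsymbol{S}$.

Observação: neste exemplo $n - 1 = p$ (4 observações e 3 variáveis); nesse caso, todas as distâncias de Mahalanobis entre pares de observações distintas são iguais a $\sqrt{2(n-1)} = \sqrt{6} \approx 2{,}449$, como se vê na matriz abaixo.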

In [15]:
# Mahalanobis distances with SciPy, expanded by squareform into a
# square matrix.
squareform(pdist(X, metric='mahalanobis'))

array([[ 0.        ,  2.44948974,  2.44948974,  2.44948974],
       [ 2.44948974,  0.        ,  2.44948974,  2.44948974],
       [ 2.44948974,  2.44948974,  0.        ,  2.44948974],
       [ 2.44948974,  2.44948974,  2.44948974,  0.        ]])

In [16]:
# S**{-1}


In [17]:
# distância entre as observações 1 e 2 



In [18]:
# as outras distâncias



### Ler um csv (dataframe)

In [19]:
import pandas as pd

In [20]:
# The '.csv' file is expected in the same folder as this notebook;
# if it lives elsewhere, give its path instead of the bare file name.
medidas = pd.read_csv('medidas.csv')

In [21]:
# Preview the first rows of the table.
medidas.head()

Unnamed: 0,torax,cintura,quadril,sexo
0,34,30,32,M
1,37,32,37,M
2,38,30,36,M
3,36,33,39,M
4,38,29,33,M


In [22]:
# Means of the numeric variables only. The frame also holds the text column
# 'sexo'; pandas >= 2.0 no longer drops non-numeric columns silently, so the
# numeric columns are selected explicitly (works on older pandas as well).
medidas.select_dtypes(include='number').mean()

torax      37.00
cintura    28.00
quadril    37.05
dtype: float64

In [23]:
# Sample covariance matrix of the numeric variables. The text column 'sexo'
# is excluded explicitly, because pandas >= 2.0 raises on non-numeric columns
# instead of dropping them.
medidas.select_dtypes(include='number').cov()

Unnamed: 0,torax,cintura,quadril
torax,6.631579,6.368421,3.0
cintura,6.368421,12.526316,3.578947
quadril,3.0,3.578947,5.944737


In [24]:
# Correlation matrix of the numeric variables. The text column 'sexo' is
# excluded explicitly, because pandas >= 2.0 raises on non-numeric columns
# instead of dropping them.
medidas.select_dtypes(include='number').corr()

Unnamed: 0,torax,cintura,quadril
torax,1.0,0.698734,0.4778
cintura,0.698734,1.0,0.414741
quadril,0.4778,0.414741,1.0


### Transformando em matriz

In [25]:
# Take the first three columns and convert them to a float matrix.
# Note: X is rebound here; from this point on it holds the 'medidas' data,
# not the 4 x 3 example matrix defined earlier.
X = medidas.iloc[:, 0:3].values.astype(float)

In [26]:
# n: number of rows of X (observations)
n = X.shape[0]

In [27]:
# p: number of columns of X (variables)
p = X.shape[1]

Obter as matrizes de distâncias (dos três tipos vistos) entre as observações:

In [28]:
# Euclidean distance matrix between all pairs of observations
print(squareform(pdist(X, metric='euclidean')))

[[  0.           6.164414     5.65685425   7.87400787   4.24264069  11.
   12.04159458   8.94427191   7.81024968  10.09950494   7.           7.34846923
    7.81024968   8.30662386   7.48331477   7.07106781   7.81024968
    6.70820393   9.16515139   7.68114575]
 [  6.164414     0.           2.44948974   2.44948974   5.09901951
    6.08276253   5.91607978   3.74165739   3.60555128   4.47213595
    8.30662386   7.07106781   8.54400375  11.18033989   6.164414     6.
    7.68114575   6.08276253   5.09901951   9.43398113]
 [  5.65685425   2.44948974   0.           4.69041576   3.16227766
    5.74456265   7.           4.           2.23606798   4.69041576
    6.40312424   5.47722558   7.28010989   9.64365076   4.89897949
    4.24264069   6.70820393   4.58257569   4.47213595   7.68114575]
 [  7.87400787   2.44948974   4.69041576   0.           7.48331477
    7.14142843   5.           3.74165739   5.38516481   5.09901951
    9.8488578    8.24621125   9.43398113  12.4498996    7.07106781
    7.34

In [29]:
# Keep the Euclidean distance matrix so individual entries can be looked up.
D_e = squareform(pdist(X, metric='euclidean'))

In [30]:
# Euclidean distance between the first and second observations
D_e[0,1]

6.164414002968976

In [31]:
# Standardized Euclidean distance matrix between all pairs of observations
D_ep = squareform(pdist(X, metric='seuclidean'))
print(D_ep)

[[ 0.          2.42525694  2.25923766  3.09261171  1.63117944  4.31154153
   4.79256312  3.63022368  3.10386405  3.99384549  2.23406992  2.60851942
   2.66070483  2.4357548   2.81713727  2.61531153  2.83752977  2.46695889
   3.67394226  2.36146196]
 [ 2.42525694  0.          0.7989602   0.95052074  1.88699164  2.36575304
   2.37536848  1.40856864  1.29478592  1.75657693  2.4357548   2.01557891
   2.54290907  3.45106301  1.78688535  1.69527272  2.33176405  1.7391789
   1.71524016  2.78252138]
 [ 2.25923766  0.7989602   0.          1.68392584  1.26244848  2.18220831
   2.71614412  1.64056588  0.87828847  1.78617323  1.9092774   1.66348698
   2.33556503  3.09062276  1.59792041  1.26345582  2.2541874   1.4313286
   1.73516112  2.33176405]
 [ 3.09261171  0.95052074  1.68392584  0.          2.81713727  2.76350083
   1.9815758   1.2206056   1.95039737  1.96205841  3.02619281  2.40460138
   2.78252138  3.90156473  2.01989622  2.17610258  2.42500192  2.14117463
   1.66348698  3.29020415]
 [ 1.6

In [32]:
# Mahalanobis distance matrix between all pairs of observations
D_m = squareform(pdist(X, metric='mahalanobis'))
print(D_m)

[[ 0.          2.13928548  2.49456078  2.95045513  2.46708024  4.54494525
   4.40174893  3.79408126  3.53388606  3.88030456  3.51118975  3.64493416
   3.56364741  3.19027182  3.65702889  3.60813597  3.65963586  3.3243057
   4.25021398  3.55238013]
 [ 2.13928548  0.          1.34196736  1.36431217  2.48625182  3.29685818
   2.27825898  1.82851444  2.28572778  2.19009974  2.83134654  2.47267105
   2.50145396  2.8964389   2.23613781  2.3896208   2.39760686  2.08203459
   2.47024822  2.9059231 ]
 [ 2.49456078  1.34196736  0.          2.62600661  1.41727802  2.23486314
   2.50537469  1.88319337  1.09504987  1.46571875  1.79612024  1.78733444
   2.27908073  2.33412218  1.8576208   1.51559366  2.46077958  1.51140783
   2.13706151  2.03657544]
 [ 2.95045513  1.36431217  2.62600661  0.          3.84155958  4.37525328
   2.29957259  2.11385738  3.43442979  3.09989176  3.82843862  3.20405215
   2.9059231   3.6506118   2.76696874  3.25464932  2.537635    2.82545541
   2.8695104   3.78869392]
 [ 2.

### Outro dataframe: imrs.csv

In [33]:
# Load the second data set; the bare file name is resolved relative to the
# notebook's working directory.
mg = pd.read_csv('imrs.csv')

In [34]:
# Preview the first rows of the table.
mg.head()

Unnamed: 0,ibge7,meso,nome_meso,micro,nome_micro,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,...,rdpc,renocup,perc_rdpc_140,fam_pbf,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
0,3100104,3105,Triângulo Mineiro/Alto Paranaíba,31019,Patrocínio,Abadia dos Dourados,66.53,0.0,0.0,0.0,...,596.18,920.1,7.94,589,22.9,114.28,11744.69,37.9,26.86,65.97
1,3100203,3106,Central Mineira,31024,Três Marias,Abaeté,56.22,0.0,8.73,27.27,...,707.24,930.85,6.69,1428,27.5,172.19,11466.39,22.08,40.06,65.9
2,3100302,3112,Zona da Mata,31061,Manhuaçu,Abre Campo,30.9,0.0,0.0,27.21,...,444.32,685.72,23.33,1293,20.1,117.72,7998.32,45.8,31.92,63.8
3,3100401,3112,Zona da Mata,31060,Ponte Nova,Acaiaca,0.0,0.0,0.0,20.83,...,357.03,646.71,26.53,389,11.7,61.49,6049.45,25.16,41.87,57.71
4,3100500,3108,Vale do Rio Doce,31039,Ipatinga,Açucena,0.0,34.13,0.0,19.61,...,325.42,514.6,30.7,1172,10.3,52.78,5603.93,41.22,29.89,54.21


In [35]:
# Keep only the municipalities that belong to the Varginha micro-region.
vgn = mg[mg['nome_micro'] == 'Varginha']
vgn.head()

Unnamed: 0,ibge7,meso,nome_meso,micro,nome_micro,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,...,rdpc,renocup,perc_rdpc_140,fam_pbf,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
75,3107109,3110,Sul/Sudoeste de Minas,31050,Varginha,Boa Esperança,60.75,19.07,25.98,14.93,...,599.17,903.37,8.43,2835,24.8,140.03,13127.48,28.65,39.28,64.92
117,3110905,3110,Sul/Sudoeste de Minas,31050,Varginha,Campanha,25.91,0.0,13.03,6.21,...,686.16,1021.62,6.55,786,27.3,149.06,10611.67,33.56,29.17,70.14
122,3111309,3110,Sul/Sudoeste de Minas,31050,Varginha,Campo do Meio,0.0,32.32,17.71,26.09,...,472.71,709.35,8.82,1111,18.5,105.89,7836.89,48.22,28.09,67.53
125,3111606,3110,Sul/Sudoeste de Minas,31050,Varginha,Campos Gerais,29.9,26.98,22.16,3.04,...,503.91,744.47,9.73,2422,18.3,93.65,11985.82,57.93,25.34,70.96
150,3113909,3110,Sul/Sudoeste de Minas,31050,Varginha,Carmo da Cachoeira,32.41,0.0,0.0,33.11,...,525.35,822.31,9.56,987,27.7,133.98,11871.57,47.49,26.25,71.98


In [36]:
# (rows, columns) of the Varginha subset
vgn.shape

(16, 42)

In [37]:
# Renumber the rows 0..n-1. drop=True discards the old index directly, so no
# temporary 'index' column has to be removed afterwards (and the call cannot
# fail if the frame ever gains a column actually named 'index').
vgn = vgn.reset_index(drop=True)

In [38]:
# vgn.info()

In [39]:
# Select specific columns by name rather than by position, so the cell stays
# correct if the column order of the CSV ever changes.
# (Positional equivalent: vgn.iloc[:, [5, 6, 7, 8, 9, 11, 12]].)
vgn[['mun', 'tx_mort_acid_15_29', 'tx_mort_hom_15_29', 'tx_mort_mama',
     'tx_mort_inf', 'tx_analf_25m', 'prop_fund_25m']]

Unnamed: 0,mun,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,tx_analf_25m,prop_fund_25m
0,Boa Esperança,60.75,19.07,25.98,14.93,9.6,36.9
1,Campanha,25.91,0.0,13.03,6.21,9.8,39.3
2,Campo do Meio,0.0,32.32,17.71,26.09,12.5,34.7
3,Campos Gerais,29.9,26.98,22.16,3.04,14.1,32.0
4,Carmo da Cachoeira,32.41,0.0,0.0,33.11,12.4,33.4
5,Coqueiral,0.0,0.0,21.47,32.97,10.4,31.5
6,Elói Mendes,31.99,15.38,7.85,16.08,14.0,36.9
7,Guapé,59.26,0.0,15.17,7.94,10.5,30.0
8,Ilicínea,0.0,65.0,17.77,7.14,11.8,31.0
9,Monsenhor Paulo,0.0,0.0,0.0,18.52,13.8,38.1


In [40]:
# List every column name of the subset.
vgn.columns

Index(['ibge7', 'meso', 'nome_meso', 'micro', 'nome_micro', 'mun',
       'tx_mort_acid_15_29', 'tx_mort_hom_15_29', 'tx_mort_mama',
       'tx_mort_inf', 'existe_conselho_saude', 'tx_analf_25m', 'prop_fund_25m',
       'perc_med_25m', 'tx_med_adeq', 'tx_med_lab', 'existe_conselho_educ',
       'tx_crimes_pat', 'tx_crimes_pes', 'tx_hom', 'existe_uni_infra',
       'perc_energia', 'perc_banagua', 'perc_esgoto', 'perc_lixo', 'pop_total',
       'pop_masc', 'pop_fem', 'raz_dep', 'tx_urb', 'tx_fec', 'esp_vida',
       'rdpc', 'renocup', 'perc_rdpc_140', 'fam_pbf', 'tx_emp_form',
       'rend_pc_form', 'pib_pc', 'perc_agro', 'perc_serv', 'tx_ativ_18m'],
      dtype='object')

In [41]:
# pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.select_dtypes.html
# Keep only the columns whose dtype is float64.
vgn.select_dtypes(include=['float64'])

Unnamed: 0,tx_mort_acid_15_29,tx_mort_hom_15_29,tx_mort_mama,tx_mort_inf,tx_analf_25m,prop_fund_25m,perc_med_25m,tx_med_adeq,tx_med_lab,tx_crimes_pat,...,esp_vida,rdpc,renocup,perc_rdpc_140,tx_emp_form,rend_pc_form,pib_pc,perc_agro,perc_serv,tx_ativ_18m
0,60.75,19.07,25.98,14.93,9.6,36.9,23.56,56.4,100.0,25.9,...,76.0,599.17,903.37,8.43,24.8,140.03,13127.48,28.65,39.28,64.92
1,25.91,0.0,13.03,6.21,9.8,39.3,27.33,39.9,79.6,19.4,...,76.0,686.16,1021.62,6.55,27.3,149.06,10611.67,33.56,29.17,70.14
2,0.0,32.32,17.71,26.09,12.5,34.7,23.05,53.5,100.0,61.0,...,75.0,472.71,709.35,8.82,18.5,105.89,7836.89,48.22,28.09,67.53
3,29.9,26.98,22.16,3.04,14.1,32.0,20.41,52.5,100.0,94.2,...,74.0,503.91,744.47,9.73,18.3,93.65,11985.82,57.93,25.34,70.96
4,32.41,0.0,0.0,33.11,12.4,33.4,22.15,34.9,100.0,8.4,...,73.0,525.35,822.31,9.56,27.7,133.98,11871.57,47.49,26.25,71.98
5,0.0,0.0,21.47,32.97,10.4,31.5,21.05,42.8,100.0,32.3,...,75.0,495.77,714.57,13.64,15.1,71.0,11402.89,53.59,24.72,66.81
6,31.99,15.38,7.85,16.08,14.0,36.9,21.83,43.0,100.0,31.7,...,75.0,577.5,921.97,10.46,25.0,155.01,13057.12,31.12,34.04,66.07
7,59.26,0.0,15.17,7.94,10.5,30.0,19.39,53.9,100.0,0.0,...,75.0,487.68,742.73,7.41,18.4,102.09,8945.31,43.65,27.0,66.59
8,0.0,65.0,17.77,7.14,11.8,31.0,17.42,49.0,100.0,26.1,...,75.0,500.79,784.59,6.93,28.5,156.11,9364.19,45.01,22.27,68.14
9,0.0,0.0,0.0,18.52,13.8,38.1,25.87,58.6,100.0,24.5,...,75.0,555.78,854.92,5.17,32.8,194.62,14457.2,40.17,22.52,71.95


In [42]:
# Store the float64-only columns as a new data frame.
vgn_num = vgn.select_dtypes(include=['float64'])

In [43]:
# (rows, columns) of the float64-only frame
vgn_num.shape

(16, 29)

In [44]:
# Build the data matrix from the first 10 numeric variables. X is rebound
# here and now holds the Varginha data. No dtype cast is needed because
# vgn_num contains only float64 columns.
X = vgn_num.iloc[:, :10].values

In [45]:
# Confirm the dtype of the matrix.
X.dtype

dtype('float64')

In [46]:
# Euclidean distance matrix between the rows of X (one row per municipality)
D_e = squareform(pdist(X, metric='euclidean'))

In [48]:
# Distance between Varginha and Elói Mendes, looked up by row position.
# Row 6 is Elói Mendes; Varginha is presumably row 15 (confirm against vgn.mun).
D_e[6, 15]

47.23051026614047

In [49]:
# Label the rows and columns of the distance matrix with the municipality
# names, so that distances can be looked up by name.
Dedf = pd.DataFrame(data=D_e, index=vgn['mun'], columns=vgn['mun'])

In [50]:
# Look up the distance between a chosen pair of municipalities by name.
Dedf.loc['Varginha', 'Elói Mendes']

47.23051026614047

In [51]:
# Same name-based lookup for another pair of municipalities.
Dedf.loc['Varginha', 'Três Corações']

54.948367582668006

In [54]:
# escolher outros pares de cidades


In [52]:
# Standardized Euclidean distance matrix between the rows of X
D_ep = squareform(pdist(X, metric='seuclidean'))
print(D_ep)

[[ 0.          5.15621062  3.2389154   3.43291207  4.69491486  3.60557178
   3.2738831   2.25106606  4.02830969  4.15085194  4.2588054   4.58874396
   5.41663239  4.03979462  2.37725668  5.00000257]
 [ 5.15621062  0.          5.5040704   5.74996729  5.09782209  5.0362405
   4.60873683  4.93865193  5.98512021  5.30458222  4.80553865  6.01474559
   5.77901989  5.91481489  4.51189756  5.97587947]
 [ 3.2389154   5.5040704   0.          2.72007124  4.24662697  2.73403273
   2.74758293  4.06724959  2.94814016  3.17048629  3.15867095  5.29642435
   3.66630133  4.10346613  2.7197381   5.80451962]
 [ 3.43291207  5.74996729  2.72007124  0.          5.39320995  4.23415171
   3.27470301  4.08621652  3.58358667  4.35023508  4.63662309  4.50264622
   4.95782929  4.16511081  3.01617759  6.19102298]
 [ 4.69491486  5.09782209  4.24662697  5.39320995  0.          3.0643241
   2.41411159  3.90426309  5.26600656  3.68675037  2.10152811  4.44968234
   2.32264275  6.22867425  4.01817319  6.70135627]
 [ 3.60

In [53]:
# Mahalanobis distance matrix between the rows of X
D_m = squareform(pdist(X, metric='mahalanobis'))
print(D_m)

[[ 0.          5.02039848  3.30928281  5.1767278   4.93351469  3.93390025
   3.81444334  3.46614986  4.74268557  4.95738691  5.1573883   4.47197468
   4.33245553  4.07167026  3.97632966  4.37725311]
 [ 5.02039848  0.          4.67567778  5.13646435  5.32966123  4.9672205
   4.64743563  4.80605996  5.31805228  5.24754401  4.98714698  5.23761037
   4.85004398  4.96389868  4.70439855  5.00282396]
 [ 3.30928281  4.67567778  0.          4.05103     4.51898702  4.01395339
   4.01602647  4.02390272  3.9061566   3.57775865  4.08758481  4.73364817
   3.29002362  3.13313753  4.1804486   4.61032237]
 [ 5.1767278   5.13646435  4.05103     0.          5.08939646  4.46878875
   4.1021185   4.21055201  4.88475715  4.95635616  4.71366742  4.67810908
   4.77962736  3.84687379  3.47526645  5.21305865]
 [ 4.93351469  5.32966123  4.51898702  5.08939646  0.          5.12532865
   3.87637808  4.10963974  5.26118947  5.19973124  4.58722081  5.38182372
   4.54179795  4.59396031  4.60590539  5.06013251]
 [ 3.9