In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd 
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../") 

from utils.paths import make_dir_line

modality = 'u'
project = 'Ciencia de los datos'
data = make_dir_line(modality, project)

raw = data('raw')
processed = data('processed')

# Por RIP - Localidades

Son 5 localidades

In [4]:
geo = gpd.read_file(raw / "Limiti01012023_g/RipGeo01012023_g/RipGeo01012023_g_WGS84.shp")
geo.geometry = geo.geometry.to_crs(epsg = 4326)
geo = geo.loc[:,['COD_RIP','DEN_RIP']]

lista_verificacion = geo['DEN_RIP'].unique()
print('Lista de verificacion: ', lista_verificacion)

geo.head()

Lista de verificacion:  ['Nord-Ovest' 'Nord-Est' 'Centro' 'Sud' 'Isole']


Unnamed: 0,COD_RIP,DEN_RIP
0,1,Nord-Ovest
1,2,Nord-Est
2,3,Centro
3,4,Sud
4,5,Isole


In [5]:
ter = pd.read_parquet(processed / 'ter2.parquet.gzip')
ter.rename(columns={'Territory':'DEN_RIP'}, inplace=True)
ter = ter[ter['DEN_RIP'].isin(lista_verificacion)]
ter.head()

Unnamed: 0,ITTER107,DEN_RIP
4,ITC,Nord-Ovest
62,ITD,Nord-Est
120,ITE,Centro
174,ITF,Sud
234,ITG,Isole


In [6]:
df_aux = pd.merge(ter, geo, on=['DEN_RIP'])
print(ter.shape, geo.shape, df_aux.shape)
df_aux

(5, 2) (5, 2) (5, 3)


Unnamed: 0,ITTER107,DEN_RIP,COD_RIP
0,ITC,Nord-Ovest,1
1,ITD,Nord-Est,2
2,ITE,Centro,3
3,ITF,Sud,4
4,ITG,Isole,5


In [7]:
df_aux.to_parquet(processed / 'g1.parquet.gzip', compression='gzip')

# Por REG - Regiones

Son 20 regiones

In [8]:
geo = gpd.read_file(raw / "Limiti01012023_g/Reg01012023_g/Reg01012023_g_WGS84.shp")
geo.geometry = geo.geometry.to_crs(epsg = 4326)
geo = geo.loc[:,['COD_REG','DEN_REG']]

lista_verificacion = geo['DEN_REG'].unique()
print('Lista de verificacion: ', lista_verificacion)

geo.head()

Lista de verificacion:  ['Piemonte' "Valle d'Aosta" 'Lombardia' 'Trentino-Alto Adige' 'Veneto'
 'Friuli Venezia Giulia' 'Liguria' 'Emilia-Romagna' 'Toscana' 'Umbria'
 'Marche' 'Lazio' 'Abruzzo' 'Molise' 'Campania' 'Puglia' 'Basilicata'
 'Calabria' 'Sicilia' 'Sardegna']


Unnamed: 0,COD_REG,DEN_REG
0,1,Piemonte
1,2,Valle d'Aosta
2,3,Lombardia
3,4,Trentino-Alto Adige
4,5,Veneto


In [9]:
ter = pd.read_parquet(processed / 'ter2.parquet.gzip')
ter = ter.dropna()
ter.rename(columns={'Territory':'DEN_REG'}, inplace=True)

patron = r'^IT[C,D,E,F,G][0-9]$'  # Patrón para buscar "IT" seguido de una letra entre D y G y un solo dígito
ter = ter[ter["ITTER107"].str.match(patron)]

ter.head()

Unnamed: 0,ITTER107,DEN_REG
6,ITC1,Piemonte
24,ITC2,Valle d'Aosta
28,ITC3,Liguria
38,ITC4,Lombardia
66,ITD1,Provincia Autonoma Bolzano / Bozen


In [10]:
df_aux2 = pd.merge(ter, geo, on=['DEN_REG'])
df_aux2 = df_aux2.sort_values(by=['COD_REG'], ascending=True)
print(ter.shape, geo.shape, df_aux2.shape)
df_aux2

(21, 2) (20, 2) (20, 3)


Unnamed: 0,ITTER107,DEN_REG,COD_REG
0,ITC1,Piemonte,1
1,ITC2,Valle d'Aosta,2
3,ITC4,Lombardia,3
4,ITD2,Trentino-Alto Adige,4
5,ITD3,Veneto,5
6,ITD4,Friuli Venezia Giulia,6
2,ITC3,Liguria,7
7,ITD5,Emilia-Romagna,8
8,ITE1,Toscana,9
9,ITE2,Umbria,10


In [11]:
df_aux2.to_parquet(processed / 'g2.parquet.gzip', compression='gzip')

# Por PROV - Provincias

Son 93 provincias

In [12]:
geo = gpd.read_file(raw / "Limiti01012023_g/ProvCM01012023_g/ProvCM01012023_g_WGS84.shp")
geo.geometry = geo.geometry.to_crs(epsg = 4326)
# geo['DEN_PROV'] = np.where(geo['DEN_PROV'] == '-', np.nan, geo['DEN_PROV'])
geo['DEN_PROV'] = np.where(geo['DEN_PROV'] == '-', geo['DEN_CM'], geo['DEN_PROV'])
geo = geo.dropna(subset=['DEN_PROV'])
geo = geo.loc[:,['COD_PROV','DEN_PROV']]

lista_verificacion = geo['DEN_PROV'].unique()
print('Lista de verificacion: ', lista_verificacion)

geo.head()

Lista de verificacion:  ['Torino' 'Vercelli' 'Novara' 'Cuneo' 'Asti' 'Alessandria' 'Aosta'
 'Imperia' 'Savona' 'Genova' 'La Spezia' 'Varese' 'Como' 'Sondrio'
 'Milano' 'Bergamo' 'Brescia' 'Pavia' 'Cremona' 'Mantova' 'Bolzano'
 'Trento' 'Verona' 'Vicenza' 'Belluno' 'Treviso' 'Venezia' 'Padova'
 'Rovigo' 'Udine' 'Gorizia' 'Trieste' 'Piacenza' 'Parma'
 "Reggio nell'Emilia" 'Modena' 'Bologna' 'Ferrara' 'Ravenna'
 "Forli'-Cesena" 'Pesaro e Urbino' 'Ancona' 'Macerata' 'Ascoli Piceno'
 'Massa Carrara' 'Lucca' 'Pistoia' 'Firenze' 'Livorno' 'Pisa' 'Arezzo'
 'Siena' 'Grosseto' 'Perugia' 'Terni' 'Viterbo' 'Rieti' 'Roma' 'Latina'
 'Frosinone' 'Caserta' 'Benevento' 'Napoli' 'Avellino' 'Salerno'
 "L'Aquila" 'Teramo' 'Pescara' 'Chieti' 'Campobasso' 'Foggia' 'Bari'
 'Taranto' 'Brindisi' 'Lecce' 'Potenza' 'Matera' 'Cosenza' 'Catanzaro'
 'Reggio di Calabria' 'Trapani' 'Palermo' 'Messina' 'Agrigento'
 'Caltanissetta' 'Enna' 'Catania' 'Ragusa' 'Siracusa' 'Sassari' 'Nuoro'
 'Cagliari' 'Pordenone' 'Isernia'

Unnamed: 0,COD_PROV,DEN_PROV
0,1,Torino
1,2,Vercelli
2,3,Novara
3,4,Cuneo
4,5,Asti


In [13]:
geo.shape

(107, 2)

In [14]:
ter = pd.read_parquet(processed / 'ter2.parquet.gzip')
ter = ter.dropna()
ter.rename(columns={'Territory':'DEN_PROV'}, inplace=True)

patron = r'^IT((\d{3})|([C,D,E,F,G]((\d[A-Z])|(\d{2}))))$'
ter = ter[ter["ITTER107"].str.match(patron)]

ter.head()

Unnamed: 0,ITTER107,DEN_PROV
8,ITC11,Torino
10,ITC12,Vercelli
12,ITC13,Biella
14,ITC14,Verbano-Cusio-Ossola
16,ITC15,Novara


In [15]:
df_aux3 = pd.merge(ter, geo, on=['DEN_PROV'])
df_aux3 = df_aux3.sort_values(by=['COD_PROV'], ascending=True)
print(ter.shape, geo.shape, df_aux3.shape)
df_aux3.head(5)

(107, 2) (107, 2) (107, 3)


Unnamed: 0,ITTER107,DEN_PROV,COD_PROV
0,ITC11,Torino,1
1,ITC12,Vercelli,2
4,ITC15,Novara,3
5,ITC16,Cuneo,4
6,ITC17,Asti,5


In [16]:
df_aux3.to_parquet(processed / 'g3.parquet.gzip', compression='gzip')
df_aux3.to_csv(processed / 'g3.csv', encoding = 'utf-8-sig', index = False)

# Por COD_CM - Comunas

Son ## comunas

In [17]:
geo = gpd.read_file(raw / "Limiti01012023_g/Com01012023_g/Com01012023_g_WGS84.shp")
geo.geometry = geo.geometry.to_crs(epsg = 4326)
geo.head()

Unnamed: 0,COD_RIP,COD_REG,COD_PROV,COD_CM,COD_UTS,PRO_COM,PRO_COM_T,COMUNE,COMUNE_A,CC_UTS,Shape_Leng,geometry
0,1,1,1,201,201,1001,1001,AgliÃ¨,,0,16097.848297,"POLYGON ((7.78266 45.38717, 7.79106 45.38630, ..."
1,1,1,1,201,201,1002,1002,Airasca,,0,16684.218599,"POLYGON ((7.48795 44.93917, 7.49307 44.93949, ..."
2,1,1,1,201,201,1003,1003,Ala di Stura,,0,29892.941951,"POLYGON ((7.27324 45.33927, 7.27360 45.33890, ..."
3,1,1,1,201,201,1004,1004,Albiano d'Ivrea,,0,16192.65648,"POLYGON ((7.92507 45.44821, 7.92682 45.44821, ..."
4,1,1,1,201,201,1006,1006,Almese,,0,16077.245089,"POLYGON ((7.43490 45.13442, 7.43524 45.13388, ..."


In [19]:
geo['PRO_COM_T'].nunique()

7901

In [20]:
geo.shape

(7901, 12)

In [18]:
print('Vane y Oscar')

Vane y Oscar
