In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd 
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

import warnings

# Lift pandas' truncation limits so displayed frames show every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/geopandas -- consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append("../") 

from utils.paths import make_dir_line

# Arguments handed to the project's path helper.
modality = 'u'
project = 'Ciencia de los datos'

# `data` is called once per stage name; the results are combined with `/`
# further down to build file paths.
data = make_dir_line(modality, project)

raw, processed, models = (data(stage) for stage in ('raw', 'processed', 'models'))

In [4]:
# Territory lookup table. 'Territory' is renamed to 'DEN_RIP' so it shares a
# column name with the boundary layer's region-name column.
ter = (
    pd.read_parquet(processed / 'ter.parquet.gzip')
    .rename(columns={'Territory': 'DEN_RIP'})
)
ter.head()

Unnamed: 0,ITTER107,DEN_RIP
0,IT,Italy
2,ITCD,Nord
4,ITC,Nord-Ovest
6,ITC1,Piemonte
8,ITC11,Torino


In [5]:
# Value table keyed by territory code (ITTER107), sex, age and STATCIV2.
chunk_file = processed / 'chunk.parquet.gzip'
chunk = pd.read_parquet(chunk_file)
chunk.head()

Unnamed: 0,ITTER107,SEXISTAT1,Age,STATCIV2,Value
0,IT,1,0,1,205371.0
1,IT,2,0,1,195255.0
2,ITCD,1,0,1,94721.0
3,ITCD,2,0,1,89834.0
4,ITC,1,0,1,53931.0


In [6]:
# Boundary layer: one row per region name in DEN_RIP, with its geometry.
geo = gpd.read_file(raw / "Limiti01012023_g/RipGeo01012023_g/RipGeo01012023_g_WGS84.shp")

# Reproject the frame as a whole to EPSG:4326 (lon/lat) instead of reassigning
# only the geometry column, so the frame and its geometry stay in sync.
geo = geo.to_crs(epsg=4326)
geo.head()

Unnamed: 0,COD_RIP,DEN_RIP,Shape_Leng,Shape_Area,geometry
0,1,Nord-Ovest,2330183.0,57929580000.0,"MULTIPOLYGON (((9.85132 44.02340, 9.85122 44.0..."
1,2,Nord-Est,2327765.0,62385090000.0,"MULTIPOLYGON (((10.48080 44.18949, 10.48069 44..."
2,3,Centro,2010203.0,58018650000.0,"MULTIPOLYGON (((13.45526 40.78719, 13.45309 40..."
3,4,Sud,2517097.0,73777950000.0,"MULTIPOLYGON (((15.80124 39.69743, 15.80108 39..."
4,5,Isole,2775538.0,49917780000.0,"MULTIPOLYGON (((12.55987 35.50930, 12.55969 35..."


In [7]:
# Show up to ten rows of the boundary layer.
# NOTE(review): this repeats the preview in the previous cell -- consider removing.
geo.head(10)

Unnamed: 0,COD_RIP,DEN_RIP,Shape_Leng,Shape_Area,geometry
0,1,Nord-Ovest,2330183.0,57929580000.0,"MULTIPOLYGON (((9.85132 44.02340, 9.85122 44.0..."
1,2,Nord-Est,2327765.0,62385090000.0,"MULTIPOLYGON (((10.48080 44.18949, 10.48069 44..."
2,3,Centro,2010203.0,58018650000.0,"MULTIPOLYGON (((13.45526 40.78719, 13.45309 40..."
3,4,Sud,2517097.0,73777950000.0,"MULTIPOLYGON (((15.80124 39.69743, 15.80108 39..."
4,5,Isole,2775538.0,49917780000.0,"MULTIPOLYGON (((12.55987 35.50930, 12.55969 35..."


In [20]:
# Distinct region names in the boundary layer, used below to filter `ter`.
region_names = geo['DEN_RIP']
lista_verificacion = region_names.unique()
print(lista_verificacion)

['Nord-Ovest' 'Nord-Est' 'Centro' 'Sud' 'Isole']


In [21]:
# Keep only the territory rows whose name appears in the boundary layer.
region_mask = ter['DEN_RIP'].isin(lista_verificacion)
df_incluidos = ter.loc[region_mask]
df_incluidos.head()

Unnamed: 0,ITTER107,DEN_RIP
4,ITC,Nord-Ovest
62,ITD,Nord-Est
120,ITE,Centro
174,ITF,Sud
234,ITG,Isole


In [22]:
# Attach the region name to every value row and drop rows for territories
# outside the selected regions (inner join on the territory code).
# validate='many_to_one' raises if a territory code appears more than once in
# df_incluidos, which would otherwise silently duplicate value rows.
df_results = pd.merge(
    chunk,
    df_incluidos,
    on=['ITTER107'],
    how='inner',
    validate='many_to_one',
)
print(df_results.shape)

(6050, 6)


In [23]:
# Preview the merged rows: the columns of `chunk` plus the region name DEN_RIP.
df_results.head()

Unnamed: 0,ITTER107,SEXISTAT1,Age,STATCIV2,Value,DEN_RIP
0,ITC,1,0,1,53931.0,Nord-Ovest
1,ITC,2,0,1,51223.0,Nord-Ovest
2,ITC,2,1,1,51959.0,Nord-Ovest
3,ITC,1,1,1,54899.0,Nord-Ovest
4,ITC,2,2,1,54863.0,Nord-Ovest


In [24]:
# Total `Value` per region and sex; the remaining columns (Age, STATCIV2) are
# dropped by the aggregation.
group_keys = ['ITTER107', 'DEN_RIP', 'SEXISTAT1']
df_results = df_results.groupby(group_keys, as_index=False)['Value'].sum()
df_results.head()

Unnamed: 0,ITTER107,DEN_RIP,SEXISTAT1,Value
0,ITC,Nord-Ovest,1,7741607.0
1,ITC,Nord-Ovest,2,8090334.0
2,ITD,Nord-Est,1,5657451.0
3,ITD,Nord-Est,2,5883881.0
4,ITE,Centro,1,5685827.0


In [25]:
print(df_results.shape, geo.shape)

# Attach each region's boundary columns to its aggregated rows.
# validate='many_to_one' raises if a region name is duplicated in `geo`,
# which would otherwise silently multiply the aggregated rows.
geo_data = pd.merge(
    df_results,
    geo,
    on=['DEN_RIP'],
    how='inner',
    validate='many_to_one',
)
print(geo_data.shape)

(10, 4) (5, 5)
(10, 8)


In [26]:
# Wrap the merged table in a GeoDataFrame, using the merged 'geometry' column
# as the active geometry.
geo_data = gpd.GeoDataFrame(geo_data, geometry='geometry')

# Reproject the frame as a whole to EPSG:4326 instead of reassigning only the
# geometry column, so the frame and its geometry stay in sync.
geo_data = geo_data.to_crs(epsg=4326)
print(type(geo_data))
geo_data.head()

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,ITTER107,DEN_RIP,SEXISTAT1,Value,COD_RIP,Shape_Leng,Shape_Area,geometry
0,ITC,Nord-Ovest,1,7741607.0,1,2330183.0,57929580000.0,"MULTIPOLYGON (((9.85132 44.02340, 9.85122 44.0..."
1,ITC,Nord-Ovest,2,8090334.0,1,2330183.0,57929580000.0,"MULTIPOLYGON (((9.85132 44.02340, 9.85122 44.0..."
2,ITD,Nord-Est,1,5657451.0,2,2327765.0,62385090000.0,"MULTIPOLYGON (((10.48080 44.18949, 10.48069 44..."
3,ITD,Nord-Est,2,5883881.0,2,2327765.0,62385090000.0,"MULTIPOLYGON (((10.48080 44.18949, 10.48069 44..."
4,ITE,Centro,1,5685827.0,3,2010203.0,58018650000.0,"MULTIPOLYGON (((13.45526 40.78719, 13.45309 40..."


In [27]:
# Write the region/sex totals and their geometries to a GeoJSON file.
geojson_path = models / 'geo_data.geojson'
geo_data.to_file(geojson_path, driver='GeoJSON')

In [None]:
# NOTE(review): unexecuted scratch cell; it only prints a literal string and
# nothing else in the notebook depends on it -- consider removing.
print('Vane')