# Calculate Distances
This notebook generates features describing the distance between each address and the nearby water sources:<br>
<ul>
  <li>min_dist: distance in meters between the exam's address and the nearest water source</li>
  <li>count_water_sources: number of water sources contained in the address bounding box</li>
</ul>

Steps to obtain the features described above:
<ol>
  <li>Retrieve water geometry.</li>
  <li>Retrieve address geometry.</li>
  <li>Create bounding box.</li>
  <li>Calculate all distances.</li>
  <li>Retrieve minimum distance from water</li>
  <li>Count the number of water sources inside the bounding box</li>
</ol>

In [1]:
import shapely.wkt
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame, GeoSeries
import numpy as np
from geocode_utils import add_complete_geocode_address
import os
from distance_utils import create_feature_dict, DictDistFeatures, DistanceFeature
import jsonpickle
from split_utils import split_in_multiple_csv
import pyarrow.feather as feather

# Folder holding one "<region>riverLakeUnion.gpkg" water layer per region.
# It can be overridden with the WATER_FOLDER environment variable so the notebook is
# not tied to a single machine; the original local path is kept as the default.
water_folder = os.environ.get('WATER_FOLDER', 'C:/Users/manue/Documents/Tesi/waterlayer/')
# The path is used by plain string concatenation, so it must end with a separator.
if not water_folder.endswith(('/', '\\')):
    water_folder = water_folder + '/'

# Address CSV files are named "<prefix>_<region>_<disease>_output.csv" inside "<region>/".
address_file_prefix = "dataframe_sigla"

# Regions and diseases to process; shorten these lists to run on a subset.
target_region = ["Liguria","VDA","Piemonte","Sicilia","Marche","Abruzzo","Toscana","Campania","Puglia","EmiliaRomagna","TAA","Sardegna","Molise","Calabria","Lazio","FVG","Basilicata","Umbria","Lombardia","Veneto"]
diseases = ["epatite","leishmania","leptospira","salmonella"]

# Region -> diseases whose address CSV is split into several smaller files before processing.
splitted_regions = ["Piemonte", "Lombardia","Veneto"]
splitted_diseases = {"Piemonte": ["salmonella"],
                     "Lombardia": ["leishmania","leptospira","salmonella"],
                     "Veneto" : ["salmonella"]}

# Output folder for the per region/disease distance tables.
folder_distance = "distances"
# Half side of the square bounding box built around each address, in km.
bounding_box_km_offset = 2.5


This function generates a bounding box around each address point and looks up the water sources that fall inside it. <br>
km_offset: half the side length of the square bounding box, in km.

In [20]:
def create_address_bounding_box(address, water, km_offset):
    '''
    Find, for every address, the water geometries that fall in a box around it.

    The box is built in degrees from a distance in km using the approximations
        Latitude:  1 deg = 110.54 km
        Longitude: 1 deg = 111.320*cos(latitude) km

    Parameters
    ----------
    address : object exposing ``.bounds`` with columns minx, miny, maxx, maxy
        (e.g. a GeoDataFrame of address points in lon/lat degrees).
    water : object whose ``['geometry']`` exposes a ``.sindex`` spatial index
        (e.g. the GeoDataFrame of water sources).
    km_offset : float
        Half side of the square box around each address, in km.

    Returns
    -------
    Series with the same index as ``address.bounds``; each value is the list of
    integer positions returned by the spatial index for that address box.
    NOTE(review): per the geopandas documentation ``sindex.intersection`` matches on
    the bounding boxes of the indexed geometries, so these are candidates rather
    than exact geometric intersections - confirm this is acceptable for the count feature.
    '''
    bounds = address.bounds

    #degrees of latitude / longitude corresponding to km_offset;
    #the longitude offset depends on each address' latitude (miny), so it is a Series
    offset_lat = km_offset / 110.54
    offset_long = km_offset / (111.32 * np.cos(np.radians(bounds['miny'])))

    #bounding box of 2*km_offset km side around each address
    bbox = bounds + [0, -offset_lat, 0, offset_lat]
    bbox['minx'] = bounds['minx'] - offset_long
    bbox['maxx'] = bounds['maxx'] + offset_long

    #the spatial index does not change between rows: look it up once, outside the lambda
    water_sindex = water['geometry'].sindex

    #get intersection between bounding box around addresses and water sources
    hits = bbox.apply(lambda row: list(water_sindex.intersection(row)), axis=1)

    return hits

In [21]:
# Number of split CSV files per region, read later as region_n_files[region].
region_n_files = {}
# Some address files are too big to be processed in memory, so they are split into smaller CSVs first.
for region in splitted_regions:
    infolder = region
    outfolder = region + '/split'
    # Create the output folder if it does not exist yet
    os.makedirs(outfolder, exist_ok=True)

    for disease in splitted_diseases[region]:
        in_csv = address_file_prefix + '_' + region + '_' + disease + '_output.csv'
        # 500 is forwarded as the split size; its unit is defined by split_utils
        # (presumably rows per file - TODO confirm).
        n_parts = split_in_multiple_csv(in_csv, infolder, outfolder, 500)

        # A region can have several split diseases (e.g. Lombardia) while the dict is keyed
        # by region only. Assigning directly would keep just the last disease's count, so
        # keep the largest one: consumers loop over range(count) and check that each file
        # exists, so over-counting is harmless while under-counting would skip files.
        region_n_files[region] = max(region_n_files.get(region, 0), n_parts)

  region_n_files[region] = split_in_multiple_csv(in_csv, infolder, outfolder, 500)


In [22]:
# For every region: load its water layer, then for every disease compute the distance
# between each address and every water geometry found in the address bounding box.
# One feather file per region/disease is written into folder_distance.
for region in target_region:
    #STEP 1
    water_file_name = region + "riverLakeUnion"
    water_file_path = water_folder + water_file_name + ".gpkg"
    print("\nStart Processing water for region " + region)

    # The existence check must come before the read: once read_file has succeeded the
    # check can never fail, and a missing layer would raise instead of being skipped.
    if not os.path.exists(water_file_path):
        print("Water layer file not found, region skipped: " + water_file_path)
        continue

    #retrieve geometry from gpkg files (water sources files)
    water_layer_full = gpd.read_file(water_file_path, layer=water_file_name)
    print("Length before remove fountain")
    print(len(water_layer_full))

    #reduce gpkg to useful columns only ('amenity' is not present in every layer)
    has_amenity = 'amenity' in water_layer_full.columns
    water_columns = ['osm_id','osm_type','waterway','name','osm_id_2','osm_type_2','amenity','name_2','geometry']
    if not has_amenity:
        water_columns.remove('amenity')
    water_layer = water_layer_full[water_columns]

    if has_amenity:
        #drop the rows tagged as fountain; reset_index() (without drop=True) keeps the
        #previous row labels in an 'index' column, which ends up in the saved table
        water_layer = water_layer[water_layer['amenity'] != 'fountain'].reset_index()
        print("Length AFTER remove fountain")
        print(len(water_layer))

    # this creates and also provides us access to the spatial index
    water_layer.sindex

    for disease in diseases:
        print("\nProcessing " + region + " " + disease)

        # NOTE(review): splitted_diseases is a dict keyed by REGION, so
        # `disease in splitted_diseases` compares the disease with region names and is
        # always False with the configured values: the split CSVs are never read and the
        # un-split file is used instead. The intended test is probably
        # `disease in splitted_diseases[region]`, but the cell that later reads the
        # feather files builds their names with this same expression, so both places
        # must be changed together. Behaviour is left unchanged here on purpose.
        is_split = region in splitted_regions and disease in splitted_diseases
        num_files = region_n_files[region] if is_split else 1

        #STEP 2
        for i in range(num_files):
            #split files carry a 1-based numeric suffix and live in the "split" sub-folder
            split_suffix = str(i+1) if is_split else ""
            split_folder = 'split/' if is_split else ""

            #retrieve geometry of points from csv files (files with geocode_address)
            addresses_file = (region + "/" + split_folder + address_file_prefix + "_" +
                              region + "_" + disease + "_output" + split_suffix + ".csv")
            if not os.path.exists(addresses_file):
                continue

            adresses_df = pd.read_csv(addresses_file)
            if len(adresses_df) == 0:
                continue

            adresses_df = add_complete_geocode_address(adresses_df)

            # "EPSG:4326" replaces the deprecated {'init': 'epsg:4326'} form, which
            # triggers a pyproj FutureWarning on every GeoDataFrame creation.
            gdf_addr = gpd.GeoDataFrame(
                adresses_df, crs="EPSG:4326",
                geometry=[Point(xy) for xy in zip(adresses_df.longitude, adresses_df.latitude)])

            #STEP 3
            hits = create_address_bounding_box(gdf_addr, water_layer, bounding_box_km_offset)
            #remove lines from series with no water source in selected bounding box
            hits = hits[hits.map(len) > 0]

            if hits.size == 0:
                print("ATTENTION for " + region + " " + disease + " there are no water source in bounding box of ray " + 
                    str(bounding_box_km_offset) + " km\n So related water distance file is not created!")
                continue

            #one row per (address, water source) pair
            tmp = pd.DataFrame({
                "pt_address_idx": np.repeat(hits.index, hits.apply(len)),
                "water_i": np.concatenate(hits.values)
            })

            #join water info (water_i is a position in the water layer, hence the reset index)
            tmp = tmp.join(water_layer.reset_index(drop=True), on="water_i")
            #join address geometry (point)
            tmp = tmp.join(gdf_addr.geometry.rename("point"), on="pt_address_idx")
            #join info on string address
            tmp = tmp.join(gdf_addr.geocode_address, on ="pt_address_idx")

            # Convert back to a GeoDataFrame, so we can do spatial operations
            tmp = GeoDataFrame(tmp, geometry="geometry", crs=gdf_addr.crs)

            #STEP 4
            #calculate distances in a projected (UTM) CRS so that the result is in meters
            utm = tmp.estimate_utm_crs()
            tmp["meters_dist"] = tmp.to_crs(utm).distance(GeoSeries(tmp.point).to_crs(utm))

            #create the output directory if it does not exist yet
            os.makedirs(folder_distance, exist_ok=True)

            #save results in feather
            feather_path = folder_distance + "/" + region + "_" + disease + "_water_dist" + split_suffix
            tmp.to_feather(feather_path)



Start Processing water for region Liguria
Length before remove fountain
33426
Length AFTER remove fountain
33420

Processing Liguria epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Liguria leishmania


  exec(code_obj, self.user_global_ns, self.user_ns)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Liguria leptospira


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Liguria salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region VDA
Length before remove fountain
7902
Length AFTER remove fountain
7901

Processing VDA epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing VDA leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing VDA leptospira


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing VDA salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Piemonte
Length before remove fountain
262284
Length AFTER remove fountain
261982

Processing Piemonte epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Piemonte leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Piemonte leptospira


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Piemonte salmonella


  exec(code_obj, self.user_global_ns, self.user_ns)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Sicilia
Length before remove fountain
21815
Length AFTER remove fountain
21711

Processing Sicilia epatite

Processing Sicilia leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Sicilia leptospira

Processing Sicilia salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Marche
Length before remove fountain
2574
Length AFTER remove fountain
2569

Processing Marche epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Marche leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Marche leptospira

Processing Marche salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Abruzzo
Length before remove fountain
3728
Length AFTER remove fountain
3703

Processing Abruzzo epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Abruzzo leishmania



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Abruzzo leptospira

Processing Abruzzo salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Toscana
Length before remove fountain
55927
Length AFTER remove fountain
55898

Processing Toscana epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Toscana leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Toscana leptospira

Processing Toscana salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Campania
Length before remove fountain
3448
Length AFTER remove fountain
3442

Processing Campania epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Campania leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Campania leptospira

Processing Campania salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Puglia
Length before remove fountain
3204
Length AFTER remove fountain
3180

Processing Puglia epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Puglia leishmania



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Puglia leptospira

Processing Puglia salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region EmiliaRomagna
Length before remove fountain
18968
Length AFTER remove fountain
18945

Processing EmiliaRomagna epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing EmiliaRomagna leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing EmiliaRomagna leptospira

Processing EmiliaRomagna salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region TAA
Length before remove fountain
30993
Length AFTER remove fountain
30989

Processing TAA epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing TAA leishmania



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing TAA leptospira

Processing TAA salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Sardegna
Length before remove fountain
31698
Length AFTER remove fountain
31682

Processing Sardegna epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Sardegna leishmania



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Sardegna leptospira

Processing Sardegna salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Molise
Length before remove fountain
847
Length AFTER remove fountain
843

Processing Molise epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Molise leishmania

Processing Molise leptospira



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Molise salmonella

Start Processing water for region Calabria
Length before remove fountain
2974
Length AFTER remove fountain
2970

Processing Calabria epatite

Processing Calabria leishmania

Processing Calabria leptospira

Processing Calabria salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Lazio
Length before remove fountain
7031
Length AFTER remove fountain
6793

Processing Lazio epatite

Processing Lazio leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Lazio leptospira

Processing Lazio salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region FVG
Length before remove fountain
15134
Length AFTER remove fountain
15133

Processing FVG epatite

Processing FVG leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing FVG leptospira

Processing FVG salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Basilicata
Length before remove fountain
6989

Processing Basilicata epatite

Processing Basilicata leishmania

Processing Basilicata leptospira

Processing Basilicata salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Umbria
Length before remove fountain
2480
Length AFTER remove fountain
2473

Processing Umbria epatite

Processing Umbria leishmania

Processing Umbria leptospira


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)
  in_crs_string = _prepare_from_proj_string(in_crs_string)



Processing Umbria salmonella



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Lombardia
Length before remove fountain
69987
Length AFTER remove fountain
69985

Processing Lombardia epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Lombardia leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Lombardia leptospira


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Lombardia salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Start Processing water for region Veneto
Length before remove fountain
34501
Length AFTER remove fountain
34500

Processing Veneto epatite


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Veneto leishmania


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)



Processing Veneto leptospira

Processing Veneto salmonella


  in_crs_string = _prepare_from_proj_string(in_crs_string)

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  tmp.to_feather(feather_path)


In [23]:
# The features file name embeds the bounding-box offset, with "." replaced by "_"
bb_km_str = str(bounding_box_km_offset).replace(".", "_")
json_output_filename = "distances_features_{}.json".format(bb_km_str)

# Start from an empty dict, or resume from the features_dict stored in an existing
# features file (presumably written by an earlier run of this notebook).
addr_dict = {}

file_exists = os.path.exists(json_output_filename)
if file_exists:
    # NOTE(review): jsonpickle.decode can instantiate arbitrary Python objects;
    # only load files produced by this pipeline.
    with open(json_output_filename, "r") as json_file:
        geo_dict_obj = jsonpickle.decode(json_file.read())
    addr_dict = geo_dict_obj.features_dict
try:
    for region in target_region:
        for disease in diseases:
            num_files = 1
            if region in splitted_regions and disease in splitted_diseases:
                num_files = region_n_files[region]
            
            for i in range(num_files):
                distance_to_water_file = folder_distance + "/" + region + "_" + disease + "_" + "water_dist"
                if region in splitted_regions and disease in splitted_diseases:
                    distance_to_water_file = distance_to_water_file + str(i+1)
                
                if os.path.exists(distance_to_water_file):
                    df_water = pd.read_feather(distance_to_water_file)
                    #check_start = df_water.loc[df_water['geocode_address'] == 'LOC.PRACCIA 16,15010,MORBELLO,AL,Italia', 'pt_address_idx']
                    #print(check_start)
                    #df_water.to_csv(region + "_" + disease + "_starting_from.csv", index=False)
                    
                    #group by address
                    counting = df_water.groupby(['pt_address_idx']).size().reset_index(name='counts')
                    #check_count = counting.loc[counting['pt_address_idx'] == 12068, 'counts']
                    #print(check_count)
                    #counting.to_csv(region + "_" + disease + "_counting.csv", index=False)
                    
                    # Sort on ascending snap distance, so that closest goes to top
                    sorted = df_water.sort_values(by=["meters_dist"])
                    #check_sorted_idx = sorted.loc[sorted['geocode_address'] == 'LOC.PRACCIA 16,15010,MORBELLO,AL,Italia', 'pt_address_idx']
                    #print(check_sorted_idx)
                    #check_sorted_meters = sorted.loc[sorted['geocode_address'] == 'LOC.PRACCIA 16,15010,MORBELLO,AL,Italia', 'meters_dist']
                    #print(check_sorted_meters)
                    #print(df_water.loc[df_water['pt_address_idx'] == 5, 'meters_dist'])
                    #sorted.to_csv(region + "_" + disease + "_sorted.csv", index=False)
                    
                    #STEP 5
                    #water source at min distance
                    closest = sorted.groupby("pt_address_idx").first()
                    #check_closest = closest.loc[closest['geocode_address'] == 'LOC.PRACCIA 16,15010,MORBELLO,AL,Italia']
                    #print(check_closest)
                    #closest.to_csv(region + "_" + disease + "_closest.csv", index=False)
                    
                    #STEP 6
                    closest_count = closest.join(counting.set_index('pt_address_idx'), on="pt_address_idx")
                    #check_closest_count = closest_count.loc[closest_count['geocode_address'] == 'LOC.PRACCIA 16,15010,MORBELLO,AL,Italia', 'counts']
                    #print(check_closest_count)
                    #closest_count.to_csv(region + "_" + disease + "_closest_end.csv", index=False)
                    #get number of water source
                    #counting = df_water.groupby("pt_address_idx").size() 
                    
                    #print(counting.head(7))
                    #closest.to_csv(region + "_" + disease + "_counting.csv", index=False)
                    
                    #closest = closest.join(counting, on ="pt_address_idx")
                    #closest.to_csv(region + "_" + disease + "_closest_final.csv", index=False)

                    addr_dict = create_feature_dict(addr_dict, closest_count)
except Exception as e:
    print('Exception during create features')
    print(e)
finally:
    out_dict = DictDistFeatures(addr_dict)        
        
    with open(json_output_filename, 'w') as file:
        out_dict_jason = jsonpickle.encode(out_dict)
        file.write(out_dict_jason)


The following step adds latitude, longitude and the distance features (min_dist, count_water_sources) to the dataset.

In [2]:
# Attach latitude, longitude, min_dist and count_water_sources to every exam
# row of the sigla dataframe, using the address -> features JSON.
bb_km_str = str(bounding_box_km_offset).replace(".", "_")
json_output_filename = "distances_features_" + bb_km_str + ".json"

df_feather = 'dataframe_sigla'
df_sigla = pd.read_feather(df_feather)

# -1 marks rows whose address has no entry in the features dictionary.
df_sigla["min_dist"] = -1
df_sigla["count_water_sources"] = -1
df_sigla = add_complete_geocode_address(df_sigla)

addr_dict = {}

if os.path.exists(json_output_filename):
    with open(json_output_filename, "r") as json_file:
        geo_dict_obj = jsonpickle.decode(json_file.read())
        addr_dict = geo_dict_obj.features_dict

# Look every address up once (None when unknown) instead of iterating the
# dataframe row by row with scalar .loc writes.
matched_features = df_sigla['geocode_address'].map(addr_dict.get)
has_features = matched_features.notna()
if has_features.any():
    found = matched_features[has_features]
    for column, attribute in (("latitude", "latitude"),
                              ("longitude", "longitude"),
                              ("min_dist", "min_dist"),
                              ("count_water_sources", "water_count")):
        # .to_numpy() makes the assignment positional, in the row order of the mask.
        df_sigla.loc[has_features, column] = found.map(
            lambda feature, name=attribute: getattr(feature, name)
        ).to_numpy()

# Save as feather, still with the -1 sentinel for unmatched rows.
df_sigla.to_feather(df_feather + '_water_dist_' + bb_km_str + '_out')
#df_sigla.to_csv("df_sigla_distances_test_" + bb_km_str + ".csv", index=False)

# Replace min_dist == -1 with the bounding box offset in meters and save again.
# From here on df_sigla['min_dist'] no longer contains -1; unmatched rows can
# still be recognised through count_water_sources == -1 or the '_out' file.
df_sigla.loc[df_sigla['min_dist'] == -1, 'min_dist'] = bounding_box_km_offset * 1000
df_sigla.to_feather(df_feather + '_water_dist_' + bb_km_str)
    

Find the rows whose withdrawal address has no water sources inside the bounding box, and count them grouped by province, municipality and address.

In [4]:

#bb_km_str = "2_5"

# Report the exam rows whose address has no water source within the bounding box.
print("Sigla dataframe Total row " + str(len(df_sigla)))

# Rows without water sources are identified through count_water_sources == -1.
# min_dist cannot be used here: its -1 sentinel has already been replaced by
# the bounding box offset (in meters) when the final feather file was written,
# so `min_dist == -1` would always select nothing.
no_dist = df_sigla[df_sigla['count_water_sources'] == -1]
print("Number of exams rows with no water sources around " + str(bounding_box_km_offset) + " km: " + str(len(no_dist)) + "\n")

# Create the file with the rows that have no water sources within 'bb_km_str' km.
no_dist.to_csv("df_sigla_no_dist_test_" + bb_km_str + ".csv", index=False)

#no_dist_file = pd.read_csv("df_sigla_no_dist_test_" + bb_km_str + ".csv")
# Number of such rows per (province, municipality, address).
count_df = no_dist.groupby(['PROVINCIA PRELIEVO', 'COMUNE PRELIEVO', 'INDIRIZZO PRELIEVO']).size()
count_df.to_csv("conteggio_no_dist_"+ bb_km_str +".csv", index=True)

print("Count addresses with no sources water grouped by provincia, comune, indirizzo:" )
print(count_df) 

Total row
218169
Number of exams row with no water sources around 2_5 km
1162
PROVINCIA PRELIEVO  COMUNE PRELIEVO          INDIRIZZO PRELIEVO    
AL                  ALFIANO NATTA            VIA ASTI 4                1
                    TERZO                    VIA COTELLA 1             1
AT                  MONTECHIARO D'ASTI       V. STAZIONE 73            1
                    ROCCHETTA PALAFEA        VIA CORNIGLIANO 12        7
BA                  ALTAMURA                 V. GRAVISCELLA CS         1
                                                                      ..
VA                  VARESE                   VIA MONTE BIANCO 162      1
                    VIGGIU'                  VIA DEL ROCCOLO 33        1
VR                  ILLASI                   VIA DANTE ALIGHIERI 18    1
                    ROVERE' VERONESE         VIA MASO DI SOTTO 8       5
VT                  CASTIGLIONE IN TEVERINA  VIA MORANDI 2             1
Length: 206, dtype: int64


In [26]:
# Disabled scratch cell: the whole body is one triple-quoted string, so none of
# the code below runs; the cell only echoes the string as its output.
# NOTE(review): the text would not run as-is if re-enabled - `wl` is only
# assigned in a commented-out line, and `typeof` is not a Python built-in
# (unless it is defined elsewhere in the notebook; `type` was probably meant).
'''
from shapely import wkt
#Check if osm_id 934212353 and 565848958 water source is in file of water source of Lazio
#Check if osm_id 540137268 is in file of water of Veneto
#this check is for this address "VIA VIGNE NUOVE 64,00045,GENZANO DI ROMA,RM,Italia"

water_file_name = 'Piemonte' + "riverLakeUnion"

water_layer = gpd.read_file(water_folder + water_file_name + ".gpkg", layer=water_file_name)
print(water_layer.columns)
#print(type(df_lazio_water.loc[2,'osm_id']))
#wl = water_layer.loc[water_layer['osm_id'] == '202216969', 'geometry']
#wl_bbox_test = water_layer.loc[water_layer['osm_id'] == '202216969']
#print(wl.values[0])
#print(len(wl))

#check if exist amenity column in Basilicata
#print('amenity' in water_layer.columns)
#print(water_layer[water_layer['amenity']=='fountain'])
#fountain_row_idx = water_layer.index[water_layer['amenity']=='fountain'].tolist()
#print(fountain_row_idx)

#gdfl = gpd.GeoDataFrame(gpd.GeoSeries(wl))
gdfl = gpd.GeoDataFrame(geometry=[shapely.wkt.loads(wkt.dumps(wl.values[0]))], crs="EPSG:4326")
gdfp = gpd.GeoDataFrame(geometry=[shapely.wkt.loads("POINT (8.51111 44.60652)")], crs="EPSG:4326")


utm = gdfl.estimate_utm_crs()
#distances in meters from line in gdf1 and point in gdfp
dist = gdfl.to_crs(utm).distance(gdfp.to_crs(utm))
print("Distance\n")
print(dist)


#test for check boundig box and intersection
gdfp = gpd.GeoDataFrame(geometry=[shapely.wkt.loads("POINT (8.4200414 45.5978131)")], crs="EPSG:4326")

hits = create_address_bounding_box(gdfp, water_layer, bounding_box_km_offset)
print(typeof(hits))
'''

'\nfrom shapely import wkt\n#Check if osm_id 934212353 and 565848958 water source is in file of water source of Lazio\n#Check if osm_id 540137268 is in file of water of Veneto\n#this check is for this address "VIA VIGNE NUOVE 64,00045,GENZANO DI ROMA,RM,Italia"\n\nwater_file_name = \'Piemonte\' + "riverLakeUnion"\n\nwater_layer = gpd.read_file(water_folder + water_file_name + ".gpkg", layer=water_file_name)\nprint(water_layer.columns)\n#print(type(df_lazio_water.loc[2,\'osm_id\']))\n#wl = water_layer.loc[water_layer[\'osm_id\'] == \'202216969\', \'geometry\']\n#wl_bbox_test = water_layer.loc[water_layer[\'osm_id\'] == \'202216969\']\n#print(wl.values[0])\n#print(len(wl))\n\n#check if exist amenity column in Basilicata\n#print(\'amenity\' in water_layer.columns)\n#print(water_layer[water_layer[\'amenity\']==\'fountain\'])\n#fountain_row_idx = water_layer.index[water_layer[\'amenity\']==\'fountain\'].tolist()\n#print(fountain_row_idx)\n\n#gdfl = gpd.GeoDataFrame(gpd.GeoSeries(wl))\ngdfl 

In [28]:


# Disabled scratch cell: everything between the triple quotes is a string
# literal, so the feather lookups below are not executed.
#check if this address "VIA VIGNE NUOVE 64,00045,GENZANO DI ROMA,RM,Italia" is in 'dataframe_sigla_water_others' file
'''
df = feather.read_feather('dataframe_sigla_water')
addr = df.loc[df['INDIRIZZO PRELIEVO'] == 'LOC. PIANBOSCO 5']
#print(addr)
df = feather.read_feather('dataframe_sigla_water_others') 
addr = df.loc[df['INDIRIZZO PRELIEVO'] == 'VIA VIGNE NUOVE 64']
#print(addr) 

addr = df.loc[df['PROVINCIA PRELIEVO'] == 'RM']
#print(addr)
#print(len(addr))

sud_sardegna = df.loc[df['PROVINCIA PRELIEVO'] == 'SU']
#print(sud_sardegna)
#print(len(sud_sardegna))
'''
#bb_km_str = "2_5"


PROVINCIA PRELIEVO  COMUNE PRELIEVO          INDIRIZZO PRELIEVO    
AL                  ALFIANO NATTA            VIA ASTI 4                1
                    TERZO                    VIA COTELLA 1             1
AT                  MONTECHIARO D'ASTI       V. STAZIONE 73            1
                    ROCCHETTA PALAFEA        VIA CORNIGLIANO 12        7
BA                  ALTAMURA                 V. GRAVISCELLA CS         1
                                                                      ..
VA                  VARESE                   VIA MONTE BIANCO 162      1
                    VIGGIU'                  VIA DEL ROCCOLO 33        1
VR                  ILLASI                   VIA DANTE ALIGHIERI 18    1
                    ROVERE' VERONESE         VIA MASO DI SOTTO 8       5
VT                  CASTIGLIONE IN TEVERINA  VIA MORANDI 2             1
Length: 194, dtype: int64


In [29]:
# Disabled scratch cell: the body is a triple-quoted string and is not executed.
# When enabled it prints the number of rows for province 'VT' and the length of
# one distance file, to compare before and after re-running the pipeline.
#this is for test. Run this before re-execute code and verify if the number of row increases
'''
import pyarrow.feather as feather

df = feather.read_feather('dataframe_sigla_water_others')
addr = df.loc[df['PROVINCIA PRELIEVO'] == 'VT']
print(len(addr))


df = feather.read_feather('distances/Piemonte_salmonella_water_dist')
print(len(df))
'''

"\nimport pyarrow.feather as feather\n\ndf = feather.read_feather('dataframe_sigla_water_others')\naddr = df.loc[df['PROVINCIA PRELIEVO'] == 'VT']\nprint(len(addr))\n\n\ndf = feather.read_feather('distances/Piemonte_salmonella_water_dist')\nprint(len(df))\n"

In [30]:
# Disabled scratch cell: the body is a triple-quoted string and is not executed.
# NOTE(review): if re-enabled, it rewrites the features JSON after deleting one
# address, and the unconditional `del` raises KeyError when that address is no
# longer in the dictionary.
#this code is for some checks
'''
bb_km_str = "2_5"
#code for test if json contains specific address
json_output_filename = "distances_features_" + bb_km_str + ".json"

with open(json_output_filename, "r") as json_file:
    geo_dict_obj = jsonpickle.decode(json_file.read())            
    addr_dict = geo_dict_obj.features_dict
print(addr_dict['LOC. PRACCIA 16,15010,MORBELLO,AL,Italia'].latitude)
print(addr_dict['LOC. PRACCIA 16,15010,MORBELLO,AL,Italia'].longitude)
print(addr_dict['LOC. PRACCIA 16,15010,MORBELLO,AL,Italia'].min_dist)
print(addr_dict['LOC. PRACCIA 16,15010,MORBELLO,AL,Italia'].water_count)


#remove this address, because latitude and longitude are wrong. 
del addr_dict['LOC.PRACCIA,15010,MORBELLO,AL,Italia']

if 'CONTRADA S. ELIA,72015,FASANO,BR,Italia' in addr_dict:
    print('CONTRADA S. ELIA,72015,FASANO,BR,Italia is already in')
else:
    print('CONTRADA S. ELIA,72015,FASANO,BR,Italia ---->DELETED!')

if 'LOC.PRACCIA,15010,MORBELLO,AL,Italia' in addr_dict:
    print('LOC.PRACCIA,15010,MORBELLO,AL,Italia is already in')
else:
    print('LOC.PRACCIA,15010,MORBELLO,AL,Italia ---->DELETED!')

out_dict = DictDistFeatures(addr_dict)        
        
with open(json_output_filename, 'w') as file:
    out_dict_jason = jsonpickle.encode(out_dict)
    file.write(out_dict_jason)
        
#read complete feather and check if exists address
region = 'Piemonte'
disease = 'leishmania'
feather_path = folder_distance + "/" + region + "_" + disease + "_water_dist"

df_water_dist = pd.read_feather(feather_path)
#find = df_water_dist.loc[df_water_dist['geocode_address'].str.contains("15010,MORBELLO", case=False)]
#find = df_water_dist.loc[df_water_dist['geocode_address']=="15010,MORBELLO,AL,Italia"]
print(df_water_dist)
#print(find)
'''


'\nbb_km_str = "2_5"\n#code for test if json contains specific address\njson_output_filename = "distances_features_" + bb_km_str + ".json"\n\nwith open(json_output_filename, "r") as json_file:\n    geo_dict_obj = jsonpickle.decode(json_file.read())            \n    addr_dict = geo_dict_obj.features_dict\nprint(addr_dict[\'LOC. PRACCIA 16,15010,MORBELLO,AL,Italia\'].latitude)\nprint(addr_dict[\'LOC. PRACCIA 16,15010,MORBELLO,AL,Italia\'].longitude)\nprint(addr_dict[\'LOC. PRACCIA 16,15010,MORBELLO,AL,Italia\'].min_dist)\nprint(addr_dict[\'LOC. PRACCIA 16,15010,MORBELLO,AL,Italia\'].water_count)\n\n\n#remove this address, because latitude and longitude are wrong. \ndel addr_dict[\'LOC.PRACCIA,15010,MORBELLO,AL,Italia\']\n\nif \'CONTRADA S. ELIA,72015,FASANO,BR,Italia\' in addr_dict:\n    print(\'CONTRADA S. ELIA,72015,FASANO,BR,Italia is already in\')\nelse:\n    print(\'CONTRADA S. ELIA,72015,FASANO,BR,Italia ---->DELETED!\')\n\nif \'LOC.PRACCIA,15010,MORBELLO,AL,Italia\' in addr_dict:\

In [8]:
# For every region/disease geocoding output CSV that exists, print how many
# rows have a missing latitude.
for region in target_region:
    for disease in diseases:
        # Same split test as the original, evaluated once per region/disease pair.
        is_split = region in splitted_regions and disease in splitted_diseases
        num_files = region_n_files[region] if is_split else 1

        for i in range(num_files):
            # Split outputs live in a 'split/' subfolder and carry a 1-based suffix.
            addresses_file = "".join([
                region + "/",
                'split/' if is_split else "",
                address_file_prefix + "_" + region + "_" + disease + "_output",
                str(i+1) if is_split else "",
                ".csv",
            ])
            if not os.path.exists(addresses_file):
                continue

            adresses_df = pd.read_csv(addresses_file)
            missing_latitude = adresses_df['latitude'].isna()
            inds = np.where(missing_latitude)[0]
            print('Num nan ' + region + ' ' + disease + ' is:' + str(len(inds)))
            

Num nan Liguria epatite is:0
Num nan Liguria leishmania is:0
Num nan Liguria leptospira is:0
Num nan Liguria salmonella is:0
Num nan VDA epatite is:0
Num nan VDA leishmania is:0
Num nan VDA leptospira is:0
Num nan VDA salmonella is:0
Num nan Piemonte epatite is:0
Num nan Piemonte leishmania is:0
Num nan Piemonte leptospira is:0
Num nan Piemonte salmonella is:0
Num nan Sicilia epatite is:0
Num nan Sicilia leishmania is:0
Num nan Sicilia leptospira is:0
Num nan Sicilia salmonella is:0
Num nan Marche epatite is:0
Num nan Marche leishmania is:0
Num nan Marche leptospira is:0
Num nan Marche salmonella is:0
Num nan Abruzzo epatite is:0
Num nan Abruzzo leishmania is:0
Num nan Abruzzo leptospira is:0
Num nan Abruzzo salmonella is:0
Num nan Toscana epatite is:0
Num nan Toscana leishmania is:0
Num nan Toscana leptospira is:0
Num nan Toscana salmonella is:0
Num nan Campania epatite is:0
Num nan Campania leishmania is:0
Num nan Campania leptospira is:0
Num nan Campania salmonella is:0
Num nan Pugl