# kmz to csv converter for SITG geodata

Geodata from the [SITG catalog](http://ge.ch/sitg/sitg_catalog/sitg_donnees) comes either as good attribute data in a very inconvenient geodata format, or as good geodata with all other data nested in a very inconvenient `<description>` tag. This script extracts and merges:
* data from the **csv archive** (example: CSV_GOL_DECHETTERIE.zip)
* data from the **kmz archive** (example: KML_GOL_DECHETTERIE.zip)

You need to download both file formats of the dataset. No need to extract the contents of the zip, the script does it for you.

This is a work in progress, feel free to contribute! It is provided “as is”, without warranty of any kind.

In [1]:
from zipfile import ZipFile
import glob

# Unzip


In [2]:
# Collect every zip archive in the working directory. glob returns matches in
# a filesystem-dependent order, so sort them to make reruns deterministic.
zipFiles = sorted(glob.glob('*.zip'))
zipFiles

['CSV_GOL_DECHETTERIE.zip', 'KML_GOL_DECHETTERIE.zip']

In [3]:
# Names of the extracted files; they stay empty when no matching archive is found.
kmz_filename = ''
csv_filename = ''

for zipFile in zipFiles:
    # The context manager closes the archive even if the extraction fails.
    with ZipFile(zipFile, 'r') as folderContent:
        filenames = folderContent.namelist()
        if not filenames:
            # Nothing to extract: skip instead of failing on filenames[0].
            print('Skipping empty archive', zipFile)
            continue

        # Only the first member of each archive is inspected and extracted.
        first_member = filenames[0]
        print('Extracting', first_member)

        # Compare the extension case-insensitively so '.KMZ' / '.CSV' also match.
        lowered = first_member.lower()
        if lowered.endswith('.kmz'):
            kmz_filename = first_member
            print('>> kmz file found:', kmz_filename)
        elif lowered.endswith('.csv'):
            csv_filename = first_member
            print('>> csv file found:', csv_filename)

        folderContent.extract(first_member)

Extracting GOL_DECHETTERIE.csv
>> csv file found: GOL_DECHETTERIE.csv
Extracting GOL_DECHETTERIE.kmz
>> kmz file found: GOL_DECHETTERIE.kmz


# Extract kmz

In [4]:
# Fail early with a clear message when the unzip step found no kmz file.
if not kmz_filename:
    raise FileNotFoundError('No kmz file was found in the downloaded zip archives.')

# Both handles are left open on purpose: the parsing cell reads from `kml`
# and closes the archive afterwards.
kmz = ZipFile(kmz_filename, 'r')

# The main document is 'doc.kml' by convention; if it is absent, fall back to
# the first .kml member of the archive.
kml_members = [member for member in kmz.namelist() if member.lower().endswith('.kml')]
kml_name = 'doc.kml' if ('doc.kml' in kml_members or not kml_members) else kml_members[0]
kml = kmz.open(kml_name, 'r')

These four cells come from **tylerjw's great kmz parser**:

https://github.com/tylerjw/kmz_parser

In [5]:
import xml.sax, xml.sax.handler
class PlacemarkHandler(xml.sax.handler.ContentHandler):
    """SAX handler that collects the text of the elements inside each ``<Placemark>``.

    After parsing, ``mapping`` is a dict keyed by the stripped text of each
    placemark's ``<name>`` element. Every value is a dict that maps the name
    of each element closed inside that placemark to its accumulated text.
    Container elements are recorded as well, so their presence can be tested
    with ``in``. When the same element name is closed several times inside
    one placemark, the texts are concatenated. A later placemark with the
    same name replaces the earlier entry.

    Elements closed before the placemark's ``<name>`` are buffered and
    attached to the entry once the name is known; a placemark without any
    ``<name>`` has no key and is therefore not recorded.
    """

    def __init__(self):
        super().__init__()
        self.inName = False       # True while inside a <name> of a Placemark
        self.inPlacemark = False  # True while inside a <Placemark>
        self.mapping = {}         # placemark name -> {element name: text}
        self.buffer = ""          # text collected since the last closed element
        self.name_tag = ""        # name of the placemark being parsed
        self._named = False       # True once the current placemark's <name> was closed
        self._pending = {}        # elements closed before the name is known

    def startElement(self, name, attributes):
        """Track entry into ``<Placemark>`` and into its ``<name>`` element."""
        if name == "Placemark":
            self.inPlacemark = True
            self.buffer = ""
            self._named = False
            self._pending = {}
        if self.inPlacemark and name == "name":
            self.inName = True

    def characters(self, data):
        """Accumulate character data, but only while inside a placemark."""
        if self.inPlacemark:
            self.buffer += data

    def endElement(self, name):
        """Store the buffered text under the element that was just closed."""
        text = self.buffer.strip('\n\t')
        # The buffer is always reset, whatever the closed element was.
        self.buffer = ""

        if name == "Placemark":
            self.inPlacemark = False
            self.name_tag = ""  # clear current name
            self._named = False
            self._pending = {}
            return

        if not self.inPlacemark:
            return

        if name == "name":
            self.inName = False
            self.name_tag = text.strip()
            # Start the entry with whatever was closed before the name showed
            # up (an empty dict in the usual name-first layout).
            self.mapping[self.name_tag] = self._pending
            self._pending = {}
            self._named = True
            return

        # Before the name is known there is no entry to write to yet, so the
        # value is parked in _pending instead of raising KeyError.
        target = self.mapping[self.name_tag] if self._named else self._pending
        if name in target:
            target[name] += text
        else:
            target[name] = text

In [6]:
# Feed the KML stream through the SAX parser; the results end up in handler.mapping.
parser = xml.sax.make_parser()
handler = PlacemarkHandler()
parser.setContentHandler(handler)
try:
    parser.parse(kml)
finally:
    # Release the stream and the archive even when the KML is malformed and
    # parsing raises.
    kml.close()
    kmz.close()

In [7]:
def build_table(mapping):
    """Render the parsed placemarks as ';'-separated text.

    Parameters:
        mapping: dict of ``{placemark name: {element name: text}}`` as built
            by ``PlacemarkHandler``.

    Returns:
        A string starting with the header ``Name;Coordinates`` followed by one
        ``name;coordinates;`` line per placemark. Placemarks containing a
        ``LookAt`` element (points) come first, then those containing a
        ``LineString`` (lines), then everything else (shapes). A placemark
        without a ``coordinates`` element gets an empty coordinates field.

    Side effects:
        Prints the number of placemarks.
    """
    sep = ';'
    points, lines, shapes = [], [], []

    for key, fields in mapping.items():
        # NOTE: each row ends with a trailing separator, i.e. one more field
        # than the header. This layout is kept as-is because the merge step
        # of the notebook reads the names back from the DataFrame index.
        row = key + sep + fields.get('coordinates', '') + sep + "\n"

        if 'LookAt' in fields:  # points
            points.append(row)
        elif 'LineString' in fields:  # lines
            lines.append(row)
        else:  # shapes
            shapes.append(row)

    print(len(mapping), "objects found.")
    return 'Name' + sep + 'Coordinates\n' + ''.join(points + lines + shapes)

In [8]:
outstr = build_table(handler.mapping)
# Output name: the kmz file name without its extension, plus '-geo.csv'.
out_filename = kmz_filename[:-4] + "-geo.csv"
print('Saving the geodata as', out_filename)
# Write explicitly as UTF-8 (the default encoding pandas uses when the file is
# read back) rather than relying on the platform default; the context manager
# closes the file even if the write fails.
with open(out_filename, "w", encoding="utf-8") as f:
    f.write(outstr)

674 objects found.
Saving the geodata as GOL_DECHETTERIE-geo.csv


# Let's merge the two datasets

In [9]:
import pandas as pd

In [10]:
# Attribute table from the csv archive (';'-separated, decoded as Windows-1252).
df = pd.read_csv(csv_filename, delimiter=';', encoding='Windows 1252')
# Read back the geodata file that was actually written (out_filename is derived
# from the kmz name) instead of rebuilding its name from the csv file name,
# which only matches when both archives share the same base name.
geo = pd.read_csv(out_filename, delimiter=';')
print("df length =", len(df), "\ngeodata length =", len(geo))

df length = 675 
geodata length = 674


In [12]:
# The placemark names ended up in the index of `geo` (see the preview below);
# expose them as a string 'id' column to merge on.
geo = geo.assign(id=geo.index.astype(str))

## Let's have a look

In [15]:
# Preview the first two rows of the geodata table.
geo.head(2)

Unnamed: 0,Name,Coordinates,id
23-094,"6.121241069757926,46.21483207228757...",,23-094
23-053,"6.130219608016631,46.21463137886302...",,23-053


In [16]:
# Preview the first two rows of the attribute table.
df.head(2)

Unnamed: 0,NUMERO_SITE,NO_VGE,STATUT,BENNE,MODELE_BENNE,PROFONDEUR,COMMUNE,ADRESSE,HORAIRES,RIVE,...,BRANCHE,FRIGO,SPECIAUX_MENAGER,ORDURE_MENAGERE,DECHETS_CUISINE,DECHETS_JARDINS,DECHETS_ORGANIQUES,AUTRE_DECHET,SHAPE.AREA,SHAPE.LEN
0,26-013,,Existante,En surface,Non renseigné,0.0,Grand-saconnex,Chemin edouard-sarasin - devant les commerces ...,,Droite,...,,,,,,,,,18.990061,24.07029
1,45-002,,Existante,Enterrée,Villiger,2.85,Vandoeuvres,Route de meinier - parking du centre communal,,Gauche,...,,,,Oui,,,,,22.452571,25.052456


In [20]:
# Default merge key: the first column of the attribute table.
id_column = df.columns[0]
print("The dataset's id column should be", id_column, "(edit next line if it seems wrong).")
# To override the guess, uncomment the next line and put the right column name:
# id_column = 'custom_column_name'

The dataset's id column should be NUMERO_SITE (edit next line if it seems wrong).


In [21]:
# Inner join: only rows whose id is present in both tables are kept.
df2 = pd.merge(df, geo, how='inner', left_on=id_column, right_on='id')

In [23]:
# Export name: the csv file name without its extension, plus '_export.csv'.
merged_filename = csv_filename[:-4]+'_export.csv'
# `sep` is the to_csv keyword for the field separator (`delimiter` belongs to
# read_csv). The file name is reused so the message always matches the file
# that was written.
df2.to_csv(merged_filename, encoding='utf-8', sep=';', index=False)
print("The merged data was saved as", merged_filename)

The merged data was saved as GOL_DECHETTERIE_export.csv
