In [None]:
import os
import shutil
import subprocess
import sys

import numpy as np
import pandas as pd

from utils.coords import *
from utils.shapefiles import *
from analysis.peaksdata import *

# Data

Download the prominence and isolation lists from Andrew Kirmse's mountains project and place them at the paths configured in the next cell:
https://github.com/akirmse/mountains

In [None]:
# locations of the prominence and isolation peak databases
prominenceDB = 'data/prominence-p100.txt'
isolationDB = 'data/alliso-sorted.txt'

# report (without stopping the notebook) when either database is missing
if not all(os.path.exists(dbPath) for dbPath in (prominenceDB, isolationDB)):
    print('ERROR: peak databases not found!')

In [None]:
# region shapefiles
regionShapesDir = 'data/regionShapes'

# One entry per '.shp' file found in the directory (file names only, no path).
# os.listdir returns entries in arbitrary order, so the list is sorted to make the
# processing order (and the progress output) the same on every run and platform.
regionShapes = sorted(f for f in os.listdir(regionShapesDir) if f.endswith('.shp'))

# Filter and unify prominence and isolation peak lists

In [None]:
# output directory for the per-region peak lists
regionPeaksDir = 'data/regionPeaks'

# exist_ok avoids the separate existence check (and the race between check and creation)
os.makedirs(regionPeaksDir, exist_ok=True)

In [None]:
# process each region to filter the database peaks that are inside
# since this process takes a long time, we provide the functions as standalone scripts for batch processing
#
# Notes on how the scripts are launched:
#  - commands are passed as argument lists: a single command string without shell=True is
#    interpreted as the program name on POSIX and fails to start; lists also handle paths
#    containing spaces without manual quoting
#  - sys.executable runs the scripts with the same interpreter as this notebook
#  - check_call raises CalledProcessError when a script fails, so stale intermediate files
#    from a previous region are never moved into another region's output
for region in regionShapes:
    print(region)
    regionShapefile = os.path.join(regionShapesDir, region)

    print(' 1/3 Filtering prominence DB...')
    subprocess.check_call([sys.executable, 'analysis/filterPoints.py',
                           regionShapefile, prominenceDB, 'prom.txt'])
    print(' 2/3 Filtering isolation DB...')
    subprocess.check_call([sys.executable, 'analysis/filterPoints.py',
                           regionShapefile, isolationDB, 'isol.txt'])

    # merge lists
    print(' 3/3 Merging lists...')
    subprocess.check_call([sys.executable, 'analysis/mergePeaklists.py',
                           'isol.txt', 'prom.txt', 'tmppeaks.csv', '--deleteOriginals'])

    # move results to output dir
    shutil.move('tmppeaks.csv', os.path.join(regionPeaksDir, region.replace('.shp', '.csv')))

print('done!')

# Compute statistics 

In [None]:
# output directory for the per-region statistics files
regionStatsDir = 'data/regionStats'

# exist_ok avoids the separate existence check (and the race between check and creation)
os.makedirs(regionStatsDir, exist_ok=True)

In [None]:
# statistics disk radius
# Used to sample locations, to select the peaks around each location, and as the divisor
# that normalizes the 'isolation' and 'saddleDist' columns.
# NOTE(review): presumably kilometres, since the dataset file is named 'regions_%dkm.csv' — confirm
# against sampleShapefileLocations / filterPeaksHaversineDist.
diskRadius = 30

In [None]:
# features whose histograms are exported, in the order their columns are written
writeFeatures = [
    'elevation', 'elevRel', 'prominence', 'promRel',
    'dominance', 'domGroup', 'relevance',
    'isolation', 'isolDir', 'saddleDist', 'saddleDir',
]

def writeHeaderToFile(fout, distributions):
    """Write the CSV header line of a statistics file.

    The header starts with 'lat,lon,peaks' and is followed, for each feature in
    writeFeatures, by one column per entry of distributions[feat]['bins'] except the
    last one (presumably the bin edges, so each column is named after a lower edge).

    Column names are '<feature>_<value>': the value is truncated to an integer for
    elevation, prominence, isolDir and saddleDir, multiplied by 100 and printed with
    two decimals for domGroup, and printed with two decimals for everything else.

    fout: writable text file object.
    distributions: mapping feature name -> mapping with a 'bins' sequence.
    """
    integerLabelled = ('elevation', 'prominence', 'isolDir', 'saddleDir')

    fout.write('lat,lon,peaks')
    for feat in writeFeatures:
        lowerEdges = distributions[feat]['bins'][:-1]
        if feat in integerLabelled:
            labels = ['%s_%d' % (feat, int(edge)) for edge in lowerEdges]
        elif feat == 'domGroup':
            labels = ['%s_%.2f' % (feat, 100*edge) for edge in lowerEdges]
        else:
            labels = ['%s_%.2f' % (feat, edge) for edge in lowerEdges]
        fout.write(''.join(',' + label for label in labels))
    fout.write('\n')

def writeLocationStatsToFile(fout, lat, lon, npeaks, distributions):
    """Write one CSV data line with the histograms computed at a location.

    The line starts with lat and lon (four decimals) and the number of peaks, and is
    followed by every value of distributions[feat]['hist'], formatted as an integer,
    for each feature in writeFeatures (same feature order as the header).

    fout: writable text file object.
    lat, lon: coordinates of the location.
    npeaks: number of peaks used for the histograms.
    distributions: mapping feature name -> mapping with a 'hist' sequence.
    """
    fout.write('%.4f,%.4f,%d' % (lat, lon, npeaks))
    for feat in writeFeatures:
        counts = distributions[feat]['hist']
        fout.write(''.join(',%d' % count for count in counts))
    fout.write('\n')

In [None]:
# process each region (note: it takes a long time!)
# For every region: sample disk centers inside its shapefile, select the region peaks that
# fall in each disk, and write one line of feature histograms per disk to
# <regionStatsDir>/<region>.csv. Disks with too few peaks produce no line.

# minimum number of peaks a disk must contain to be written to the stats file
minDiskPeaks = 20

for region in regionShapes:

    # sample stats locations inside polygon, separated at least 1/2 radius distance
    sampleLocations = sampleShapefileLocations(os.path.join(regionShapesDir, region), diskRadius)
    numSamples = len(sampleLocations)

    # region peaks DB
    df = pd.read_csv(os.path.join(regionPeaksDir, region.replace('.shp', '.csv')))
    df = addExtraColumns(df)

    # normalize distance columns
    df['isolation']  /= diskRadius
    df['saddleDist'] /= diskRadius

    # results file; the with-block closes (and flushes) it even if processing a sample raises
    with open(os.path.join(regionStatsDir, region.replace('.shp', '.csv')), 'w') as fout:
        headerWritten = False

        # compute statistics
        for di, diskCenter in enumerate(sampleLocations):

            # filter peaks in disk using haversine distance
            peaks = filterPeaksHaversineDist(df, diskCenter, diskRadius)

            # only disks with enough peaks are written
            if peaks.shape[0] >= minDiskPeaks:

                # compute statistics
                # diskRadius = 1   to have isolation/saddle dist histograms axis from 0 to 1, note we normalized distances before
                # detailed = False for the classification histograms, for synthesis we double the number of bins
                distributions = computeDistributions(peaks, diskRadius=1.0, detailed=False)

                # write dataset headers if first location
                if not headerWritten:
                    writeHeaderToFile(fout, distributions)
                    headerWritten = True

                # write data line
                writeLocationStatsToFile(fout, diskCenter[0], diskCenter[1], peaks.shape[0], distributions)

            # progress is reported for every sample (also skipped ones) so the line is always
            # terminated with a newline after the last sample of the region
            print('%s: %3d/%3d samples' % (region, di+1, numSamples),
                  end='\r' if di+1 < numSamples else '\n')

print('done!')

# Create dataset

In [None]:
# file where the dataset will be stored
fileDataset = 'data/regions_%dkm.csv' % (int(diskRadius))

# regions to put in the dataset (for example, we could omit certain regions, we can also do it later in classifier)
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# generated dataset identical across runs and platforms
datasetRegions = sorted(f for f in os.listdir(regionStatsDir) if f.endswith('.csv'))

# if True: write frequencies, otherwise keep histogram counts
# we observed that frequencies work better in the classifier
normalize = True

In [None]:
# Build the dataset: one row per sampled location of every region in datasetRegions,
# labelled with the region name in a leading 'terrain' column, and written to fileDataset.
regionFrames = []
for file in datasetRegions:
    statsPath = os.path.join(regionStatsDir, file)

    # name: strip only the extension, so region names that contain dots are kept whole
    terrainName = os.path.splitext(file)[0]

    # the statistics step leaves an empty file when no location of a region had enough
    # peaks; pd.read_csv raises on such files, so they are reported and skipped
    if os.path.getsize(statsPath) == 0:
        print('%4d %s (skipped: empty statistics file)' % (0, terrainName))
        continue

    # read dataframe
    df = pd.read_csv(statsPath)

    # keep number of peaks
    npeaks = df['peaks'].values

    # drop lat, lon, npeaks (only the histogram columns remain)
    df = df.drop(['lat', 'lon', 'peaks'], axis=1)

    # normalize histogram columns? (counts -> frequencies, dividing by the peaks of each row)
    if normalize:
        for c in df.columns:
            df[c] = df[c].astype(np.float32)/npeaks

    # add terrain name column
    df.insert(0, 'terrain', terrainName)
    regionFrames.append(df)

    print('%4d %s' % (df.shape[0], terrainName))

# pd.concat raises an unspecific ValueError on an empty list; fail with a clear message instead
if not regionFrames:
    raise ValueError('no region statistics to merge in %s' % regionStatsDir)

alldf = pd.concat(regionFrames, ignore_index=True)
alldf.to_csv(fileDataset, float_format='%.4f', index=False)