In [None]:
import os
import itertools
import operator
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
from utils.coords import *
from analysis.peaksdata import *

In [None]:
# font format for figures
# see: https://stackoverflow.com/questions/33955900/matplotlib-times-new-roman-appears-bold
# Remove the 'roman' weight alias (workaround from the link above for Times New Roman
# appearing bold). pop() with a default keeps the cell re-runnable: a plain `del`
# raises KeyError the second time the cell is executed in the same kernel.
mpl.font_manager.weight_dict.pop('roman', None)
# _rebuild() is a private helper that newer matplotlib releases no longer provide;
# only call it when the installed version still has it.
if hasattr(mpl.font_manager, '_rebuild'):
    mpl.font_manager._rebuild()

plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams["font.size"] = 12
plt.rcParams["font.weight"] = 'normal'
plt.rcParams['pdf.fonttype'] = 42 #TrueType

# Definitions

In [None]:
# full names, for visualization and figures
# Maps each terrain key (the values of the dataset's 'terrain' column) to the
# display name used for sorting, axis labels and printed results.
# Note that some keys differ from their display name (e.g. 'turkey' -> 'Armenian-Highlands').
terrainFullName = {
    'alaska': 'Alaska',
    'alps': 'Alps',
    'altai': 'Altai',
    'andes_aconcagua': 'Andes-Aconcagua',
    'andes_bolivia': 'Andes-Bolivia',
    'andes_chile': 'Andes-Chile',
    'andes_colombia': 'Andes-Colombia',
    'andes_ecuador': 'Andes-Ecuador',
    'andes_peru': 'Andes-Peru',
    'apennines': 'Apennines',
    'appalachians': 'Appalachians',
    'turkey': 'Armenian-Highlands',
    'atlas': 'Atlas',
    'australia': 'Australia-GDR',
    'balkans': 'Balkan-Peninsula',
    'carpathian': 'Carpathians',
    'cascades': 'Cascades',
    'caucasus': 'Caucasus',
    'southafrica': 'Drakensberg',
    'ethiopia': 'Ethiopian-Highlands',
    'gobi': 'Gobi',
    'roraima': 'Guiana-Highlands',
    'highlands': 'Scottish-Highlands',
    'himalaya': 'Himalayas',
    'hindukush': 'Hindu-Kush',
    'iceland': 'Iceland',
    'japan': 'Japan',
    'kamchatka': 'Kamchatka',
    'karakoram': 'Karakoram',
    'kilimanjaro': 'Kilimanjaro',
    'laos': 'Laos',
    'yangshuo': 'Li-River',
    'nevada': 'Nevada',
    'newzealand': 'New-Zealand',
    'papua': 'Papua-New-Guinea',
    'pamir': 'Pamir',
    'patagonia': 'Patagonia',
    'pyrenees': 'Pyrenees',
    'rockies': 'Rockies-Canadian',
    'colorado': 'Rockies-Colorado',
    'sahara': 'Sahara',
    'norway': 'Scandes-Norway',
    'siberia': 'Siberia',
    'mexico': 'Sierra-Madre',
    'sierra': 'Sierra-Nevada',
    'taurus': 'Taurus',
    'tibet': 'Tibetan-Plateau',
    'tienshan': 'Tien-Shan',
    'urals': 'Urals',
    'zagros': 'Zagros'
}

In [None]:
# these are the 15 terrains used for the small confusion matrix in the article
# (terrain keys, as in terrainFullName). In this notebook the list is only referenced
# by the commented-out alternative definition of chosenTerrains.
chosenTerrains15 = [
    'alps',
    'himalaya',
    'karakoram',
    'norway',
    'sahara',
    'appalachians',
    'andes_chile',
    'newzealand',
    'rockies',
    'andes_peru',
    'alaska',
    'patagonia',
    'colorado',
    'caucasus',
    'gobi'
]

# Dataset

In [None]:
# input dataset path: CSV file loaded into regionsDataset with pd.read_csv
# (relative path, so it is resolved against the notebook's working directory)
dsWorld = 'data/regions_30km.csv'

In [None]:
# load the regions dataset (comma-separated, which is already the read_csv default)
regionsDataset = pd.read_csv(dsWorld)
print('Dataset size', regionsDataset.shape)

# distinct terrain labels present in the 'terrain' column, in sorted order
terrainColumn = regionsDataset['terrain']
terrainLabels = np.unique(terrainColumn)
print(terrainLabels.size, terrainLabels)

In [None]:
# which terrains do we want to use? (ordered alphabetically by display name)
#chosenTerrains = sorted(chosenTerrains15, key=lambda x: terrainFullName[x])
chosenTerrains = sorted(terrainLabels, key=terrainFullName.__getitem__)

# check our dataset
print('Chosen terrains:', np.unique(chosenTerrains).size)
print(chosenTerrains)

# list the terrains that have fewer than 100 rows in the dataset
print()
print('Under-sampled terrains')
for terrainKey in chosenTerrains:
    numTerrainSamples = np.sum(regionsDataset['terrain'] == terrainKey)
    if numTerrainSamples < 100:
        print('    ', numTerrainSamples, terrainKey)

In [None]:
# append number of samples to terrain name?
appendSamplesToName = True
if appendSamplesToName:
    for t in terrainLabels:
        numSamples = int(np.sum(regionsDataset['terrain'] == t))
        # under-represented terrains, append samples to name
        if numSamples < 100:
            suffix = ' (%d)' % numSamples
            # terrainFullName is mutated in place, so guard against re-running this
            # cell: without the check every execution would stack another " (n)"
            if not terrainFullName[t].endswith(suffix):
                terrainFullName[t] = terrainFullName[t] + suffix

# Common functions

In [None]:
# helper to get a train/val split
def getTrainVal(df, chosenTerrains, maxTrain = 80, maxValid = 20):
    """
    Draw a random per-terrain train/validation split of the rows of `df`.

    For every terrain in `chosenTerrains`, the index labels of the rows whose
    'terrain' column equals that terrain are shuffled with np.random.permutation.
    A fraction maxTrain/(maxTrain + maxValid) of them (rounded, capped at maxTrain)
    goes to the training set, and up to maxValid of the remaining ones go to the
    validation set.

    Parameters:
        df             -- DataFrame with a 'terrain' column
        chosenTerrains -- iterable of terrain labels to sample from
        maxTrain       -- maximum number of training rows per terrain
        maxValid       -- maximum number of validation rows per terrain

    Returns:
        (trainSet, validSet): two lists of index labels of `df`, grouped by
        terrain in the order of `chosenTerrains`. The two lists are disjoint.
    """
    # avoid a division by zero when both limits are 0 (result: two empty lists)
    total = maxTrain + maxValid
    percTrain = maxTrain/total if total > 0 else 0.0

    trainSet = []
    validSet = []
    for t in chosenTerrains:
        shuffledRows = np.random.permutation(df.index[df['terrain'] == t])
        nSamples = shuffledRows.size
        # builtin int() instead of the np.int alias, which no longer exists in current NumPy
        nTrain = min(maxTrain, int(np.round(percTrain*nSamples)))
        nValid = min(maxValid, nSamples - nTrain)
        # the first nTrain shuffled rows train, the next nValid validate
        trainSet.extend(shuffledRows[:nTrain])
        validSet.extend(shuffledRows[nTrain:nTrain + nValid])

    return trainSet, validSet

In [None]:
# helper to test only a set of the metrics
def getFeatureColumns(df, keepFeatures):
    """
    Return the column names of `df` that belong to the features in `keepFeatures`.

    A column belongs to a feature when the part of its name before the first '_'
    equals the feature name; each feature spans several columns (its histogram
    bins), and all of them are kept. Column order follows df.columns.
    """
    return [col for col in df.columns if col.split('_')[0] in keepFeatures]

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix (Accuracy %)',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    The matrix is drawn as an annotated heat map in a new subplot of the
    current figure (create the figure with plt.figure(...) before calling).

    Parameters:
        cm        -- square confusion matrix; rows are labelled 'True label',
                     columns 'Predicted label'
        classes   -- tick labels, one per row/column of `cm`
        normalize -- if True, each row is shown as rounded integer percentages
                     of its row sum
        title     -- plot title
        cmap      -- matplotlib colormap used for the cells
    """
    cm = np.asarray(cm)
    if normalize:
        rowTotals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # rows without any sample would give 0/0 = NaN; divide by 1 so they stay at 0%
        rowTotals[rowTotals == 0] = 1
        cm = np.round(100*cm.astype('float') / rowTotals).astype(int)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    # use the current figure instead of depending on a global variable named `fig`
    plt.gcf().add_subplot(111)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=18)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    #ax.xaxis.set_ticks_position('both')
    #ax.xaxis.set_tick_params(labeltop=True)

    # the 'd' format only accepts integers; a float matrix (e.g. counts accumulated
    # in a float array and plotted without normalization) needs a float format
    if np.issubdtype(cm.dtype, np.integer):
        fmt = 'd'
    elif np.all(np.mod(cm, 1) == 0):
        fmt = '.0f'
    else:
        fmt = '.2f'

    # cells darker than half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 verticalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=18)
    plt.xlabel('Predicted label', fontsize=18)
    plt.tight_layout()

# A) Regions classifier accuracy test

In [None]:
# number of experiments (independent train/validation splits)
numRuns = 100

# we can experiment with only a subset of metrics (features)
# for example, note that we excluded 'elevation'
accuracyTestFeatures = [
    'elevRel', 'prominence', 'promRel',
    'dominance', 'domGroup', 'relevance',
    'isolation', 'isolDir', 'saddleDist', 'saddleDir',
]
# dataset columns (histogram bins) that belong to the selected features
keepCols = getFeatureColumns(regionsDataset, accuracyTestFeatures)
print(keepCols)

In [None]:
# Now let's execute the test and store the accumulated confusion matrix from all the runs

# The feature matrix and the labels are the same in every run, so extract them
# once instead of rebuilding them inside the loop.
featureValues = regionsDataset[keepCols].values
terrainValues = regionsDataset['terrain'].values

# confusion matrix (sum over all runs; rows/columns follow chosenTerrains)
C = np.zeros((len(chosenTerrains), len(chosenTerrains)))

# accuracies (one per run)
scores = []
for i in range(numRuns):

    print('Run %3d/%3d' % (i+1,numRuns), end='\r' if i+1 < numRuns else '\n')

    # NOTE: getTrainVal returns index labels of regionsDataset, which are used here
    # as row positions; this matches as long as the frame keeps its default 0..n-1 index
    trainSet, validSet = getTrainVal(regionsDataset, chosenTerrains, maxTrain=80, maxValid=20)
    Xt = featureValues[trainSet,:]
    yt = terrainValues[trainSet]
    Xv = featureValues[validSet,:]
    yv = terrainValues[validSet]

    model = RandomForestClassifier(n_estimators=100)
    model.fit(Xt, yt)

    # predict once and reuse the result for both the accuracy and the confusion
    # matrix (model.score would run a second, identical prediction)
    yp = model.predict(Xv)
    scores.append(np.mean(yp == yv))
    C += confusion_matrix(yv, yp, labels=chosenTerrains)

print('done')

In [None]:
# mean and standard deviation of the per-run accuracies, both in percent
meanAccuracy = 100*np.mean(scores)
stdAccuracy = 100*np.std(scores)
print('Mean accuracy:', '%.2f' % meanAccuracy, '%.2f' % stdAccuracy)

In [None]:
# per-class accuracy: diagonal entry of C divided by the sum of its row
classAcc = np.diag(C) / C.sum(axis=1)
# class indices ordered from highest to lowest accuracy
classByAcc = np.argsort(classAcc)[::-1]

for classIdx in classByAcc:
    print(chosenTerrains[classIdx], '%.1f'%(classAcc[classIdx]*100))

print()
print('Median: ', '%.1f'%(100*np.median(classAcc)))

In [None]:
# Plot confusion matrix
# display names, in the same order as the rows/columns of C
renamedTerrains = [terrainFullName[name] for name in chosenTerrains]

# plot_confusion_matrix draws on the figure created here, so keep the name `fig`
fig = plt.figure(figsize=(16,16))
plot_confusion_matrix(C, renamedTerrains, normalize=True, title='Confusion matrix (Accuracy %)')
fig.savefig('confusionMatrix.pdf', dpi=300, bbox_inches='tight', pad_inches=0)

# B) Classify a terrain

In [None]:
# sample disk radius (should be the same as the one used to construct the dataset!)
# Used below to normalize the distance columns of the peaks, to select the peaks
# around each sample location, and to shrink the sampling range.
diskRadius = 30 # km

In [None]:
# input peaks file: CSV with the peaks of the terrain to classify (read with pd.read_csv)
terrainPeaksFile = 'results/synth_alps.csv'

In [None]:
# read the peaks csv and extend it with addExtraColumns
# (helper comes from the star imports; presumably adds derived metric columns -- TODO confirm)
peaks = addExtraColumns(pd.read_csv(terrainPeaksFile))

# normalize distance columns by the sample disk radius
for distanceColumn in ('isolation', 'saddleDist'):
    peaks[distanceColumn] = peaks[distanceColumn] / diskRadius

In [None]:
# compute BBox of terrain (extent of the peak coordinates)
latitudes = peaks['latitude']
longitudes = peaks['longitude']
minLat, maxLat = latitudes.min(), latitudes.max()
minLon, maxLon = longitudes.min(), longitudes.max()

In [None]:
# center of the bounding box
ctrLat = 0.5*(minLat + maxLat)
ctrLon = 0.5*(minLon + maxLon)

# range for sampling random positions. If 0, we will always sample at center location
# Half extent of the bounding box minus the disk radius expressed through km2deg
# (helper from the star imports; presumably km -> degrees, latitude-aware for
# longitude -- TODO confirm), clamped at 0.
halfExtentLat = 0.5*(maxLat - minLat)
halfExtentLon = 0.5*(maxLon - minLon)
hrangeLat = np.maximum(0, halfExtentLat - km2deg(diskRadius))
hrangeLon = np.maximum(0, halfExtentLon - km2deg(diskRadius, ctrLat))

print('Center', ctrLat, ctrLon)
print('Range ', hrangeLat, hrangeLon)

In [None]:
# how many different locations we want to try
numSampleLocations = 20

# obtain positions: uniform offsets in [-1, 1) scaled by the half ranges and
# shifted to the center; one (lat, lon) row per location
rndOffset = np.random.uniform(-1, 1, size=(numSampleLocations, 2))
halfRanges = np.array([hrangeLat, hrangeLon])
centerLatLon = np.array([ctrLat, ctrLon])
sampleLocations = rndOffset*halfRanges + centerLatLon

In [None]:
# compute the distributions at each sampled location
# one list per feature column of the dataset (the 'terrain' label column is skipped);
# every location appends one value to each list it has a histogram bin for
sampledDistribs = {col: [] for col in regionsDataset.columns if col != 'terrain'}

# how the start of a histogram bin is turned into the dataset column name, per feature group
binColumnNames = [
    (('elevation', 'prominence', 'isolDir', 'saddleDir'),
     lambda feat, hbin: '%s_%d' % (feat, int(hbin))),
    (('domGroup',),
     lambda feat, hbin: '%s_%.2f' % (feat, 100*hbin)),
    (('elevRel', 'promRel', 'dominance', 'relevance', 'isolation', 'saddleDist'),
     lambda feat, hbin: '%s_%.2f' % (feat, hbin)),
]

for i,loc in enumerate(sampleLocations):

    # peaks inside disk
    locPeaks = filterPeaksHaversineDist(peaks, loc, diskRadius)
    npeaks = locPeaks.shape[0]

    # metrics distributions
    dists = computeDistributions(locPeaks, diskRadius=1.0, detailed=False)

    # append to arrays: each bin count divided by the number of peaks in the disk
    for featNames, columnName in binColumnNames:
        for feat in featNames:
            binStarts = dists[feat]['bins'][:-1]
            for hbin,hval in zip(binStarts, dists[feat]['hist']):
                sampledDistribs[columnName(feat, hbin)].append(hval/npeaks)

    print('Sampled location %3d/%3d' % (i+1,numSampleLocations), end='\r' if i+1 < numSampleLocations else '\n')

# dataframe with one row per sampled location
samplesDataset = pd.DataFrame.from_dict(sampledDistribs)

### All features

In [None]:
# how many classifications per sample?
numTests = 100

# all features used to evaluate terrain classification previously
allClassifierFeatures = [
    'elevRel', 'prominence', 'promRel',
    'dominance', 'domGroup', 'relevance',
    'isolation', 'isolDir', 'saddleDist', 'saddleDir',
]
# only the features that were taken into account in our synthesis algorithm
synthesisFeatures = [
    'elevRel', 'prominence', 'dominance', 'isolation', 'isolDir',
]

# dataset columns (histogram bins) of the feature set used by the classifier
keepCols = getFeatureColumns(regionsDataset, allClassifierFeatures)

In [None]:
# run the tests
# The feature matrices are the same in every test, so build them once outside the loop.
# The samples are passed as a plain array, like the training data, so the model is
# not given column names at predict time that it was not given at fit time.
trainFeatures = regionsDataset[keepCols].values
trainLabels = regionsDataset['terrain'].values
sampleFeatures = samplesDataset[keepCols].values

# number of times each terrain was predicted, over all tests and sample locations
predictedRegions = {}

for i in range(numTests):

    print('Test %3d/%3d' % (i+1,numTests), end='\r' if i+1 < numTests else '\n')

    # train model on a fresh random subset (up to 100 rows per terrain, no validation rows)
    trainSet, _ = getTrainVal(regionsDataset, chosenTerrains, maxTrain=100, maxValid=0)
    Xt = trainFeatures[trainSet,:]
    yt = trainLabels[trainSet]

    model = RandomForestClassifier(n_estimators=100)
    model.fit(Xt, yt)

    # predict
    preds = model.predict(sampleFeatures)

    # sum predictions
    for p in preds:
        predictedRegions[p] = predictedRegions.get(p, 0) + 1

print('done')

In [None]:
# raw predictions: how many times each terrain key was predicted,
# summed over all tests and all sample locations
print(predictedRegions)

In [None]:
# percentage of each predicted class, most frequent first
# (every test classifies every sample location once)
totalPredictions = numTests*numSampleLocations
sortedPredictions = sorted(predictedRegions.items(), key=lambda item: item[1], reverse=True)
for terrainKey,timesPredicted in sortedPredictions:
    print('%5.2f - %s' % (100*timesPredicted/totalPredictions, terrainFullName[terrainKey]))