In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import csv
import matplotlib.pyplot as plt

In [2]:
# Load the raw tab-separated road file into `data`: a list of rows, each row a
# list of strings. newline='' hands line-ending handling to the csv module, as
# its documentation requires for files passed to csv.reader.
with open('data/_roads.tcv', newline='') as f:
    reader = csv.reader(f, delimiter="\t")
    data = list(reader)

For easier processing, we convert the list-based data into a pandas DataFrame with one row per LRP.

In [3]:
# Flatten the wide rows (road, lrp, lat, lon, lrp, lat, lon, ...) into one
# record per (road, lrp) so the data can be handled as a long DataFrame.
data_list = []
for row in data:
    if not row:
        # csv.reader yields an empty list for a blank line; nothing to extract.
        continue
    road = row[0]
    # Stopping at len(row) - 2 guarantees a full (lrp, lat, lon) triplet is
    # available, so a trailing empty or incomplete field cannot raise IndexError.
    for triplet in range(1, len(row) - 2, 3):
        lrp, lat, lon = row[triplet:triplet + 3]
        # csv.reader only ever produces strings, so lat/lon are stored as read;
        # the numeric conversion is done later in the notebook.
        data_list.append({'road': road, 'lrp': lrp, 'lat': lat, 'lon': lon})

dataset = pd.DataFrame(data_list)

In [4]:
# Remove the records labelled 0 and 1 and renumber the remaining rows from 0.
# NOTE(review): presumably these two records stem from the file's header line
# -- confirm. The drop is positional, so re-running this cell removes two more
# rows each time.
dataset = dataset.drop(index=[0, 1]).reset_index(drop=True)

Now we can perform some basic checks for missing values and other obvious errors

In [5]:
# Count missing values per column (isna is the canonical name of isnull).
dataset.isna().sum()

road    0
lrp     0
lat     0
lon     0
dtype: int64

Since the .tcv file stores all data as text, every value has been read in as a string. In order to process the coordinates in pandas, we need to convert the latitude and longitude values into floats.

In [6]:
# Convert the coordinate columns from strings to floats.
#
# Each column is rebuilt in one pass instead of writing every cell through
# .loc, which is orders of magnitude slower on tens of thousands of rows.
# dtype=object keeps the values as plain Python floats -- the same result the
# per-cell assignment produced -- so the type census that follows
# (type(x) == float) still recognises them.
# float() raises ValueError on any value that is not a valid number, exactly
# as the per-cell conversion did.
for column in ('lat', 'lon'):
    dataset[column] = pd.Series(
        [float(value) for value in dataset[column]],
        index=dataset.index,
        dtype=object,
    )

In [7]:
# Census of the Python types found among the distinct latitude values, to
# confirm that the float conversion left no strings or ints behind.
# NOTE(review): list_lon is computed but never inspected -- only latitudes are
# counted here.
list_lat = dataset['lat'].unique()
list_lon = dataset['lon'].unique()
typeList = {'Floats': 0, 'Ints': 0, 'Strings': 0}
label_for_type = {float: 'Floats', str: 'Strings', int: 'Ints'}
for value in list_lat:
    # Exact type match (no subclasses), mirroring a type(value) == T test.
    label = label_for_type.get(type(value))
    if label is not None:
        typeList[label] += 1
print(typeList)
print(len(list_lat))

{'Floats': 42661, 'Ints': 0, 'Strings': 0}
42661


In [8]:
# Work on an independent copy: a plain assignment would only alias `dataset`,
# so the in-place outlier corrections applied to roadMap would silently
# rewrite `dataset` as well and make this part of the notebook non-repeatable.
roadMap = dataset.copy()

In [9]:
# Largest latitude difference (in the units of the 'lat' column) tolerated
# between two consecutive LRPs of a road before a point is treated as an outlier.
MAX_LAT_JUMP = 0.1


def _extrapolate(values, near, far):
    """Continue the straight line far -> near by one more step of the same size."""
    return values[near] + (values[near] - values[far])


def _repair_road(lats, lons, threshold=MAX_LAT_JUMP):
    """Repair latitude outliers of a single road in place.

    Parameters
    ----------
    lats, lons : list of float
        Parallel coordinate lists, ordered as the LRPs appear in the frame.
        Both lists are modified in place.
    threshold : float
        Latitude difference to a direct neighbour above which a point is
        considered suspicious.

    Returns
    -------
    list of int
        Positions (within this road) whose coordinates were replaced.

    Detection looks at latitude only; when a point is flagged, latitude and
    longitude are both replaced. Points are processed front to back, so a
    correction is visible to the checks of the points that follow it.
    Roads with fewer than three points are left untouched because there is
    nothing to interpolate or extrapolate from.
    """
    changed = []
    count = len(lats)
    if count < 3:
        return changed
    last = count - 1

    for k in range(count):
        if k == 0:
            # First LRP: far from its successor. Only trust the successors as a
            # basis for extrapolation when they agree with each other,
            # otherwise the successor itself is the likely outlier.
            if not (abs(lats[0] - lats[1]) > threshold
                    and abs(lats[1] - lats[2]) <= threshold):
                continue
            new_lat = _extrapolate(lats, 1, 2)
            new_lon = _extrapolate(lons, 1, 2)
        elif k == last:
            # Last LRP: far from its precursor, and the two precursors agree.
            if not (abs(lats[k] - lats[k - 1]) > threshold
                    and abs(lats[k - 1] - lats[k - 2]) <= threshold):
                continue
            new_lat = _extrapolate(lats, k - 1, k - 2)
            new_lon = _extrapolate(lons, k - 1, k - 2)
        else:
            far_from_prev = abs(lats[k] - lats[k - 1]) > threshold
            far_from_next = abs(lats[k] - lats[k + 1]) > threshold
            if far_from_prev and far_from_next:
                # Isolated spike: replace with the average of both neighbours.
                new_lat = (lats[k - 1] + lats[k + 1]) / 2
                new_lon = (lons[k - 1] + lons[k + 1]) / 2
            elif far_from_prev and k >= 2:
                # Far from the precursor only: accept the linear extension of
                # the two precursors only if it lands closer to the successor
                # than the current value does.
                new_lat = _extrapolate(lats, k - 1, k - 2)
                if abs(lats[k + 1] - new_lat) >= abs(lats[k + 1] - lats[k]):
                    continue
                new_lon = _extrapolate(lons, k - 1, k - 2)
            else:
                continue
        lats[k], lons[k] = new_lat, new_lon
        changed.append(k)
    return changed


# Each road is handled on its own, with neighbours taken by position inside the
# road, so a check never reads a point of another road or a label outside the
# frame. The signed extrapolation keeps the direction of the local trend.
for road, labels in roadMap.groupby('road', sort=False).groups.items():
    road_lats = [float(value) for value in roadMap.loc[labels, 'lat']]
    road_lons = [float(value) for value in roadMap.loc[labels, 'lon']]
    for k in _repair_road(road_lats, road_lons):
        # Write back only the corrected points; all other cells stay untouched.
        roadMap.loc[labels[k], 'lat'] = road_lats[k]
        roadMap.loc[labels[k], 'lon'] = road_lons[k]


In [10]:
# Rebuild the wide, tab-separated layout: one line per road holding the road id
# followed by an (lrp, lat, lon) triplet for each of its points.
# stringTCV stays a list of string fragments (header first, then one complete
# line per road) so it can be concatenated with ''.join(stringTCV).
highList = pd.unique(roadMap.road)  # distinct road ids in order of first appearance
stringTCV = ['road\tlrp1\tlat1\tlon1\tlrp2\tlat2\tlon2\n']
for ID, road in roadMap.groupby('road', sort=False):
    fields = [str(ID)]
    # Read the points from this road's own rows; indexing the full frame with
    # per-road positions would emit the first rows of the frame for every road.
    for lrp, lat, lon in zip(road['lrp'], road['lat'], road['lon']):
        fields.extend((str(lrp), str(lat), str(lon)))
    # Joining with tabs avoids a dangling tab at the end of each line, which
    # would read back as an extra empty field.
    stringTCV.append('\t'.join(fields) + '\n')
        

In [11]:
# Concatenate all collected fragments into the final wide-format text.
# NOTE(review): `strings` is not written to disk anywhere in the visible cells.
strings = ''.join(fragment for fragment in stringTCV)

In [13]:
# Persist the cleaned points as a tab-separated file without the index column.
# This writes the long layout (one row per LRP), not the wide text built above.
roadMap.to_csv(
    path_or_buf='data/processed/_roads.tcv',
    sep='\t',
    index=False,
)