In [1]:
# import libraries
import pandas as pd
import random

## Convert Data function

In [2]:
import pandas as pd
import random


# Numeric rank per condition grade so grades can be compared; unlisted grades rank 0.
_CONDITION_RANK = {'A': 1, 'B': 2, 'C': 3, 'D': 4}


def _fill_missing_lengths(df):
    """Fill NaN values in the ``length`` column of ``df`` and return ``df``.

    Pass 1 copies the length of the directly preceding or following record when
    it lies on the same road at the same chainage and has a known length.
    Pass 2 gives every row that is still missing the mean length of its road.
    Row labels are assumed to be consecutive integers (as produced by
    ``pd.read_excel`` without an index column).
    """
    for label in df.index[df['length'].isnull()]:
        for neighbour in (label - 1, label + 1):
            if neighbour not in df.index:
                # first / last record: there is no row on this side
                continue
            same_place = (df.loc[neighbour, 'road'] == df.loc[label, 'road']
                          and df.loc[neighbour, 'km'] == df.loc[label, 'km'])
            if same_place and pd.notna(df.loc[neighbour, 'length']):
                df.loc[label, 'length'] = df.loc[neighbour, 'length']
                break

    # fillna only touches rows that are still NaN, so pass-1 values survive.
    road_means = df.groupby('road')['length'].transform('mean')
    df['length'] = df['length'].fillna(road_means)
    return df


def _label_to_discard(entries, chooser):
    """Return the row label to drop, judged on the first two (label, rank) entries.

    The entry with the lower rank (better condition grade) is dropped, so the
    worse grade is the one that is kept. On equal rank a random entry of
    ``entries`` is dropped.
    """
    (first_label, first_rank), (second_label, second_rank) = entries[0], entries[1]
    if first_rank == second_rank:
        return chooser.choice(entries)[0]
    return first_label if first_rank < second_rank else second_label


def _duplicate_labels_to_drop(df, chooser):
    """Return the unique row labels of duplicate records to remove from ``df``.

    Duplicates are N1 rows whose (lat, lon) pair already occurred; every such
    row is compared with all rows of ``df`` at the same coordinates. Names
    ending in 'L' / 'R' are treated as left / right records.
    """
    n1_rows = df[df['road'] == 'N1']
    duplicate_rows = n1_rows[n1_rows.duplicated(subset=['lat', 'lon'])]

    remove_index = []
    for dup_label in duplicate_rows.index:
        latitude = df.loc[dup_label, 'lat']
        longitude = df.loc[dup_label, 'lon']
        subset = df.loc[(df['lat'] == latitude) & (df['lon'] == longitude)]

        # (label, conditionNum) pairs, grouped by the side marker in the name
        contains_left, contains_right, contains_none = [], [], []
        for row_label in subset.index:
            entry = (row_label, subset.loc[row_label, 'conditionNum'])
            last_letter = subset.loc[row_label, 'name'][-1:]
            if last_letter == 'R':
                contains_right.append(entry)
            elif last_letter == 'L':
                contains_left.append(entry)
            else:
                contains_none.append(entry)

        n_left, n_right, n_none = len(contains_left), len(contains_right), len(contains_none)

        # no side markers at all: drop one of the first two records
        if n_left == 0 and n_right == 0 and n_none >= 2:
            remove_index.append(_label_to_discard(contains_none, chooser))

        # left + right + unmarked: the unmarked record is assumed to be the
        # updated version, so both marked records go
        if n_left == 1 and n_right == 1 and n_none == 1:
            remove_index.append(contains_left[0][0])
            remove_index.append(contains_right[0][0])

        # the same side recorded twice: drop one of the two
        if n_left == 2:
            remove_index.append(_label_to_discard(contains_left, chooser))
        if n_right == 2:
            remove_index.append(_label_to_discard(contains_right, chooser))

        # one marked record next to one unmarked record: drop the marked one
        if n_left == 1 and n_none == 1:
            remove_index.append(contains_left[0][0])
        if n_right == 1 and n_none == 1:
            remove_index.append(contains_right[0][0])

        # exactly one left and one right record are two real bridges: keep both

    # a label can be collected several times; keep the first occurrence only
    return list(dict.fromkeys(remove_index))


def convert_data(seed=None):
    """
    Clean the bridge data of road N1 and write it in the demo-file layout.

    Reads '../data/bridges.xlsx' and '../data/roads.csv', fills missing names
    and lengths, removes duplicate N1 records that share coordinates, keeps the
    N1 bridges below km 287 ordered from high to low chainage, adds one row for
    each end point and writes the result to '../data/bridges_cleaned.csv'.

    Parameters
    ----------
    seed : int or None, optional
        Seed for the random tie-breaks between duplicates of equal condition.
        None (default) uses the global ``random`` module state, as before.

    Returns
    -------
    None
    """
    chooser = random if seed is None else random.Random(seed)

    # import data
    df = pd.read_excel('../data/bridges.xlsx')

    # slice data information; copy so the assignments below act on an independent frame
    df = df[["road", "km", "type", "name", "length", "condition", "lat", "lon", "zone"]].copy()

    # HANDLING MISSING VALUES

    # change NaN values in name with dot i.e. '.'
    df['name'] = df['name'].fillna('.')

    # length is filled from a co-located neighbour first, then from the road mean
    df = _fill_missing_lengths(df)

    # HANDLING DUPLICATES

    # change type of column first
    df['name'] = df['name'].astype(str)

    # replace modifications of right/left in bridge names
    for old, new in ((')', ''), ('RIGHT', 'R'), ('LEFT', 'L'), ('Right', 'R'), ('Left', 'L')):
        df['name'] = df['name'].str.replace(old, new, regex=False)

    # strip leading and trailing whitespace
    df['name'] = df['name'].str.strip()

    # change condition from letters to numbers in order to compare them
    df['conditionNum'] = df['condition'].map(_CONDITION_RANK).fillna(0).astype(int)

    # remove all duplicate records in one go
    df = df.drop(index=_duplicate_labels_to_drop(df, chooser))

    # FORMAT DATAFRAME CONFORM DEMO FILES

    # reverse order
    df = df.iloc[::-1]

    # subset dataframe based on N1 road
    df = df[df['road'] == 'N1']

    # sort values based on km, in reversed direction to drive in opposite direction
    df = df.sort_values(by='km', ascending=False)

    # only use centre of Chittagong, around 287 km from Dhaka
    df = df.loc[df['km'] < 287]

    # renumber the rows from 0
    df = df.reset_index(drop=True)

    # drop unnecessary columns; zone has to go as well, because the end-point
    # rows added below supply exactly nine values
    df = df.drop(columns=['conditionNum', 'zone'])

    # add model type
    df['model_type'] = 'bridge'

    # import roads to get source and sink
    df_roads = pd.read_csv('../data/roads.csv')
    # select only N1 data entries
    df_roads = df_roads[df_roads['road'] == 'N1']

    # NOTE(review): the step-by-step cells further down label the chainage-0
    # point 'sink' and the ~288 km point 'source', the opposite of this
    # function. Confirm which direction is intended.

    # first N1 road point -> 'source', appended after the last bridge
    first_point = df_roads.iloc[0]
    df.loc[len(df)] = [first_point['road'], first_point['chainage'], 'source', 'source', 0,
                       'A', first_point['lat'], first_point['lon'], 'source']
    df = df.sort_index()

    # get a point around centre of Chittagong, which is around 288 km away from Dhaka
    df_roads = df_roads.loc[df_roads['chainage'] < 288]

    # last remaining road point -> 'sink', placed in front of the first bridge
    last_point = df_roads.iloc[-1]
    df.loc[-1] = [last_point['road'], last_point['chainage'], 'sink', 'sink', 0,
                  'A', last_point['lat'], last_point['lon'], 'sink']
    # shift the labels so the new first row gets label 0, then restore the order
    df.index = df.index + 1
    df = df.sort_index()

    # convert dataframe to csv
    df.to_csv('../data/bridges_cleaned.csv')

## Missing Value Management

In [3]:
# Load the bridge workbook again so the cleaning steps can be walked through one by one
df = pd.read_excel('../data/bridges.xlsx')

In [4]:
# Number of records per zone
df.zone.value_counts()

zone
Chittagong    2829
Rangpur       2749
Comilla       2575
Khulna        2189
Rajshahi      2151
Mymensingh    2039
Sylhet        1923
Dhaka         1665
Barisal       1348
Gopalganj      947
Name: count, dtype: int64

In [5]:
# Restrict the frame to the nine attributes the cleaning steps work with
df = df.loc[:, ["road", "km", "type", "name", "length", "condition", "lat", "lon", "zone"]]

In [6]:
# Sanity-check the column selection
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone
0,N1,1.8,Box Culvert,.,11.3,A,23.698739,90.458861,Dhaka
1,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,Dhaka
2,N1,8.976,PC Girder Bridge,Kanch pur Bridge.,394.23,A,23.70506,90.523214,Dhaka
3,N1,10.88,Box Culvert,NOYAPARA CULVERT,6.3,A,23.694391,90.537574,Dhaka
4,N1,10.897,Box Culvert,ADUPUR CULVERT,6.3,A,23.694302,90.537707,Dhaka


In [7]:
# Dtypes and non-null counts per column (shows the gaps in name and length)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20415 entries, 0 to 20414
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   road       20415 non-null  object 
 1   km         20415 non-null  float64
 2   type       20415 non-null  object 
 3   name       20100 non-null  object 
 4   length     20406 non-null  float64
 5   condition  20415 non-null  object 
 6   lat        20415 non-null  float64
 7   lon        20415 non-null  float64
 8   zone       20415 non-null  object 
dtypes: float64(4), object(5)
memory usage: 1.4+ MB


In [8]:
# Count the missing values in every column
df.isnull().sum(axis = 0)

road           0
km             0
type           0
name         315
length         9
condition      0
lat            0
lon            0
zone           0
dtype: int64

In [9]:
# Replace missing names with '.'. The result is assigned back instead of using
# inplace=True on the selected column: that chained form only changes a
# temporary object once pandas' Copy-on-Write behaviour is active.
df = df.assign(name=df['name'].fillna('.'))

In [10]:
# Re-count: only length should still have missing values
df.isnull().sum(axis = 0)

road         0
km           0
type         0
name         0
length       9
condition    0
lat          0
lon          0
zone         0
dtype: int64

In [11]:
# Rows for which no length was recorded
missing = df.loc[df['length'].isnull()]

In [12]:
# Borrow the length of the record directly before or after a gap when that
# record lies on the same road at the same chainage and has a known length.
# Assumes consecutive integer row labels.
for index in missing.index:
    for neighbour in (index - 1, index + 1):
        if neighbour not in df.index:
            # first / last row: there is no record on this side
            continue
        same_place = (df.loc[neighbour, 'road'] == df.loc[index, 'road']
                      and df.loc[neighbour, 'km'] == df.loc[index, 'km'])
        if same_place and pd.notna(df.loc[neighbour, 'length']):
            df.loc[index, 'length'] = df.loc[neighbour, 'length']
            break

In [13]:
# Rows whose length is still unknown after the neighbour look-up
updated_missing = df[df.length.isnull()]

In [14]:
# Give the rows that are STILL missing the mean length of their road.
# Looping over the original `missing` rows would overwrite the lengths that
# were just copied from a neighbouring record.
for index in df.index[df['length'].isnull()]:
    road_name = df.loc[index, 'road']
    road_subset = df[df['road'] == road_name]
    average_length = road_subset.loc[:, 'length'].mean()
    df.loc[index, 'length'] = average_length

In [15]:
# Collect any rows that still have no length
check_leftovers = df[df.length.isnull()]

In [16]:
# An empty frame here means every length has been filled
check_leftovers

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone


## Duplicate Management

In [17]:
# Cast every name to str so the string operations below can be applied to all rows
df['name'] = df['name'].astype(str)

In [18]:
# Normalise the side markers: remove ')' and shorten RIGHT/LEFT (either casing) to R/L
for old, new in ((')', ''), ('RIGHT', 'R'), ('LEFT', 'L'), ('Right', 'R'), ('Left', 'L')):
    df['name'] = df['name'].str.replace(old, new, regex=False)

In [19]:
# Remove leading and trailing whitespace so the last character is the side marker
df['name'] = df['name'].str.strip()

In [20]:
# Candidate duplicates on N1: rows whose (lat, lon) pair already occurred earlier in the frame
duplicateRows = df[df['road'] == 'N1']
duplicateRows = duplicateRows[duplicateRows.duplicated(subset=['lat', 'lon'])]
# Shown sorted by chainage; the result is not assigned, so duplicateRows keeps its order
duplicateRows.sort_values(by=['km'])

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone
12706,N1,8.976,PC Girder Bridge,KANCHPUR PC GIRDER BRIDGE,397.0,C,23.705060,90.523214,Dhaka
9,N1,12.660,PC Girder Bridge,MADAN PUR (R,26.3,A,23.685583,90.551208,Dhaka
10,N1,12.660,PC Girder Bridge,MADANPUR BRIDGE(L,26.3,A,23.685583,90.551208,Dhaka
14588,N1,17.134,PC Girder Bridge,LANGOLBANDO PC GIDER BRIDGE,159.5,B,23.654426,90.576730,Dhaka
14,N1,17.222,PC Girder Bridge,Darikandi Bridge (L,20.5,A,23.653972,90.577410,Dhaka
...,...,...,...,...,...,...,...,...,...
14737,N1,457.365,RCC Girder Bridge,NAITOM PARA RCC GIDER BRIDGE,15.1,B,20.897099,92.285061,Chittagong
463,N1,457.751,RCC Girder Bridge,NAITANG PARA RCC GIDER BRIDGE,9.0,A,20.895090,92.287952,Chittagong
14738,N1,458.213,RCC Girder Bridge,NAITOM PARA RCC GIDER BRIDGE,9.0,B,20.892271,92.290984,Chittagong
15732,N1,458.799,RCC Girder Bridge,Eskahal Bridge,37.3,C,20.890158,92.295718,Chittagong


In [21]:
remove_index = []  # labels of the rows to drop once the loop has finished

# Rank the condition grades so they can be compared (any other grade stays 0)
df['conditionNum'] = 0
df.loc[df['condition'] == 'A', 'conditionNum'] = 1
df.loc[df['condition'] == 'B', 'conditionNum'] = 2
df.loc[df['condition'] == 'C', 'conditionNum'] = 3
df.loc[df['condition'] == 'D', 'conditionNum'] = 4


def pick_duplicate_to_drop(entries):
    """Return the row label to drop, judged on the first two (label, conditionNum) entries.

    The entry with the lower conditionNum (better grade) is dropped, so the
    worse grade is kept. On a tie a random entry of ``entries`` is dropped.
    """
    (first_label, first_rank), (second_label, second_rank) = entries[0], entries[1]
    if first_rank == second_rank:
        return random.choice(entries)[0]
    return first_label if first_rank < second_rank else second_label


for dup_label in duplicateRows.index:
    # retrieve latitude and longitude
    latitude = df.loc[dup_label, 'lat']
    longitude = df.loc[dup_label, 'lon']

    # all records at exactly these coordinates
    subset = df.loc[(df['lat'] == latitude) & (df['lon'] == longitude)]

    # (label, conditionNum) pairs, grouped by the side marker at the end of the name
    contains_left = []
    contains_right = []
    contains_none = []

    # a separate loop variable keeps the outer label intact
    for row_label in subset.index:
        entry = (row_label, subset.loc[row_label, 'conditionNum'])
        last_letter = subset.loc[row_label, 'name'][-1:]

        if last_letter == 'R':
            contains_right.append(entry)
        elif last_letter == 'L':
            contains_left.append(entry)
        else:
            contains_none.append(entry)

    # no side markers at all: drop one of the first two records
    # (two entries are needed for the comparison)
    if len(contains_left) == 0 and len(contains_right) == 0 and len(contains_none) >= 2:
        remove_index.append(pick_duplicate_to_drop(contains_none))

    # left + right + unmarked: drop both marked records
    if len(contains_left) == 1 and len(contains_right) == 1 and len(contains_none) == 1:
        remove_index.append(contains_left[0][0])
        remove_index.append(contains_right[0][0])

    # the same side recorded twice: drop one of the two
    if len(contains_left) == 2:
        remove_index.append(pick_duplicate_to_drop(contains_left))

    if len(contains_right) == 2:
        remove_index.append(pick_duplicate_to_drop(contains_right))

    # one marked record next to one unmarked record: drop the marked one
    if len(contains_left) == 1 and len(contains_none) == 1:
        remove_index.append(contains_left[0][0])

    if len(contains_right) == 1 and len(contains_none) == 1:
        remove_index.append(contains_right[0][0])

    # exactly one left and one right record: nothing is dropped


print(remove_index)

[10, 10, 17, 18, 17, 18, 17, 18, 17, 18, 22, 21, 23, 21, 28, 29, 28, 29, 28, 29, 28, 29, 34, 32, 34, 33, 34, 32, 37, 36, 37, 36, 41, 42, 40, 39, 41, 39, 45, 46, 44, 46, 45, 46, 49, 52, 51, 50, 49, 52, 54, 55, 67, 191, 193, 204, 212, 214, 224, 233, 287, 302, 304, 302, 304, 302, 304, 302, 304, 323, 430, 433, 443, 450, 451, 463, 2, 24, 57, 12712, 12714, 58, 12722, 65, 66, 12729, 148, 14588, 15, 14592, 15, 14592, 15, 14592, 14596, 14595, 14594, 14595, 14594, 14595, 14599, 20, 14600, 20, 14600, 20, 23, 21, 37, 38, 54, 55, 54, 55, 14607, 14615, 14616, 14644, 12731, 14647, 162, 14673, 202, 226, 227, 236, 275, 307, 372, 375, 408, 14725, 14728, 438, 439, 446, 459, 460, 461, 464, 14662, 14663, 163, 164, 165, 172, 14668, 14669, 14671, 15631, 250, 257, 266, 270, 273, 15638, 288, 327, 353, 370, 374, 14707, 15685, 379, 15688, 14710, 15695, 15697, 398, 15701, 14718, 14720, 14721, 14722, 15714, 15716, 422, 14724, 14727, 434, 15723, 15725, 14732, 444, 448, 453, 457, 458, 14739, 15735, 14658, 14659, 156

In [22]:
print('all indexes to remove:', remove_index)
# dict keys are unique and keep insertion order, so this keeps the first occurrence of each label
unique_indexes = list(dict.fromkeys(remove_index))
print('\n')
print('unique indexes to remove:', unique_indexes)

all indexes to remove: [10, 10, 17, 18, 17, 18, 17, 18, 17, 18, 22, 21, 23, 21, 28, 29, 28, 29, 28, 29, 28, 29, 34, 32, 34, 33, 34, 32, 37, 36, 37, 36, 41, 42, 40, 39, 41, 39, 45, 46, 44, 46, 45, 46, 49, 52, 51, 50, 49, 52, 54, 55, 67, 191, 193, 204, 212, 214, 224, 233, 287, 302, 304, 302, 304, 302, 304, 302, 304, 323, 430, 433, 443, 450, 451, 463, 2, 24, 57, 12712, 12714, 58, 12722, 65, 66, 12729, 148, 14588, 15, 14592, 15, 14592, 15, 14592, 14596, 14595, 14594, 14595, 14594, 14595, 14599, 20, 14600, 20, 14600, 20, 23, 21, 37, 38, 54, 55, 54, 55, 14607, 14615, 14616, 14644, 12731, 14647, 162, 14673, 202, 226, 227, 236, 275, 307, 372, 375, 408, 14725, 14728, 438, 439, 446, 459, 460, 461, 464, 14662, 14663, 163, 164, 165, 172, 14668, 14669, 14671, 15631, 250, 257, 266, 270, 273, 15638, 288, 327, 353, 370, 374, 14707, 15685, 379, 15688, 14710, 15695, 15697, 398, 15701, 14718, 14720, 14721, 14722, 15714, 15716, 422, 14724, 14727, 434, 15723, 15725, 14732, 444, 448, 453, 457, 458, 14739, 1

In [23]:
# Drop all collected labels in a single call instead of copying the frame once per label
df = df.drop(index=unique_indexes)

In [24]:
# Repeat the duplicate check on the reduced frame: N1 rows that still share coordinates
duplicateRows = df[df['road'] == 'N1']
duplicateRows = duplicateRows[duplicateRows.duplicated(subset=['lat', 'lon'])]
# Shown sorted by chainage; duplicateRows itself is left unsorted
duplicateRows.sort_values(by=['km'])

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
9,N1,12.66,PC Girder Bridge,MADAN PUR (R,26.3,A,23.685583,90.551208,Dhaka,1
14,N1,17.222,PC Girder Bridge,Darikandi Bridge (L,20.5,A,23.653972,90.57741,Dhaka,1
14590,N1,17.722,PC Girder Bridge,DARIKANDI BRIDGE (L,20.0,B,23.651571,90.581494,Dhaka,2
14593,N1,18.093,PC Girder Bridge,SONARKALI BRIDGE(R,24.5,B,23.649754,90.584549,Dhaka,2
14605,N1,35.408,RCC Girder Bridge,CHAR BOWSHIA BRIDGE(R,24.5,B,23.5314,90.687449,Dhaka,2
14648,N1,165.183,PC Girder Bridge,DHOOM GHAT PC GIRDER,220.6,B,22.914965,91.52629,Comilla,2
134,N1,201.999,Box Culvert,DULAHAJRA(R,4.6,A,22.61499,91.660267,Chittagong,1
14667,N1,255.505,Box Culvert,MUNSHA BOX CULVERT(L,10.8,B,22.308319,91.914215,Chittagong,2
428,N1,448.581,Box Culvert,NOYA PARA BOX CULVERT(R,1.2,A,20.964189,92.251882,Chittagong,1


In [25]:
# Display every remaining group of co-located records for manual inspection
for index in duplicateRows.index: 
    # retrieve latitude and longitude
    latitude = df.loc[index, 'lat'] 
    longitude = df.loc[index, 'lon']

    # all records of df at exactly these coordinates
    subset = df.loc[((df['lat'] == latitude) & (df['lon'] == longitude))] 
    display(subset)

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
8,N1,12.66,PC Girder Bridge,Madanpur Bridge.(L,27.5,A,23.685583,90.551208,Dhaka,1
9,N1,12.66,PC Girder Bridge,MADAN PUR (R,26.3,A,23.685583,90.551208,Dhaka,1


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
13,N1,17.222,PC Girder Bridge,Darikandi Bridge (R,20.45,A,23.653972,90.57741,Dhaka,1
14,N1,17.222,PC Girder Bridge,Darikandi Bridge (L,20.5,A,23.653972,90.57741,Dhaka,1


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
133,N1,201.999,Slab Culvert,PURBOMOHADAB PUR SLAB CULVERT(L,4.6,A,22.61499,91.660267,Chittagong,1
134,N1,201.999,Box Culvert,DULAHAJRA(R,4.6,A,22.61499,91.660267,Chittagong,1


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
427,N1,448.581,Box Culvert,NOYA PARA (MUSJID BOX CULVERT(L,1.2,A,20.964189,92.251882,Chittagong,1
428,N1,448.581,Box Culvert,NOYA PARA BOX CULVERT(R,1.2,A,20.964189,92.251882,Chittagong,1


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
14589,N1,17.722,PC Girder Bridge,DARIKANDI BRIDGE (R,20.0,B,23.651571,90.581494,Dhaka,2
14590,N1,17.722,PC Girder Bridge,DARIKANDI BRIDGE (L,20.0,B,23.651571,90.581494,Dhaka,2


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
14591,N1,18.093,PC Girder Bridge,SONARKALI BRIDGE(L,24.5,B,23.649754,90.584549,Dhaka,2
14593,N1,18.093,PC Girder Bridge,SONARKALI BRIDGE(R,24.5,B,23.649754,90.584549,Dhaka,2


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
14604,N1,35.408,PC Girder Bridge,BORAKANDI BRIDGE(L,24.5,B,23.5314,90.687449,Dhaka,2
14605,N1,35.408,RCC Girder Bridge,CHAR BOWSHIA BRIDGE(R,24.5,B,23.5314,90.687449,Dhaka,2


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
92,N1,165.183,PC Girder Bridge,DumGhat Bridge(L,222.43,A,22.914965,91.52629,Comilla,1
14648,N1,165.183,PC Girder Bridge,DHOOM GHAT PC GIRDER,220.6,B,22.914965,91.52629,Comilla,2


Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
174,N1,255.505,Box Culvert,tassi bazer(R,3.0,A,22.308319,91.914215,Chittagong,1
14667,N1,255.505,Box Culvert,MUNSHA BOX CULVERT(L,10.8,B,22.308319,91.914215,Chittagong,2


## Format data

In [26]:
# State before formatting: duplicates removed, conditionNum helper column still present
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
0,N1,1.8,Box Culvert,.,11.3,A,23.698739,90.458861,Dhaka,1
1,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,Dhaka,1
3,N1,10.88,Box Culvert,NOYAPARA CULVERT,6.3,A,23.694391,90.537574,Dhaka,1
4,N1,10.897,Box Culvert,ADUPUR CULVERT,6.3,A,23.694302,90.537707,Dhaka,1
5,N1,11.296,Box Culvert,NAYABARI KASPUR BOX CULVERT,8.3,A,23.69236,90.540918,Dhaka,1


In [27]:
# Reverse the row order (last record first)
df = df.iloc[::-1]

In [28]:
# Check the reversal: the highest row labels now come first
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,conditionNum
20414,Z8910,33.53,RCC Girder Bridge,Jor Bridge 2,6.0,D,22.474325,90.557335,Barisal,4
20413,Z8910,28.88,RCC Girder Bridge,Narainpur Bridge,9.5,D,22.51065,90.535314,Barisal,4
20412,Z8814,20.73,Baily with Steel Deck,Kalbari Bridge,30.5,D,22.282704,89.968512,Barisal,4
20411,Z8810,3.981,Steel Beam & RCC Slab,.,27.2,D,22.519353,90.310063,Barisal,4
20410,Z8806,83.728,Box Culvert,.,1.78,D,22.183448,90.299962,Barisal,4


In [29]:
# Renumber the rows from 0; the previous labels are kept in a new 'index' column
# NOTE: not idempotent - running this cell again on the same frame raises because 'index' already exists
df = df.reset_index()

In [30]:
# Confirm the row count after de-duplication and that no column has missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20263 entries, 0 to 20262
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         20263 non-null  int64  
 1   road          20263 non-null  object 
 2   km            20263 non-null  float64
 3   type          20263 non-null  object 
 4   name          20263 non-null  object 
 5   length        20263 non-null  float64
 6   condition     20263 non-null  object 
 7   lat           20263 non-null  float64
 8   lon           20263 non-null  float64
 9   zone          20263 non-null  object 
 10  conditionNum  20263 non-null  int64  
dtypes: float64(4), int64(2), object(5)
memory usage: 1.7+ MB


In [31]:
# First rows after renumbering (old labels visible in the 'index' column)
df.head()

Unnamed: 0,index,road,km,type,name,length,condition,lat,lon,zone,conditionNum
0,20414,Z8910,33.53,RCC Girder Bridge,Jor Bridge 2,6.0,D,22.474325,90.557335,Barisal,4
1,20413,Z8910,28.88,RCC Girder Bridge,Narainpur Bridge,9.5,D,22.51065,90.535314,Barisal,4
2,20412,Z8814,20.73,Baily with Steel Deck,Kalbari Bridge,30.5,D,22.282704,89.968512,Barisal,4
3,20411,Z8810,3.981,Steel Beam & RCC Slab,.,27.2,D,22.519353,90.310063,Barisal,4
4,20410,Z8806,83.728,Box Culvert,.,1.78,D,22.183448,90.299962,Barisal,4


In [32]:
# Last rows after renumbering: the start of N1 is now at the end of the frame
df.tail()

Unnamed: 0,index,road,km,type,name,length,condition,lat,lon,zone,conditionNum
20258,5,N1,11.296,Box Culvert,NAYABARI KASPUR BOX CULVERT,8.3,A,23.69236,90.540918,Dhaka,1
20259,4,N1,10.897,Box Culvert,ADUPUR CULVERT,6.3,A,23.694302,90.537707,Dhaka,1
20260,3,N1,10.88,Box Culvert,NOYAPARA CULVERT,6.3,A,23.694391,90.537574,Dhaka,1
20261,1,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,Dhaka,1
20262,0,N1,1.8,Box Culvert,.,11.3,A,23.698739,90.458861,Dhaka,1


In [33]:
# conditionNum was only needed to compare duplicates
df = df.drop("conditionNum", axis='columns')

In [34]:
# Discard the old row labels that reset_index() stored in the 'index' column
df = df.drop("index", axis='columns')

In [35]:
# Check that both helper columns are gone
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone
0,Z8910,33.53,RCC Girder Bridge,Jor Bridge 2,6.0,D,22.474325,90.557335,Barisal
1,Z8910,28.88,RCC Girder Bridge,Narainpur Bridge,9.5,D,22.51065,90.535314,Barisal
2,Z8814,20.73,Baily with Steel Deck,Kalbari Bridge,30.5,D,22.282704,89.968512,Barisal
3,Z8810,3.981,Steel Beam & RCC Slab,.,27.2,D,22.519353,90.310063,Barisal
4,Z8806,83.728,Box Culvert,.,1.78,D,22.183448,90.299962,Barisal


In [36]:
# Every record read from the bridge file gets model_type 'bridge'
df['model_type'] = 'bridge'

In [37]:
# Keep only the records of road N1
df = df[df['road'] == 'N1']

In [38]:
# Check the N1 subset (still in reversed file order, not yet sorted by km)
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,model_type
1032,N1,453.372,Slab Culvert,DONMIA,2.7,D,20.925389,92.264945,Chittagong,bridge
1033,N1,436.789,RCC Girder Bridge,Naya Bazar(2 Bridge,9.4,D,21.056294,92.226645,Chittagong,bridge
1034,N1,429.081,RCC Girder Bridge,Tasse Bridge,12.8,D,21.114558,92.198021,Chittagong,bridge
1035,N1,427.622,RCC Girder Bridge,Wheke Kang Bridge,6.4,D,21.125104,92.191367,Chittagong,bridge
1036,N1,426.915,RCC Girder Bridge,Balu Khali Bridge,7.9,D,21.129768,92.187047,Chittagong,bridge


In [39]:
# Order by chainage, highest km first
df = df.sort_values(by = 'km', ascending = False)

In [40]:
# Check the ordering: the records with the highest chainage come first
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,zone,model_type
4677,N1,460.113,RCC Girder Bridge,YAKNAT BRIDGE,35.9,C,20.880985,92.297777,Chittagong,bridge
4678,N1,459.866,Box Culvert,BUS STAND TEKNAF BOX CULVERT,1.5,C,20.88307,92.298163,Chittagong,bridge
19892,N1,459.681,Box Culvert,BUS STAND TEKNAF BOX CULVERT,1.5,A,20.884567,92.298716,Chittagong,bridge
4679,N1,458.799,RCC Girder Bridge,Eskahal Bridge,37.3,C,20.890158,92.295718,Chittagong,bridge
5658,N1,458.213,RCC Girder Bridge,NAITOM PARA RCC GIDER BRIDGE,9.0,B,20.892271,92.290984,Chittagong,bridge


In [41]:
# Keep only the records with a chainage below 287 km
# (convert_data above takes ~287 km from Dhaka as the centre of Chittagong)
df = df.loc[df['km'] < 287]

In [42]:
# Renumber the remaining rows from 0; the previous labels land in an 'index' column again
df = df.reset_index()

In [43]:
# Inspect the trimmed N1 frame
df

Unnamed: 0,index,road,km,type,name,length,condition,lat,lon,zone,model_type
0,20112,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,Chittagong,bridge
1,20113,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,Chittagong,bridge
2,20114,N1,286.080,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,Chittagong,bridge
3,20115,N1,285.580,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.069930,Chittagong,bridge
4,20116,N1,284.134,Box Culvert,KATGOR CULVERT,9.2,A,22.149852,92.069633,Chittagong,bridge
...,...,...,...,...,...,...,...,...,...,...,...
260,20260,N1,10.880,Box Culvert,NOYAPARA CULVERT,6.3,A,23.694391,90.537574,Dhaka,bridge
261,5777,N1,10.543,Box Culvert,KATCHPUR BOX CULVERT,8.0,B,23.696400,90.535099,Dhaka,bridge
262,7651,N1,8.976,PC Girder Bridge,KANCHPUR PC GIRDER BRIDGE,397.0,C,23.705060,90.523214,Dhaka,bridge
263,20261,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,Dhaka,bridge


In [44]:
# Discard the old row labels stored by reset_index()
df = df.drop("index", axis='columns')

In [45]:
# Drop zone: the end-point rows added further down supply nine values and have no zone
df = df.drop("zone", axis='columns')

In [46]:
# Final column layout of the bridge rows
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
0,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,bridge
1,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,bridge
2,N1,286.08,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,bridge
3,N1,285.58,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.06993,bridge
4,N1,284.134,Box Culvert,KATGOR CULVERT,9.2,A,22.149852,92.069633,bridge


In [47]:
# Load the road points; they provide the coordinates of the two end points
df_roads = pd.read_csv('../data/roads.csv')

In [48]:
# Keep only the road points that belong to N1
df_roads = df_roads[df_roads['road'] == 'N1']

In [49]:
# First N1 road points; the road starts at chainage 0.0
df_roads.head()

Unnamed: 0,road,chainage,lrp,lat,lon,gap,type,name
0,N1,0.0,LRPS,23.706028,90.443333,,Others,Start of Road after Jatrabari Flyover infront...
1,N1,0.814,LRPSa,23.702917,90.450417,,Culvert,Box Culvert
2,N1,0.822,LRPSb,23.702778,90.450472,,CrossRoad,Intersection with Z1101
3,N1,1.0,LRP001,23.702139,90.451972,,KmPost,Km post missing
4,N1,2.0,LRP002,23.697889,90.460583,,KmPost,Km post missing


In [50]:
# One-row frame holding the first N1 road point
df_roads_0 = df_roads[0:1]

In [51]:
# Attributes of the end point at the start of the road (row label 0 of the
# road points); in this walkthrough it is labelled 'sink'
road_name = df_roads_0.loc[0, 'road']
km = df_roads_0.loc[0, 'chainage']
lrp = df_roads_0.loc[0, 'lrp']
latitude = df_roads_0.loc[0, 'lat']
longitude = df_roads_0.loc[0, 'lon']
type_of_bridge = bridge_name = 'sink'
length = 0
condition = 'A'

In [68]:
# NOTE: executed out of order (In [68]). The output below shows the frame AFTER
# both end-point rows were added by the cells further down, not the state at
# this position. Move or re-run this cell so Restart & Run All reproduces it.
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
1,N1,287.453,source,source,0.0,A,22.120805,92.071944,source
2,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,bridge
3,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,bridge
4,N1,286.08,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,bridge
5,N1,285.58,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.06993,bridge


In [67]:
# NOTE: executed out of order (In [67]); the count below was taken after the later cells had run
len(df)

267

In [52]:
# Append the end point prepared above as a new last row; type_of_bridge fills
# both the 'type' and the 'model_type' column
df.loc[len(df)] = [road_name, km, type_of_bridge, bridge_name, length, 
              condition, latitude, longitude, type_of_bridge]  # adding a row
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 

In [53]:
# Check that the new end-point row sits at the end of the frame
df.tail()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
262,N1,10.543,Box Culvert,KATCHPUR BOX CULVERT,8.0,B,23.6964,90.535099,bridge
263,N1,8.976,PC Girder Bridge,KANCHPUR PC GIRDER BRIDGE,397.0,C,23.70506,90.523214,bridge
264,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,bridge
265,N1,1.8,Box Culvert,.,11.3,A,23.698739,90.458861,bridge
266,N1,0.0,sink,sink,0.0,A,23.706028,90.443333,sink


In [54]:
# Last N1 road points; the road runs to a chainage of about 462 km
df_roads.tail()

Unnamed: 0,road,chainage,lrp,lat,lon,gap,type,name
1334,N1,461.476,LRP466a,20.86886,92.298222,,Culvert,Box culvert
1335,N1,461.904,LRP466b,20.865028,92.29825,BS,Bridge,Bridge start
1336,N1,461.946,LRP466c,20.864667,92.298194,BE,Bridge,Bridge end
1337,N1,462.124,LRP467,20.862972,92.298083,,KmPost,Infor.missing
1338,N1,462.254,LRPE,20.862917,92.298083,,Others,"End of Road at Shapla Chattar ,Teknaf Meet wit..."


In [55]:
# Keep the road points with a chainage below 288 km
# (convert_data above uses ~288 km as a point around the centre of Chittagong)
df_roads = df_roads.loc[df_roads['chainage'] < 288]

In [56]:
# One-row frame holding the last remaining road point
df_roads_last = df_roads[-1::]

In [57]:
# Renumber so the single row can be addressed with label 0
df_roads_last = df_roads_last.reset_index()

In [58]:
# The road point that becomes the second end point
df_roads_last

Unnamed: 0,index,road,chainage,lrp,lat,lon,gap,type,name
0,694,N1,287.453,LRP291b,22.120805,92.071944,,Culvert,Box culvert


In [59]:
# Top of the frame before the second end point is inserted
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
1,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,bridge
2,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,bridge
3,N1,286.08,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,bridge
4,N1,285.58,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.06993,bridge
5,N1,284.134,Box Culvert,KATGOR CULVERT,9.2,A,22.149852,92.069633,bridge


In [60]:
# Attributes of the end point at the high-chainage end of the selected stretch;
# in this walkthrough it is labelled 'source'
road_name = df_roads_last.loc[0, 'road']
km = df_roads_last.loc[0, 'chainage']
lrp = df_roads_last.loc[0, 'lrp']  # read but not used in the row that is added
latitude = df_roads_last.loc[0, 'lat']
longitude = df_roads_last.loc[0, 'lon']
type_of_bridge = 'source'
bridge_name = 'source'
length = 0
condition = 'A'

In [61]:
# Add the end point under label -1 so that sorting by label moves it in front of all bridge rows
df.loc[-1] = [road_name, km, type_of_bridge, bridge_name, length, 
              condition, latitude, longitude, type_of_bridge]

In [62]:
# The new row is not at the top yet: the frame has not been sorted by label
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
1,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,bridge
2,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,bridge
3,N1,286.08,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,bridge
4,N1,285.58,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.06993,bridge
5,N1,284.134,Box Culvert,KATGOR CULVERT,9.2,A,22.149852,92.069633,bridge


In [63]:
# Sort by row label, which moves the row labelled -1 to the top
df.sort_index(inplace=True) 

In [64]:
# Shift all labels up by one so none is negative
# NOTE: not idempotent - every re-run shifts the labels by one more
df.index = df.index + 1  

In [65]:
# Bottom of the final frame: the 'sink' row at chainage 0 closes the list
df.tail()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
263,N1,10.543,Box Culvert,KATCHPUR BOX CULVERT,8.0,B,23.6964,90.535099,bridge
264,N1,8.976,PC Girder Bridge,KANCHPUR PC GIRDER BRIDGE,397.0,C,23.70506,90.523214,bridge
265,N1,4.925,Box Culvert,.,6.6,A,23.694664,90.487775,bridge
266,N1,1.8,Box Culvert,.,11.3,A,23.698739,90.458861,bridge
267,N1,0.0,sink,sink,0.0,A,23.706028,90.443333,sink


In [66]:
# Top of the final frame: the 'source' row precedes the first bridge
df.head()

Unnamed: 0,road,km,type,name,length,condition,lat,lon,model_type
1,N1,287.453,source,source,0.0,A,22.120805,92.071944,source
2,N1,286.631,RCC Girder Bridge,NOYAKHALAR MUK,28.0,A,22.127734,92.071001,bridge
3,N1,286.378,PC Girder Bridge,PATANER PULL(BRIDGE,34.6,A,22.129916,92.070602,bridge
4,N1,286.08,Box Culvert,PATATR PULL CULVERT,15.4,A,22.132521,92.070157,bridge
5,N1,285.58,Box Culvert,MOULOVER DOKAN,12.4,A,22.136952,92.06993,bridge
