<a href="https://colab.research.google.com/github/nasaharvest/street2sat/blob/Duplicate-Resolution-Notebook/BaselineDuplicateResolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Duplicate Point Resolution**

**Author:** Anjali Paliyam

**Last modified:** May 26, 2024

**Description:** Generates a new file with no duplicate points at any location. Specifically, the notebook:

1. Load DataFrame with all the points
2. For each group, find the point with the max dominant crop value and cut the remaining duplicate points
3. Testing to see if function works
4. Download new csv file with no duplicates

# **1. Load DataFrame with all the points**

In [None]:
import pandas as pd
import numpy as np

#read the points CSV for this region into a DataFrame
df = pd.read_csv(filepath_or_buffer="/content/Kenya_ADM1_51331_Rift_Valley_ADM2_68770_Nandi_South.csv")
#preview the first 10 rows
df.head(n=10)

Unnamed: 0,road_pixel_centroid,is_right_hand_drive,driving_easting,driving_northing,offset_field_coord,offset_field_pixel_centroid,time_computed,level_0,index,pixel_height,...,wheat,soybean,banana,maize,sugarcane,rice,tobacco,dominant_crop,ADM1 Label,ADM2 Label
0,"(0.06929686379931158, 35.18607790040008)",False,4.521597,-11.337425,"(0.06937288903638301, 35.186239928372224)","(0.06938726229426856, 35.1862575079293)",2024-04-29 14:34:45.191143,37821,74186,2028.0,...,0.0008,0.0,0.0,0.0209,0.0306,0.0137,0.0006,sugarcane,51331 Rift Valley,68770 Nandi South
1,"(0.06929686379931158, 35.18607790040008)",False,2.316477,-5.685306,"(0.06932273387066032, 35.18626022995565)","(0.06929685545845773, 35.186257503780226)",2024-04-29 14:34:45.227847,37822,74187,2028.0,...,0.0025,0.0,0.0,0.0474,0.0325,0.0165,0.0006,maize,51331 Rift Valley,68770 Nandi South
2,"(0.0691160459464454, 35.18616769381127)",False,2.427828,-5.629996,"(0.06922420626269499, 35.18630172555941)","(0.0692064486226432, 35.18625749963655)",2024-04-29 14:34:45.263999,37824,74189,2028.0,...,0.0031,0.0,0.0,0.0464,0.005,0.0007,0.0007,maize,51331 Rift Valley,68770 Nandi South
3,"(0.06893522811517742, 35.18625748723798)",False,2.617138,-5.607863,"(0.06902087429655933, 35.18639405549634)","(0.06902562664214476, 35.186437094722905)",2024-04-29 14:34:45.407774,37828,74193,2028.0,...,0.0001,0.0,0.0,0.0,0.0578,0.0001,0.0005,sugarcane,51331 Rift Valley,68770 Nandi South
4,"(0.06893522811517742, 35.18625748723798)",False,5.18973,-11.215727,"(0.06896963868252365, 35.18641740269945)","(0.06893521981721151, 35.18643709059513)",2024-04-29 14:34:45.445508,37829,74194,2028.0,...,0.0,0.0,0.0,0.0567,0.0284,0.0056,0.0003,maize,51331 Rift Valley,68770 Nandi South
5,"(0.06866400347511832, 35.18634727656888)",False,2.550321,-5.629992,"(0.06871421616870438, 35.18653240402525)","(0.06875440202898968, 35.186526884025916)",2024-04-29 14:34:45.481269,37831,74199,2028.0,...,0.0007,0.0,0.003,0.1117,0.0158,0.0007,0.0006,maize,51331 Rift Valley,68770 Nandi South
6,"(0.06857359664472397, 35.18634727246289)",False,5.100646,-11.271039,"(0.06866315550734126, 35.18655533134099)","(0.06866399520949183, 35.1865268799142)",2024-04-29 14:34:45.517679,37832,74200,2028.0,...,0.0009,0.0,0.0031,0.0841,0.0067,0.0083,0.0004,maize,51331 Rift Valley,68770 Nandi South
7,"(0.06857359251744141, 35.1864370741381)",False,2.784195,-6.183034,"(0.06860694767666932, 35.186580469351796)","(0.06857358426237058, 35.186616677472244)",2024-04-29 14:34:45.555004,37833,74201,2028.0,...,0.0047,0.0,0.0,0.0397,0.0103,0.0004,0.0008,maize,51331 Rift Valley,68770 Nandi South
8,"(0.06694623742783337, 35.187155414335436)",False,1.948932,-4.313749,"(0.06702955323365417, 35.18730497718011)","(0.06703663613674617, 35.18733502158744)",2024-04-29 14:34:45.773288,37839,74207,2028.0,...,0.0015,0.0001,0.0,0.0754,0.0505,0.0491,0.0007,maize,51331 Rift Valley,68770 Nandi South
9,"(0.06676541984511845, 35.18724520795469)",False,1.692789,-3.804949,"(0.06687360477691176, 35.18737389947737)","(0.06685582259566593, 35.18733501357203)",2024-04-29 14:34:45.809881,37843,74211,2028.0,...,0.001,0.0,0.0,0.0183,0.0973,0.0486,0.0019,sugarcane,51331 Rift Valley,68770 Nandi South


# **2. For each group, find max dominant crop and cut remaining duplicate points**


In [None]:
#takes in a group and returns only one point with max dominant crop
def get_max_dominant_crop(group):
  """Return the one row of ``group`` with the highest dominant-crop score.

  A row's score is the value stored in the column named by that row's
  'dominant_crop' entry (e.g. the 'maize' column when dominant_crop is
  'maize').

  Args:
    group: DataFrame of candidate points, e.g. one group produced by
      ``df.groupby('road_pixel_centroid')``. It must contain a
      'dominant_crop' column plus one column per crop name appearing in it.

  Returns:
    A Series holding the winning row. On ties the first such row is kept.
  """
  #per-row lookup: read the column that the row itself names as dominant
  scores = group.apply(lambda row: row[row['dominant_crop']], axis=1)
  #select by position rather than by index label, so a group whose index
  #contains repeated labels still yields exactly one row
  best_pos = scores.reset_index(drop=True).idxmax()
  return group.iloc[best_pos]

#collapse every set of points sharing a 'road_pixel_centroid' to the single row
#chosen by get_max_dominant_crop, then renumber the rows from 0
#NOTE(review): newer pandas releases warn when groupby.apply operates on the
#grouping column - confirm behaviour against the pandas version in use
result_df = (
    df.groupby(by='road_pixel_centroid')
    .apply(get_max_dominant_crop)
    .reset_index(drop=True)
)

In [None]:
#preview the first 10 rows of the de-duplicated DataFrame
result_df.head(n=10)

Unnamed: 0,road_pixel_centroid,is_right_hand_drive,driving_easting,driving_northing,offset_field_coord,offset_field_pixel_centroid,time_computed,level_0,index,pixel_height,...,wheat,soybean,banana,maize,sugarcane,rice,tobacco,dominant_crop,ADM1 Label,ADM2 Label
0,"(-0.0023053409641171736, 35.1977504793553)",False,1.737172,-4.335991,"(-0.002229239568338882, 35.197938919165075)","(-0.0022149345498938054, 35.19793008105916)",2024-04-29 14:35:01.557856,38443,75157,2028.0,...,0.0026,0.0,0.0,0.1128,0.0,0.0,0.0005,maize,51331 Rift Valley,68770 Nandi South
1,"(-0.0023957469535462977, 35.197840280419314)",False,1.959883,-4.68995,"(-0.002317167126122153, 35.19797531408235)","(-0.002305340533835253, 35.19801988210687)",2024-04-29 14:35:01.627557,38445,75159,2028.0,...,0.0004,0.0004,0.0,0.103,0.0012,0.0057,0.0008,maize,51331 Rift Valley,68770 Nandi South
2,"(-0.0024861530863961256, 35.19784028056615)",False,3.919766,-9.579003,"(-0.002442306483961031, 35.198026123172326)","(-0.0024861527776351812, 35.19801988239513)",2024-04-29 14:35:01.699337,38447,75162,2028.0,...,0.0002,0.0076,0.0,0.1194,0.0033,0.0007,0.0009,maize,51331 Rift Valley,68770 Nandi South
3,"(-0.0025765590593939563, 35.19793008163563)",False,3.574561,-9.037004,"(-0.002525978066034616, 35.19805901117434)","(-0.0024861527776351812, 35.19801988239513)",2024-04-29 14:35:01.778963,38449,75164,2028.0,...,0.001,0.0,0.0,0.108,0.0003,0.0001,0.0008,maize,51331 Rift Valley,68770 Nandi South
4,"(-0.002576559219245821, 35.19784028071841)",False,1.803986,-4.524033,"(-0.002484712285213816, 35.19804292742836)","(-0.0024861527776351812, 35.19801988239513)",2024-04-29 14:35:01.735338,38448,75163,2028.0,...,0.0,0.0263,0.0,0.1099,0.0003,0.0033,0.0007,maize,51331 Rift Valley,68770 Nandi South
5,"(-0.00266696518676866, 35.19793008179334)",False,3.329568,-9.147616,"(-0.0026133409152853085, 35.19809066970037)","(-0.0025765587396687963, 35.19810968345375)",2024-04-29 14:35:01.857092,38451,75166,2028.0,...,0.0,0.0,0.0,0.1207,0.0,0.0,0.0001,maize,51331 Rift Valley,68770 Nandi South
6,"(-0.0027573713141432208, 35.19793008195649)",False,1.803984,-5.066032,"(-0.002660328819326541, 35.198107294550475)","(-0.002666964856093125, 35.19810968361148)",2024-04-29 14:35:01.894317,38452,75167,2028.0,...,0.0,0.0,0.0,0.1337,0.0,0.0,0.0,maize,51331 Rift Valley,68770 Nandi South
7,"(-0.002847777265233398, 35.198019883036864)",False,1.714893,-4.446605,"(-0.0028179222860846968, 35.198166171469175)","(-0.0028477769126414856, 35.19819948484417)",2024-04-29 14:35:02.040593,38456,75171,2028.0,...,0.0,0.0,0.0,0.1227,0.0,0.0,0.0001,maize,51331 Rift Valley,68770 Nandi South
8,"(-0.0029381833871325812, 35.198019883210904)",False,3.418653,-8.904271,"(-0.0028584764727759824, 35.19818156862387)","(-0.0028477769126414856, 35.19819948484417)",2024-04-29 14:35:02.076703,38457,75172,2028.0,...,0.0,0.0,0.0,0.1333,0.0,0.0,0.0001,maize,51331 Rift Valley,68770 Nandi South
9,"(-0.0030285895090316093, 35.19801988339039)",False,1.904202,-4.999665,"(-0.0029851285225996177, 35.198229440353536)","(-0.0030285891345380542, 35.19819948519769)",2024-04-29 14:35:02.186343,38460,75175,2028.0,...,0.0,0.0,0.0,0.1401,0.0,0.0,0.0001,maize,51331 Rift Valley,68770 Nandi South


# **3. Testing to see if function works**

In [None]:
#build the same grouping used above so individual groups can be inspected
group_df = df.groupby(by='road_pixel_centroid')
#number of distinct 'road_pixel_centroid' groups
len(group_df.groups)

522

In [None]:
#show every point in the group at position 6 as a test case; the row with
#index 303 has the largest dominant-crop value (maize = 0.1337)
group_df.get_group(list(group_df.groups.keys())[6])

Unnamed: 0,road_pixel_centroid,is_right_hand_drive,driving_easting,driving_northing,offset_field_coord,offset_field_pixel_centroid,time_computed,level_0,index,pixel_height,...,wheat,soybean,banana,maize,sugarcane,rice,tobacco,dominant_crop,ADM1 Label,ADM2 Label
303,"(-0.0027573713141432208, 35.19793008195649)",False,1.803984,-5.066032,"(-0.002660328819326541, 35.198107294550475)","(-0.002666964856093125, 35.19810968361148)",2024-04-29 14:35:01.894317,38452,75167,2028.0,...,0.0,0.0,0.0,0.1337,0.0,0.0,0.0,maize,51331 Rift Valley,68770 Nandi South
304,"(-0.0027573713141432208, 35.19793008195649)",False,3.273892,-9.09231,"(-0.002696128542821182, 35.1981202810106)","(-0.002666964856093125, 35.19810968361148)",2024-04-29 14:35:01.929958,38453,75168,2028.0,...,0.0,0.0,0.0,0.1316,0.0,0.0,0.0001,maize,51331 Rift Valley,68770 Nandi South
305,"(-0.0027573713141432208, 35.19793008195649)",False,1.681489,-4.479788,"(-0.0027343447310994927, 35.198134546885335)","(-0.0027573709725173105, 35.19810968377463)",2024-04-29 14:35:01.966301,38454,75169,2028.0,...,0.0004,0.0,0.0084,0.102,0.0001,0.0,0.0016,maize,51331 Rift Valley,68770 Nandi South


In [None]:
#run the selector on the group at position 6 only and inspect the row it returns
get_max_dominant_crop(
    group_df.get_group(list(group_df.groups.keys())[6])
)

road_pixel_centroid                  (-0.0027573713141432208, 35.19793008195649)
is_right_hand_drive                                                        False
driving_easting                                                         1.803984
driving_northing                                                       -5.066032
offset_field_coord                   (-0.002660328819326541, 35.198107294550475)
offset_field_pixel_centroid           (-0.002666964856093125, 35.19810968361148)
time_computed                                         2024-04-29 14:35:01.894317
level_0                                                                    38452
index                                                                      75167
pixel_height                                                              2028.0
focal_length                                                                 3.0
time                                                   2021-07-16 12:41:42+00:00
is_crop                     

In [None]:
#confirm the row chosen for the group at position 6 has the expected maize value 0.1337
print(
    get_max_dominant_crop(
        group_df.get_group(list(group_df.groups.keys())[6])
    )['maize'] == 0.1337
)

True


# **4. Download new csv file with no duplicates**

In [None]:
#row count of the de-duplicated DataFrame; it should equal the number of groups
#printed in section 3
result_df.loc[:, 'road_pixel_centroid'].shape

(522,)

In [None]:
#write the de-duplicated points to a CSV file in the working directory;
#index=False keeps the DataFrame's row numbers out of the file, so no extra
#unnamed index column is added to the saved data
result_df.to_csv('Kenya_ADM1_51331_Rift_Valley_ADM2_68770_Nandi_South_No_Duplicates.csv', index=False)

File should appear in 'Files' tab on left
1. Click on the 3 dots
2. Click on 'Download'