In [1]:
import geopandas as gpd
import pandas as pd

# 1. Read data
path = "C:/Users/engs2461/Documents/Git_project/transport-critical-minerals/Incoming data/ports"
df_ports_shp = gpd.read_file(f"{path}/AFR_Infra_Transport_Ports.shp/AFR_Infra_Transport_Ports.shp")
df_africa_ports = gpd.read_file(f"{path}/africa_ports.gpkg", layer='nodes')
df_corridor = gpd.read_file(f"{path}/AfricanDevelopmentCorridorDatabase2022.gpkg", layer='point')
corridor_port_data = df_corridor[df_corridor["Infrastructure_development_type"] == "Port"]
africa_ports=df_africa_ports[df_africa_ports["infra"]=="port"]
corridor_port_data = corridor_port_data.copy()
corridor_port_data.loc[:, 'FeatureUID'] = corridor_port_data['Project_code'].str[:3] + corridor_port_data['Project_code'].str[-2:]
# 2. Set CRS'EPSG:4326'（WGS84）
if df_ports_shp.crs is None:
    df_ports_shp.set_crs("EPSG:4326", inplace=True)

# 3.  proj
proj_crs = "EPSG:3395"  # World Mercator projection
df_ports_shp_proj = df_ports_shp.to_crs(proj_crs)


# 2. Convert to a projected CRS before performing buffer operations
proj_crs = "EPSG:3395"  # World Mercator projection
df_ports_shp_proj = df_ports_shp.to_crs(proj_crs) #USGS
df_africa_ports_proj = africa_ports.to_crs(proj_crs)
df_corridor_proj = corridor_port_data.to_crs(proj_crs)

# 3. Set buffer
buffer_distance = 0.03 * 111000  # 2.2km in meters
df_ports_shp_proj['geometry'] = df_ports_shp_proj.buffer(buffer_distance)
df_africa_ports_proj['geometry'] = df_africa_ports_proj.buffer(buffer_distance)
df_corridor_proj['geometry'] = df_corridor_proj.buffer(buffer_distance)

In [5]:
# 5. Spatial join between the datasets
merged_shp_gpkg = gpd.sjoin(df_ports_shp_proj, df_africa_ports_proj, how="inner", predicate="intersects",lsuffix='_gpkg', rsuffix='_africa')
merged_shp_corridor = gpd.sjoin(df_africa_ports_proj, df_corridor_proj, how="inner", predicate="intersects",lsuffix='_gpkg', rsuffix='_corridor')
merged_three = gpd.sjoin(merged_shp_gpkg, merged_shp_corridor, how="inner", predicate="intersects")
# Check for duplicates in merged_shp_gpkg
duplicates_gpkg = merged_shp_gpkg[merged_shp_gpkg.duplicated(subset='geometry', keep=False)]
print("Duplicated geometries in merged_shp_gpkg:", len(duplicates_gpkg))

# Check for duplicates in merged_shp_corridor
duplicates_corridor = merged_shp_corridor[merged_shp_corridor.duplicated(subset='geometry', keep=False)]
print("Duplicated geometries in merged_shp_corridor:", len(duplicates_corridor))
merged_shp_gpkg.drop_duplicates(subset='geometry', inplace=True)
merged_three = gpd.sjoin(merged_shp_gpkg, merged_shp_corridor, how="inner", predicate="intersects")

Duplicated geometries in merged_shp_gpkg: 167
Duplicated geometries in merged_shp_corridor: 0


In [9]:
print(merged_three.shape)

(27, 65)


In [10]:
# Merge all data
merged_all = pd.concat([merged_shp_gpkg, merged_shp_corridor,merged_three,df_ports_shp_proj, df_africa_ports_proj,df_corridor_proj])
# 2. Drop duplicates. Assuming 'geometry' is the column to identify duplicates:
merged_all_unique = merged_all.drop_duplicates(subset='geometry', keep='first')

In [12]:
merged_all = merged_all_unique.copy()
merged_all['geometry'] = merged_all['geometry'].centroid

In [13]:
path_to_shp = r"C:\Users\engs2461\Documents\Git_project\transport-critical-minerals\Incoming data\ports\ne_110m_admin_0_countries\ne_110m_admin_0_countries.shp"
world = gpd.read_file(path_to_shp)
# Reprojecting 'merged_all' to "EPSG:4326"
merged_all = merged_all.to_crs("EPSG:4326")

# Dropping 'index_left' and 'index_right' if they exist in either dataframe
for df in [merged_all, world]:
    for col in ['index_left', 'index_right']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

# Performing the spatial join
merged_with_iso = gpd.sjoin(merged_all, world[['geometry', 'ISO_A3']], how="left", predicate='within')

In [14]:
output_gpkg_path = r"C:\Users\engs2461\Documents\Git_project\transport-critical-minerals\Processed data\merged_with_iso2.gpkg"
merged_with_iso.to_file(output_gpkg_path, driver="GPKG")

In [15]:
# When saving as a CSV, the geometry will be converted to a text representation (WKT format by default).
output_csv_path = r"C:\Users\engs2461\Documents\Git_project\transport-critical-minerals\Processed data\merged_with_iso2.csv"
merged_with_iso.to_csv(output_csv_path, index=False)

In [16]:
df_africa_ports_proj = df_africa_ports_proj.to_crs(merged_all.crs)
intersected_with_gpkg = gpd.sjoin(merged_all, df_africa_ports_proj, how="inner", predicate="intersects", lsuffix="_left", rsuffix="_right")

# Extract points in the `merged_all` that did not intersect with `df_africa_ports_proj`
non_intersected_from_merged = merged_all.loc[
    ~merged_all.index.isin(intersected_with_gpkg.index)
]

# Save these points as separate files
non_intersected_from_merged.to_file("non_intersected_from_merged.gpkg", driver="GPKG")

print(f"Total entries in non_intersected_from_merged: {len(non_intersected_from_merged)}")

Total entries in non_intersected_from_merged: 35


In [17]:
output_csv_path = r"C:\Users\engs2461\Documents\Git_project\transport-critical-minerals\Processed data\non_intersected_from_merged2.csv"
non_intersected_from_merged.to_csv(output_csv_path, index=False)