In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio

# Keep only the junctions that sit on a basin cell of the basin raster and
# write them to "pennines_merged_JN.csv".

# Basin selection. None keeps every junction whose raster cell value is >= 1
# (all basins); set it to one raster value to keep only that basin.
BASIN_ID = None

# Load junction data
junction_df = pd.read_csv("pennines_JN.csv")

# Load basin raster (.bil file)
with rasterio.open("pennines_AllBasins.bil") as basin_raster:
    basin_transform = basin_raster.transform
    basin_data = basin_raster.read(1)

# Convert junction points to GeoDataFrame (no reprojection needed).
# The x/y columns pair with latitude ~55 / longitude ~-2.6 in the junction
# table, which corresponds to WGS 84 / UTM zone 30N, i.e. EPSG:32630.
# NOTE(review): confirm this matches the CRS stored with the raster.
junction_gdf = gpd.GeoDataFrame(
    junction_df,
    geometry=gpd.points_from_xy(junction_df["x"], junction_df["y"]),
    crs="EPSG:32630",
)

# Map every junction to its raster (row, col) in a single call instead of one
# Python-level call per point.
xs = junction_df["x"].to_numpy()
ys = junction_df["y"].to_numpy()
if len(xs):
    rows, cols = rasterio.transform.rowcol(basin_transform, xs, ys)
else:
    # Nothing to look up; keep the arrays empty so the mask below is empty too.
    rows, cols = [], []
# rowcol may hand back lists or arrays depending on the rasterio version.
rows = np.atleast_1d(np.asarray(rows, dtype=int))
cols = np.atleast_1d(np.asarray(cols, dtype=int))

# Points outside the raster extent are dropped before indexing so that
# negative indices cannot wrap around to the opposite edge of the array.
n_rows, n_cols = basin_data.shape
in_bounds = (rows >= 0) & (rows < n_rows) & (cols >= 0) & (cols < n_cols)
basin_values = basin_data[rows[in_bounds], cols[in_bounds]]

in_basin = np.zeros(len(junction_df), dtype=bool)
if BASIN_ID is None:
    in_basin[in_bounds] = basin_values >= 1
else:
    in_basin[in_bounds] = basin_values == BASIN_ID

# Convert filtered points back to a plain DataFrame (geometry column included)
filtered_df = pd.DataFrame(junction_gdf.loc[in_basin]).reset_index(drop=True)

# Save to CSV
filtered_df.drop(columns="geometry").to_csv("pennines_merged_JN.csv", index=False)


In [2]:
# import packages
import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

In [3]:
def _load_lonlat_points(csv_path, missing_header):
    """Read a CSV, print its per-column missing-value counts, and build points.

    Parameters
    ----------
    csv_path : str
        CSV file that has ``longitude`` and ``latitude`` columns.
    missing_header : str
        Line printed immediately before the missing-value counts.

    Returns
    -------
    tuple
        ``(table, points)``: the DataFrame as read, and a GeoDataFrame built
        from it with point geometry tagged as EPSG:4326.
    """
    table = pd.read_csv(csv_path)
    print(missing_header)
    print(table.isna().sum())
    lonlat = gpd.points_from_xy(table["longitude"], table["latitude"])
    points = gpd.GeoDataFrame(table, geometry=lonlat, crs="EPSG:4326")
    return table, points


# m-chi data: table plus lon/lat point geometry
dfA, gdfA = _load_lonlat_points("pennines_MChiSegmented.csv", "Missing values in dfA:")

# junction data written by the basin-filter step: table plus lon/lat point geometry
dfB, gdfB = _load_lonlat_points("pennines_merged_JN.csv", "Missing values in dfB:")

Missing values in dfA:
node                   0
row                    0
col                    0
latitude               0
longitude              0
chi                    0
elevation              0
flow_distance          0
drainage_area          0
m_chi                  0
b_chi                  0
source_key             0
basin_key              0
segmented_elevation    0
dtype: int64
Missing values in dfB:
junction              0
node                  0
x                     0
y                     0
latitude              0
longitude             0
 stream_order         0
 receiver_junction    0
dtype: int64


In [4]:
# Preview the first rows of the m-chi GeoDataFrame (attributes plus point geometry).
gdfA.head()

Unnamed: 0,node,row,col,latitude,longitude,chi,elevation,flow_distance,drainage_area,m_chi,b_chi,source_key,basin_key,segmented_elevation,geometry
0,206321938,9973,8232,54.718234,-2.382409,18.453,518.32,103780.0,1310500.0,24.347,70.657,0,0,519.93,POINT (-2.38241 54.71823)
1,206298618,9972,8233,54.718278,-2.382331,18.44,517.99,103770.0,1357500.0,24.248,72.408,0,0,519.55,POINT (-2.38233 54.71828)
2,206275299,9971,8233,54.718323,-2.38233,18.432,517.98,103770.0,1360100.0,24.266,72.101,0,0,519.36,POINT (-2.38233 54.71832)
3,206251982,9970,8234,54.718368,-2.382252,18.419,517.98,103760.0,1365200.0,24.588,66.347,0,0,519.23,POINT (-2.38225 54.71837)
4,206228665,9969,8234,54.718413,-2.382251,18.411,517.83,103750.0,1367500.0,24.182,73.627,0,0,518.84,POINT (-2.38225 54.71841)


In [5]:
# Preview the first rows of the junction GeoDataFrame (attributes plus point geometry).
gdfB.head()

Unnamed: 0,junction,node,x,y,latitude,longitude,stream_order,receiver_junction,geometry
0,303,17035626,525157.5,6108312.5,55.120866,-2.605533,1,304,POINT (-2.60553 55.12087)
1,304,20323241,525907.5,6107467.5,55.113235,-2.59385,2,305,POINT (-2.59385 55.11323)
2,305,26518767,526587.5,6105882.5,55.098956,-2.583339,3,306,POINT (-2.58334 55.09896)
3,306,27836136,526842.5,6105547.5,55.095932,-2.579374,3,307,POINT (-2.57937 55.09593)
4,307,27285279,527212.5,6105687.5,55.09717,-2.573563,3,308,POINT (-2.57356 55.09717)


In [6]:
# function for combining datasets
def ckdnearest(gdA, gdB):
    """Join every row of ``gdA`` to its nearest point in ``gdB``.

    A KD-tree is built over the point coordinates of ``gdB`` and queried once
    for all points of ``gdA``.

    Parameters
    ----------
    gdA : GeoDataFrame
        Left-hand points; the result has one row per row of ``gdA``.
    gdB : GeoDataFrame
        Candidate neighbours. Its ``geometry`` column is dropped from the
        result; all other columns are appended to the matching ``gdA`` row.

    Returns
    -------
    DataFrame
        ``gdA`` (index reset), followed by the non-geometry columns of the
        nearest ``gdB`` row, followed by a ``dist`` column.

    Raises
    ------
    ValueError
        If ``gdB`` has no rows, since no nearest neighbour can exist.

    Notes
    -----
    * ``dist`` is the straight-line distance in the units of the geometry
      coordinates (degrees when the points are longitude/latitude).
    * Columns are concatenated without renaming, so a column name present in
      both inputs appears twice in the result.
    """
    if len(gdB) == 0:
        raise ValueError("ckdnearest: gdB is empty, so there is no nearest neighbour to match")

    # A plain comprehension avoids the warning emitted when calling .apply on
    # the geometry column; reshape keeps the array (n, 2) even when n == 0.
    nA = np.array([(pt.x, pt.y) for pt in gdA.geometry], dtype=float).reshape(-1, 2)
    nB = np.array([(pt.x, pt.y) for pt in gdB.geometry], dtype=float).reshape(-1, 2)

    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)

    # idx holds positional row numbers into gdB, hence iloc; the index is reset
    # so the columns line up row-by-row with gdA in the concat below.
    gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
    gdf = pd.concat(
        [
            gdA.reset_index(drop=True),
            gdB_nearest,
            pd.Series(dist, name='dist')
        ],
        axis=1)

    return gdf

In [7]:
# merging the two files
# Argument order matters: the junctions (gdfB) are passed as the left-hand
# frame, so the result has one row per junction, joined to the attributes of
# the nearest m-chi node from gdfA. `dist` is in degrees because both
# geometries were built from longitude/latitude.
merged_gdp = ckdnearest(gdfB, gdfA)
merged_gdp.head()

  result = super().apply(func, convert_dtype=convert_dtype, args=args, **kwargs)
  result = super().apply(func, convert_dtype=convert_dtype, args=args, **kwargs)


Unnamed: 0,junction,node,x,y,latitude,longitude,stream_order,receiver_junction,geometry,node.1,...,chi,elevation,flow_distance,drainage_area,m_chi,b_chi,source_key,basin_key,segmented_elevation,dist
0,303,17035626,525157.5,6108312.5,55.120866,-2.605533,1,304,POINT (-2.60553 55.12087),17035626,...,24.568,399.89,99482.0,1250100.0,26.869,-260.48,414,6,399.65,0.0
1,304,20323241,525907.5,6107467.5,55.113235,-2.59385,2,305,POINT (-2.59385 55.11323),20323241,...,22.53,348.99,98110.0,3974000.0,20.083,-103.11,414,6,349.38,0.0
2,305,26518767,526587.5,6105882.5,55.098956,-2.583339,3,306,POINT (-2.58334 55.09896),26518767,...,19.887,291.32,95472.0,12434000.0,10.91,74.384,413,6,291.36,0.0
3,306,27836136,526842.5,6105547.5,55.095932,-2.579374,3,307,POINT (-2.57937 55.09593),27836136,...,19.533,287.32,94919.0,13839000.0,10.898,74.628,413,6,287.5,0.0
4,307,27285279,527212.5,6105687.5,55.09717,-2.573563,3,308,POINT (-2.57356 55.09717),27285279,...,19.122,283.25,94246.0,16520000.0,10.898,74.638,413,6,283.03,0.0


In [8]:
# Write the merged junction / m-chi table to CSV. The row index is written as
# the first (unnamed) column because index=False is not passed here.
merged_gdp.to_csv("pennines_merged_JN_mchi.csv")