In [1]:
import pandas as pd
import os

def split_csv(file_path, output_dir, chunk_size):
    """Split a CSV file into smaller chunks."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read CSV in chunks
    chunk_iter = pd.read_csv(file_path, chunksize=chunk_size)
    for i, chunk in enumerate(chunk_iter):
        chunk_file = os.path.join(output_dir, f"chunk_{i}.csv")
        chunk.to_csv(chunk_file, index=False)
        print(f"Created {chunk_file}")

if __name__ == "__main__":
    input_file = "pennines_CN.csv"
    output_directory = "chunks"
    chunk_size = 100000

    split_csv(input_file, output_directory, chunk_size)


Created chunks/chunk_0.csv
Created chunks/chunk_1.csv
Created chunks/chunk_2.csv
Created chunks/chunk_3.csv
Created chunks/chunk_4.csv
Created chunks/chunk_5.csv
Created chunks/chunk_6.csv
Created chunks/chunk_7.csv
Created chunks/chunk_8.csv
Created chunks/chunk_9.csv
Created chunks/chunk_10.csv
Created chunks/chunk_11.csv
Created chunks/chunk_12.csv
Created chunks/chunk_13.csv


In [21]:
import pandas as pd
import numpy as np
from osgeo import gdal
import rasterio
from pyproj import Transformer

def extract_raster_values(channel_network, raster_path, from_crs="EPSG:4326", to_crs="EPSG:32630"):
    """Assign raster values to channel network data points after reprojecting coordinates."""
    
    # Initialize the CRS transformer
    transformer = Transformer.from_crs(from_crs, to_crs, always_xy=True)
    
    # Open the raster file
    with rasterio.open(raster_path) as src:
        # Reproject coordinates to the raster CRS
        lons = np.array(channel_network['longitude'])
        lats = np.array(channel_network['latitude'])
        reprojected_coords = np.array([transformer.transform(lon, lat) for lon, lat in zip(lons, lats)])
        
        # Convert reprojected coordinates to raster indices
        rows, cols = src.index(reprojected_coords[:, 0], reprojected_coords[:, 1])
        
        # Read the raster values at these locations
        raster_values = np.empty(len(rows))
        for i, (row, col) in enumerate(zip(rows, cols)):
            try:
                raster_values[i] = src.read(1)[row, col]
            except IndexError:
                raster_values[i] = np.nan
        
    # Add the raster values to the channel network
    channel_network['rocktype_id'] = raster_values
    return channel_network

# Paths to the raster file and channel network CSV
raster_file = "merged_bedrock.bil"
channel_network_csv = "chunk_13.csv"

# Load the channel network
channel_network = pd.read_csv(channel_network_csv)

# Process the data
channel_network_updated = extract_raster_values(channel_network, raster_file)

# Save the updated channel network
channel_network_updated.to_csv('processed_chunks_13.csv', index=False)
print("Processing complete. File saved.")


Processing complete. File saved.


In [22]:
import pandas as pd
import glob

# List all CSV files with the specified prefix
csv_files = glob.glob('processed_chunks_*.csv')

# Read the first CSV file to set the column names
first_file = pd.read_csv(csv_files[0])
column_names = first_file.columns.tolist()

# Initialize an empty DataFrame
merged_data = pd.DataFrame(columns=column_names)

# Loop through the CSV files and concatenate them into the merged DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    merged_data = pd.concat([merged_data, df], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_data.to_csv('pennines_CN_with_rocktype.csv', index=False)


  merged_data = pd.concat([merged_data, df], ignore_index=True)


In [26]:
import pandas as pd
from collections import Counter

# Load the channel network and junction data
channel_network = pd.read_csv('pennines_CN_with_rocktype.csv')  # Your channel network file
junctions = pd.read_csv('pennines_sin_grad_filtered.csv')  # Your junction file

# Merge channel network with junctions to find the segments between junctions
merged_data = pd.merge(channel_network, junctions, 
                       left_on='receiver_JI', right_on='receiver_junction', 
                       suffixes=('_segment', '_junction'))

# Group by 'Junction Index' and 'receiver_JI' to find segments
segments = merged_data.groupby(['Junction Index', 'receiver_JI']).agg(list).reset_index()

# Function to count rocktype along a channel segment
def count_rocktype(segment, network_data):
    rocktype_counts = Counter(segment['rocktype_id'])
    most_common_rocktype_id = rocktype_counts.most_common(1)[0][0]
    return most_common_rocktype_id

# Apply the function to each segment
segments['most_common_rocktype_id'] = segments.apply(count_rocktype, axis=1, network_data=channel_network)

# Map the most common rocktype_id back to the junctions
junctions = junctions.merge(segments[['Junction Index', 'most_common_rocktype_id']], 
                            left_on='junction', right_on='Junction Index', 
                            how='left')

# Drop duplicate 'Junction Index' entries
junctions = junctions.drop_duplicates(subset=['junction'])

# Save the updated junctions data with assigned rocktype_id
junctions.to_csv('pennines_sin_grad_geol_fil.csv', index=False)
