# Rearrange the columns of conflux outputs

Conflux outputs its columns in an unexpected order. This notebook reorders them to 'feature_id', 'water', 'wet', 'bs', 'pv', 'npv', 'pc_missing', 'date', and removes 'norm_pv', 'norm_npv', 'norm_bs'. This is specific to the ANAE MDB data request.

TO DO: add concurrency for next time (see append_new_results.ipynb)

In [None]:
import boto3
import pandas as pd
from io import StringIO
from tqdm import tqdm

In [None]:
# Temporary AWS credentials used to build the S3 client below.
# They are blank here: fill them in before running, and clear them again
# before saving or sharing the notebook.
temp_aws_access_key_id = ""
temp_aws_secret_access_key = ""
temp_aws_session_token = ""

In [None]:
# Create the S3 client from the temporary credentials defined above.
s3_credentials = {
    'aws_access_key_id': temp_aws_access_key_id,
    'aws_secret_access_key': temp_aws_secret_access_key,
    'aws_session_token': temp_aws_session_token,
}
s3 = boto3.client('s3', **s3_credentials)

In [None]:
# S3 locations: the CSVs are read from under source_prefix and the rearranged
# copies are written under destination_prefix, both in the same bucket.
bucket = "dea-public-data-dev"
source_prefix = (
    "projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/"
    "polygon_base_result/"
)
destination_prefix = (
    "projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/"
    "polygon_base_result_rearranged_cols/"
)

In [None]:
# Column layout of the rearranged CSVs. The processing cells select exactly
# these columns, in this order, so anything not listed is left out.
new_column_order = [
    'feature_id',
    'water',
    'wet',
    'bs',
    'pv',
    'npv',
    'pc_missing',
    'date',
]

In [None]:
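# Use this cell when there are more than 1,000 objects under the source prefix
# (list_objects_v2 returns at most 1,000 keys per call, so pagination is needed).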

In [None]:
# List every CSV under the source prefix.
# list_objects_v2 returns at most 1,000 keys per call, so the boto3 paginator
# is used to walk all result pages instead of tracking continuation tokens
# by hand.
csv_files = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix):
    # 'Contents' is absent when a page holds no objects.
    csv_files.extend(
        obj['Key'] for obj in page.get('Contents', []) if obj['Key'].endswith('.csv')
    )

# Process the CSVs one at a time: download, trim and reorder columns, upload.
for key in tqdm(csv_files, desc="Processing CSV files"):
    # download CSV content
    csv_obj = s3.get_object(Bucket=bucket, Key=key)
    body = csv_obj['Body'].read().decode('utf-8')

    # read CSV into DataFrame
    df = pd.read_csv(StringIO(body))

    # drop the unwanted columns (ignored if a file does not have them)
    df = df.drop(columns=['norm_pv', 'norm_npv', 'norm_bs'], errors='ignore')

    # reorder columns; raises KeyError if an expected column is missing
    df = df[new_column_order]

    # Swap only the leading prefix. Every listed key starts with source_prefix,
    # and str.replace would also rewrite any later occurrence inside the key.
    new_key = destination_prefix + key[len(source_prefix):]

    # to_csv() without a target returns the CSV text; encode it explicitly so
    # the uploaded bytes are UTF-8, matching how the source was decoded.
    csv_text = df.to_csv(index=False)
    s3.put_object(Bucket=bucket, Key=new_key, Body=csv_text.encode('utf-8'))

In [None]:
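# Use this cell only if there are 1,000 or fewer objects under the source prefix
# (a single list_objects_v2 call returns at most 1,000 keys).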

In [None]:
# List CSVs with a single request. list_objects_v2 returns at most 1,000 keys
# per call, so this cell only ever sees the first page of results.
response = s3.list_objects_v2(Bucket=bucket, Prefix=source_prefix)
if response.get('IsTruncated'):
    # Do not skip files silently: tell the user the listing was cut short.
    print(
        "WARNING: the listing was truncated; only the first page of objects "
        "will be processed. Use the paginated cell to process everything."
    )
csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

for key in tqdm(csv_files, desc="Processing CSV files"):
    # download CSV content
    csv_obj = s3.get_object(Bucket=bucket, Key=key)
    body = csv_obj['Body'].read().decode('utf-8')

    # read CSV into DataFrame
    df = pd.read_csv(StringIO(body))

    # Reorder columns. Selecting only the listed columns also discards every
    # other column; raises KeyError if an expected column is missing.
    df = df[new_column_order]

    # Swap only the leading prefix. Every listed key starts with source_prefix,
    # and str.replace would also rewrite any later occurrence inside the key.
    new_key = destination_prefix + key[len(source_prefix):]

    # to_csv() without a target returns the CSV text; encode it explicitly so
    # the uploaded bytes are UTF-8, matching how the source was decoded.
    csv_text = df.to_csv(index=False)
    s3.put_object(Bucket=bucket, Key=new_key, Body=csv_text.encode('utf-8'))

### Quick count check

In [None]:
bucket = 'dea-public-data-dev'
prefix = 'projects/WIT/MDBA_ANAE_WIT_MH_18_04_2025/polygon_base_result_rearranged_cols/'

# Tally every object under the prefix, following continuation tokens until
# S3 reports that the listing is no longer truncated.
file_count = 0
list_kwargs = {'Bucket': bucket, 'Prefix': prefix}
while True:
    page = s3.list_objects_v2(**list_kwargs)

    # 'Contents' is absent when a page holds no objects.
    file_count += len(page.get('Contents', []))

    if not page.get('IsTruncated'):
        break

    # Ask for the next page on the following iteration.
    list_kwargs['ContinuationToken'] = page.get('NextContinuationToken')

print(f"Total number of files: {file_count}")