In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3

In [2]:
# Quick look at the bucket contents: gather every object key returned by a
# single list_objects_v2 call (an empty listing has no 'Contents' entry).
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket='data-bucket-properties-5a166591')
files = [item['Key'] for item in response.get('Contents', [])]
files

['LBSMv2 - Data Dictionary.xlsx',
 'LBSMv2_Barnet.csv',
 'LBSMv2_Bromley.csv',
 'LBSMv2_Croydon.csv',
 'LBSMv2_Greenwich.csv',
 'LBSMv2_Lewisham.csv',
 'LBSMv2_Southwark.csv']

In [3]:
# Only ask Glue to resolve JOB_NAME when the flag is actually present in argv.
params = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
args = getResolvedOptions(sys.argv, params)

# Spark / Glue entry points used by the rest of the notebook.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

# Use the resolved job name when there is one, otherwise a fixed default.
jobname = args['JOB_NAME'] if 'JOB_NAME' in args else "police_data_job"
job.init(jobname, args)

# Logger belonging to this Glue job.
logger = glueContext.get_logger()
logger.info(f"Job {jobname} started with args: {args}")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/14 12:15:20 WARN Job$: Job run ID police_data_job is either null or empty or its same as Job name. 
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


In [4]:
def get_files_from_s3(bucket_name, s3, logger=None):
    """Retrieve the keys of all objects in an S3 bucket.

    Args:
        bucket_name: Name of the bucket to list.
        s3: Client object exposing ``list_objects_v2`` (e.g. ``boto3.client('s3')``).
        logger: Object with an ``error`` method. When omitted, a standard-library
            logger is used so the function never fails on ``None.error``.

    Returns:
        A list of object keys. The list is empty when the bucket has no objects
        or when listing fails (the failure is logged, not raised).
    """
    if logger is None:
        import logging
        logger = logging.getLogger(__name__)

    files = []
    request = {"Bucket": bucket_name}
    try:
        # list_objects_v2 returns results in pages; keep requesting pages
        # until the service reports the listing is no longer truncated.
        while True:
            response = s3.list_objects_v2(**request)
            files.extend(item['Key'] for item in response.get('Contents', []))
            if not response.get('IsTruncated'):
                break
            request['ContinuationToken'] = response['NextContinuationToken']
    except Exception as e:
        logger.error(f"Error accessing bucket {bucket_name}: {e}")
        # Return an empty list if the bucket cannot be accessed
        return []
    return files

def partition_files_by_extension(files):
    """Split file names into CSV and XLSX groups.

    Names are matched by suffix ('.csv' or '.xlsx', case-sensitive); names
    with any other suffix are dropped. Input order is kept within each group.

    Returns:
        A ``(csv_files, xlsx_files)`` tuple of lists.
    """
    by_suffix = {'.csv': [], '.xlsx': []}
    for name in files:
        # Place the name in the first group whose suffix matches, if any.
        for suffix, group in by_suffix.items():
            if name.endswith(suffix):
                group.append(name)
                break
    return by_suffix['.csv'], by_suffix['.xlsx']

def load_to_dyf(files, glueContext, logger=None, bucket_name=None):
    """Load CSV objects from S3 into DynamicFrames, one per file.

    Args:
        files: Iterable of object keys inside the bucket.
        glueContext: Object exposing ``create_dynamic_frame.from_options``.
        logger: Object with ``info`` and ``error`` methods. When omitted, a
            standard-library logger is used.
        bucket_name: Bucket that holds ``files``. When omitted, the
            notebook-level ``s3_bucket`` variable is used, which keeps
            existing three-argument calls working.

    Returns:
        A list with one DynamicFrame per file that loaded successfully, in
        input order. Files that fail to load are logged and skipped.

    Raises:
        ValueError: If no bucket name is available, or if no file could be
            loaded at all.
    """
    if logger is None:
        import logging
        logger = logging.getLogger(__name__)

    if bucket_name is None:
        # Backward compatibility: older callers rely on the global `s3_bucket`.
        try:
            bucket_name = s3_bucket
        except NameError:
            raise ValueError(
                "bucket_name was not provided and no global 's3_bucket' is defined."
            ) from None

    dyf_list = []
    for file in files:
        try:
            dyf = glueContext.create_dynamic_frame.from_options(
                connection_type="s3",
                format="csv",
                connection_options={"paths": [f"s3://{bucket_name}/{file}"]},
                format_options={"withHeader": True}
            )
            dyf_list.append(dyf)
            logger.info(f"Loaded file {file} into DynamicFrame")
        except Exception as e:
            # Best effort: one bad file must not stop the remaining loads.
            logger.error(f"Error loading file {file}: {e}")

    if not dyf_list:
        logger.error("No files were loaded into DynamicFrames.")
        raise ValueError("No files were loaded into DynamicFrames.")
    return dyf_list

In [5]:
# Bucket to read from. `load_to_dyf` reads this notebook-level name when it
# builds S3 paths, so it must be defined before that function is called.
s3_bucket = 'data-bucket-properties-5a166591'
s3_client = boto3.client('s3')

# List the bucket's object keys and split them by file extension.
files = get_files_from_s3(s3_bucket, s3_client, logger)
csv_files, xlsx_files = partition_files_by_extension(files)

# One DynamicFrame per CSV key; raises ValueError if none could be loaded.
dyf_list = load_to_dyf(csv_files, glueContext, logger)

In [8]:
dyf_list[3].show()  # Display the fourth DynamicFrame (index 3) for verification

{"uprn": "100021003505", "os_topo_toid": "1000042426398", "easting": "544698", "northing": "173296", "postcode_locator": "SE9 2EZ", "administrative_area": "Greenwich", "oa21cd": "E00001683", "lsoa21cd": "E01000339", "lsoa21nm": "Bexley 025A", "lsoa11cd": "E01000339", "lsoa11nm": "Bexley 025A", "ward22cd": "E05014079", "ward22nm": "Eltham Town & Avery Hill", "property_type": "house", "built_form": "semi-detached", "property_type_built_form": "semi-detached house", "tenure": "owner-occupied", "tenure_known": "0", "building_use": "residential only", "construction_age_band": "1930-1949", "construction_age_band_known": "1", "epc_score": "71", "epc_score_known": "0", "epc_rating": "C", "epc_rating_known": "0", "potential_epc_score": "80", "potential_epc_score_known": "0", "potential_epc_rating": "C", "potential_epc_rating_known": "0", "number_habitable_rooms": "6", "number_habitable_rooms_known": "0", "total_floor_area": "113", "total_floor_area_known": "0", "estimated_floor_count": "2", "ba

In [None]:
# Scratch cell: three ways to pull the area name out of an S3 object path.
# Each one takes the text between the first and second "_" of the file name and
# strips ".csv"; a file name without "_" would raise IndexError at the [1] lookup.
path = "s3://data-bucket-properties-5a166591/LBSMv2_Bromley.csv"

# Method 1: Split and extract
filename = path.split('/')[-1]  # Gets "LBSMv2_Bromley.csv"
area = filename.split('_')[1].replace('.csv', '')  # Gets "Bromley"

# Method 2: One liner
area = path.split('/')[-1].split('_')[1].replace('.csv', '')

# Method 3: For all files
# NOTE(review): `df` is not defined in any cell visible here; presumably it is a
# Spark DataFrame created elsewhere. Confirm, otherwise this raises NameError on
# a fresh kernel.
# NOTE(review): this rebinds `csv_files`, which an earlier cell set to the list
# of CSV object keys; confirm no later cell depends on the earlier value.
csv_files = df.inputFiles()
areas = [f.split('/')[-1].split('_')[1].replace('.csv', '') for f in csv_files if f.endswith('.csv')]
print(areas)  # ['Southwark', 'Greenwich', 'Croydon', 'Bromley', 'Barnet', 'Lewisham']
