In [3]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3

In [34]:
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket='data-bucket-properties-5a166591')
files = []
if 'Contents' in response:
        for item in response['Contents']:
            files.append(item['Key'])
files

['LBSMv2 - Data Dictionary.xlsx',
 'LBSMv2_Barnet.csv',
 'LBSMv2_Bromley.csv',
 'LBSMv2_Croydon.csv',
 'LBSMv2_Greenwich.csv',
 'LBSMv2_Lewisham.csv',
 'LBSMv2_Southwark.csv']

In [8]:
s3_bucket = "s3://data-bucket-properties-5a166591"

In [7]:
params = []
if '--JOB_NAME' in sys.argv:
    params.append('JOB_NAME')
args = getResolvedOptions(sys.argv, params)

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

if 'JOB_NAME' in args:
    jobname = args['JOB_NAME']
else:
    jobname = "police_data_job"
job.init(jobname, args)

#get logger for this glue job
logger = glueContext.get_logger()
logger.info(f"Job {jobname} started with args: {args}")


In [None]:
# list filef from s3 bucket
df = spark.read.option("basePath", f"{s3_bucket}").csv(f"{s3_bucket}/*")
df.inputFiles()  # Returns array of file paths

                                                                                

['s3://data-bucket-properties-5a166591/LBSMv2_Southwark.csv',
 's3://data-bucket-properties-5a166591/LBSMv2_Greenwich.csv',
 's3://data-bucket-properties-5a166591/LBSMv2%20-%20Data%20Dictionary.xlsx',
 's3://data-bucket-properties-5a166591/LBSMv2_Croydon.csv',
 's3://data-bucket-properties-5a166591/LBSMv2_Bromley.csv',
 's3://data-bucket-properties-5a166591/LBSMv2_Barnet.csv',
 's3://data-bucket-properties-5a166591/LBSMv2_Lewisham.csv']

In [30]:
csv_data = [f for f in df.inputFiles() if f.endswith('.csv')]
dyf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="csv",
    connection_options={
        "paths": csv_data,
        "recurse": True
    },
    format_options={
        "withHeader": True,
        "separator": ","
    },
    transformation_ctx="data-bucket-properties"
)
dyf.printSchema()


ERROR:root:KeyboardInterrupt while sending command.                (3 + 9) / 12]
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
dyf.show()

{"uprn": "200003466691", "os_topo_toid": "1000041669857", "easting": "535702", "northing": "175834", "postcode_locator": "SE4 2ED", "administrative_area": "Southwark", "oa21cd": "E00016898", "lsoa21cd": "E01003342", "lsoa21nm": "Lewisham 010C", "lsoa11cd": "E01003342", "lsoa11nm": "Lewisham 010C", "ward22cd": "E05011108", "ward22nm": "Nunhead & Queen's Road", "property_type": "house", "built_form": "end-terrace", "property_type_built_form": "end-terraced house", "tenure": "owner-occupied", "tenure_known": "1", "building_use": "residential only", "construction_age_band": "1900-1929", "construction_age_band_known": "1", "epc_score": "53", "epc_score_known": "1", "epc_rating": "E", "epc_rating_known": "1", "potential_epc_score": "77", "potential_epc_score_known": "1", "potential_epc_rating": "C", "potential_epc_rating_known": "1", "number_habitable_rooms": "4", "number_habitable_rooms_known": "1", "total_floor_area": "99", "total_floor_area_known": "1", "estimated_floor_count": "2", "base

                                                                                

                                                                                

In [None]:
path = "s3://data-bucket-properties-5a166591/LBSMv2_Bromley.csv"

# Method 1: Split and extract
filename = path.split('/')[-1]  # Gets "LBSMv2_Bromley.csv"
area = filename.split('_')[1].replace('.csv', '')  # Gets "Bromley"

# Method 2: One liner
area = path.split('/')[-1].split('_')[1].replace('.csv', '')

# Method 3: For all files
csv_files = df.inputFiles()
areas = [f.split('/')[-1].split('_')[1].replace('.csv', '') for f in csv_files if f.endswith('.csv')]
print(areas)  # ['Southwark', 'Greenwich', 'Croydon', 'Bromley', 'Barnet', 'Lewisham']
