In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os 
import pandas as pd
import os

# Object-storage credentials are read from the environment so that no secret is
# stored in the notebook. A missing variable raises KeyError at this point.
OCI_ACCESS_KEY_ID = os.environ['OCI_ACCESS_KEY_ID']
OCI_SECRET_ACCESS_KEY = os.environ['OCI_SECRET_ACCESS_KEY']
# Region and namespace are interpolated into the S3-compatible endpoint URL
# when the Spark session is configured.
OCI_REGION ='uk-london-1'
OCI_NAMESPACE = 'lrqgbz9z6zlj'
# Bucket and folder (key prefix) that hold the Parquet chunks read by Spark.
BUCKET_NAME = 'london-property-sales-price'
bucket_folder = "ppd-download-chunks/"  # note: already ends with "/"

In [3]:
# S3-compatible endpoint of the object store, derived from namespace and region.
s3a_endpoint = f'https://{OCI_NAMESPACE}.compat.objectstorage.{OCI_REGION}.oraclecloud.com'

# Create (or reuse, via getOrCreate) a local Spark session that uses every
# available core and talks to the bucket through Hadoop's S3A filesystem.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('PySparkOCIConnection')
    # Connector jars resolved by Spark at start-up.
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.506')
    # Where to connect and with which credentials.
    .config('spark.hadoop.fs.s3a.endpoint', s3a_endpoint)
    .config('spark.hadoop.fs.s3a.access.key', OCI_ACCESS_KEY_ID)
    .config('spark.hadoop.fs.s3a.secret.key', OCI_SECRET_ACCESS_KEY)
    # S3A behaviour: explicit implementation class, path-style bucket
    # addressing, and SSL on the connection.
    .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    .config('spark.hadoop.fs.s3a.path.style.access', 'true')
    .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'true')
    .getOrCreate()
)

# Glob matching every Parquet chunk under the bucket folder. bucket_folder
# already ends with "/", so its trailing slash is stripped before joining to
# avoid building a path that contains "//".
file_path = f"s3a://{BUCKET_NAME}/{bucket_folder.rstrip('/')}/*.parquet"

# Parquet files carry their own schema; the CSV-only reader options
# (header / inferSchema) do not apply here and have been dropped.
ppd = spark.read.parquet(file_path)

In [4]:
# Display the column names of the Spark DataFrame loaded from the Parquet chunks.
ppd.columns

['TRANSACTION_UNIQUE_IDENTIFIER',
 'PRICE',
 'DATE_OF_TRANSFER',
 'POSTCODE',
 'PROPERTY_TYPE',
 'OLD_NEW',
 'DURATION',
 'PAON',
 'SAON',
 'STREET',
 'LOCALITY',
 'TOWN_CITY',
 'DISTRICT',
 'COUNTY',
 'PPD_CATEGORY_TYPE',
 'RECORD_STATUS_MONTHLY_FILE_ONLY',
 '__index_level_0__']

In [6]:
# Restrict the price-paid data to the rows of a single postcode.
# NOTE(review): the column is listed as 'POSTCODE'; the lowercase reference
# relies on Spark's default case-insensitive column resolution - confirm the
# session does not enable spark.sql.caseSensitive.
target_postcode = 'SE10 8GR'
df1 = ppd.where(F.col('postcode') == target_postcode)

In [18]:
# Collect the filtered Spark DataFrame to the driver as a pandas DataFrame.
df_ppc = df1.toPandas()

In [20]:
# Display the column labels of the pandas copy of the filtered data.
df_ppc.columns

Index(['TRANSACTION_UNIQUE_IDENTIFIER', 'PRICE', 'DATE_OF_TRANSFER',
       'POSTCODE', 'PROPERTY_TYPE', 'OLD_NEW', 'DURATION', 'PAON', 'SAON',
       'STREET', 'LOCALITY', 'TOWN_CITY', 'DISTRICT', 'COUNTY',
       'PPD_CATEGORY_TYPE', 'RECORD_STATUS_MONTHLY_FILE_ONLY',
       '__index_level_0__'],
      dtype='object')

In [23]:
# Folder holding the EPC extracts, one CSV per file.
epc_folder = "epc_test/"

# os.listdir returns entries in arbitrary order and also returns directories
# and hidden entries (for example ".ipynb_checkpoints"). Keep only regular,
# non-hidden files and sort them so the concatenated row order is reproducible.
files = sorted(
    name
    for name in os.listdir(epc_folder)
    if not name.startswith(".") and os.path.isfile(os.path.join(epc_folder, name))
)
if not files:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No data files found in {epc_folder!r}")

# Read every file into its own frame, then stack them with a fresh 0..n-1 index.
dfs = [pd.read_csv(os.path.join(epc_folder, name)) for name in files]

epc = pd.concat(dfs, ignore_index=True)


In [24]:
# Show the EPC rows whose postcode contains the literal text "SE10 8G".
# na=False treats rows with a missing postcode as non-matches; without it the
# mask contains NaN and boolean indexing raises. regex=False makes the match a
# plain substring test rather than a regular-expression search.
epc[epc['postcode'].str.contains("SE10 8G", na=False, regex=False)]

Unnamed: 0,lmk-key,address1,address2,address3,postcode,building-reference-number,current-energy-rating,potential-energy-rating,current-energy-efficiency,potential-energy-efficiency,...,local-authority-label,constituency-label,posttown,construction-age-band,lodgement-datetime,tenure,fixed-lighting-outlets-count,low-energy-fixed-light-count,uprn,uprn-source
698,1504295499922016121315191628378106,Flat 13 Blossom House,Hillside Avenue,,SE10 8GB,213998478,B,B,86,86,...,Greenwich,Greenwich and Woolwich,LONDON,NO DATA!,2016-12-13 15:19:16,unknown,5.0,5.0,1.009163e+10,Address Matched
699,1504311549312016121315175099969047,Flat 1 Blossom House,Hillside Avenue,,SE10 8GB,772998478,B,B,86,86,...,Greenwich,Greenwich and Woolwich,LONDON,NO DATA!,2016-12-13 15:17:50,unknown,5.0,5.0,1.009163e+10,Address Matched
700,1504326199922016121315190418278816,Flat 16 Blossom House,Hillside Avenue,,SE10 8GB,1123998478,B,B,83,83,...,Greenwich,Greenwich and Woolwich,LONDON,NO DATA!,2016-12-13 15:19:04,unknown,5.0,5.0,1.009163e+10,Address Matched
702,1504371389412016121315190599969245,Flat 6 Blossom House,Hillside Avenue,,SE10 8GB,2592998478,B,B,85,85,...,Greenwich,Greenwich and Woolwich,LONDON,NO DATA!,2016-12-13 15:19:05,unknown,5.0,5.0,1.009163e+10,Address Matched
704,1504399899962016121315175238978336,Flat 3 Blossom House,Hillside Avenue,,SE10 8GB,3382998478,B,B,83,83,...,Greenwich,Greenwich and Woolwich,LONDON,NO DATA!,2016-12-13 15:17:52,unknown,5.0,5.0,1.009163e+10,Address Matched
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123285,a25b29005fd924a316eabc2390c6459ed08ac8d2aa44f1...,FLAT 4,"NOAKES HOUSE, 33 ROYAL HILL",,SE10 8GW,10003342738,B,B,89,89,...,Greenwich,Greenwich and Woolwich,London,2022,2022-07-20 13:10:59,Rented (private),24.0,,1.001027e+10,Address Matched
123290,c1b3a2f6f2586f6808a0861ab7ff073fd43b679fde65d3...,FLAT 5,"NOAKES HOUSE, 33 ROYAL HILL",,SE10 8GW,10003312239,B,B,88,88,...,Greenwich,Greenwich and Woolwich,London,2022,2022-07-20 13:19:21,Rented (private),23.0,,1.001027e+10,Address Matched
123291,c61d3f97ea6d3ca22b04a2553e8d2680055db5fb16f9ef...,FLAT 1,"NOAKES HOUSE, 33 ROYAL HILL",,SE10 8GW,10003299793,B,B,87,87,...,Greenwich,Greenwich and Woolwich,London,2022,2022-07-20 13:01:49,Rented (private),24.0,,1.001027e+10,Address Matched
123298,e864756c3552dcbb54fb30f0edcd0d5b506076175c92c2...,FLAT 6,"NOAKES HOUSE, 33 ROYAL HILL",,SE10 8GW,10003314955,B,B,89,89,...,Greenwich,Greenwich and Woolwich,London,2022,2022-07-20 13:20:17,Rented (private),32.0,,1.001027e+10,Address Matched


In [17]:
# Number of rows in the combined EPC frame.
len(epc)

157041