In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# --- Object storage -------------------------------------------------------
# Fixed identifiers of the bucket this notebook reads from.
OCI_REGION = 'uk-london-1'
OCI_NAMESPACE = 'lrqgbz9z6zlj'
BUCKET_NAME = 'london-property-sales-price'

# Credentials come from the environment; a missing variable raises KeyError
# right here instead of surfacing later as an authentication failure.
OCI_ACCESS_KEY_ID = os.environ['OCI_ACCESS_KEY_ID']
OCI_SECRET_ACCESS_KEY = os.environ['OCI_SECRET_ACCESS_KEY']

# --- PostgreSQL target ----------------------------------------------------
# JDBC URL plus the connection properties handed to Spark's JDBC writer.
db_url = "jdbc:postgresql://pgwarehouse:5432/london"
db_properties = dict(
    user=os.environ['POSTGRES_USER'],
    password=os.environ['POSTGRES_PASSWORD'],
    driver="org.postgresql.Driver",
)

In [3]:
# Create (or reuse) a local Spark session that can read from the object-storage
# bucket through the S3A connector and load an extra JDBC driver jar.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('PySparkOCIConnection')
    # Extra jars: hadoop-aws + AWS SDK bundle for s3a://, and a local jar
    # (presumably the PostgreSQL JDBC driver, judging by its file name).
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.506')
    .config("spark.jars", "/opt/spark/jars/postgresql-42.7.4.jar")
    # S3A settings: endpoint is derived from the namespace and region constants.
    .config('spark.hadoop.fs.s3a.endpoint', f'https://{OCI_NAMESPACE}.compat.objectstorage.{OCI_REGION}.oraclecloud.com')
    .config('spark.hadoop.fs.s3a.access.key', OCI_ACCESS_KEY_ID)
    .config('spark.hadoop.fs.s3a.secret.key', OCI_SECRET_ACCESS_KEY)
    .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    .config('spark.hadoop.fs.s3a.path.style.access', 'true')
    .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'true')
    .getOrCreate()
)

In [37]:
# Glob matching every Parquet chunk stored under chunks/ in the bucket.
file_path = f's3a://{BUCKET_NAME}/chunks/*.parquet'

# Parquet files embed their own schema, so no reader options are needed.
# (`header` / `inferSchema` are CSV-reader options and were silently ignored
# here; they have been removed to avoid implying they do anything.)
# `df` is bound exactly once, so re-running this cell always starts from the
# files on storage rather than from a previously mutated frame.
df = (
    spark.read.parquet(file_path)
    # Normalise the transfer timestamp to a plain date.
    .withColumn('DATE_OF_TRANSFER', F.col("DATE_OF_TRANSFER").cast('date'))
)

# Keep only Greater London sales from 2010-01-01 onwards that have a postcode.
df_london = df.filter(
    (F.col('COUNTY') == 'GREATER LONDON')
    & (F.col('DATE_OF_TRANSFER') >= '2010-01-01')
    & F.col("POSTCODE").isNotNull()
)

In [38]:
# Preview the filtered London frame (first 20 rows, long cell values truncated).
df_london.show(n=20, truncate=True)

+-----------------------------+-------+----------------+--------+-------------+-------+--------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------+-----------------+-------------------------------+
|TRANSACTION_UNIQUE_IDENTIFIER|  PRICE|DATE_OF_TRANSFER|POSTCODE|PROPERTY_TYPE|OLD_NEW|DURATION|                PAON|                SAON|              STREET|   LOCALITY|TOWN_CITY|            DISTRICT|        COUNTY|PPD_CATEGORY_TYPE|RECORD_STATUS_MONTHLY_FILE_ONLY|
+-----------------------------+-------+----------------+--------+-------------+-------+--------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------+-----------------+-------------------------------+
|         {2131FCF6-BEA7-86...| 165000|      2021-05-25|TW13 4JE|            F|      N|       L|                  28|                NULL|        ROSE GARDENS|       NULL|  FELTHAM|            HOU

In [None]:
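# NOTE(review): leftover pandas-style draft of the London filter. It appears to be
# superseded by the PySpark `df_london` filter and can probably be deleted.
#     df = df[
#         (df['COUNTY'] == 'GREATER LONDON')
#         &(df[df['DATE_OF_TRANSFER'].dt.date == date(2010,1,1)])
#         ]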
    

In [5]:
# Preview the unfiltered frame (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----+-----------------+----------+-------------+----------+-----------+-----------------+-------------------------------+
|TRANSACTION_UNIQUE_IDENTIFIER| PRICE|DATE_OF_TRANSFER|POSTCODE|PROPERTY_TYPE|OLD_NEW|DURATION|                PAON|SAON|           STREET|  LOCALITY|    TOWN_CITY|  DISTRICT|     COUNTY|PPD_CATEGORY_TYPE|RECORD_STATUS_MONTHLY_FILE_ONLY|
+-----------------------------+------+----------------+--------+-------------+-------+--------+--------------------+----+-----------------+----------+-------------+----------+-----------+-----------------+-------------------------------+
|         {2131FCF6-AC4A-86...|174000|2021-08-18 00:00|PO20 0FL|            S|      N|       F|                  12|NULL|      HALLEY VIEW|    SELSEY|   CHICHESTER|CHICHESTER|WEST SUSSEX|                A|                              A|
|         {2131FCF6-AC59-86...|385250|2021-01-22

In [None]:

# Total PRICE per DISTRICT, exposed under the column name `somma`.
# NOTE(review): this aggregates the full `df`, not the filtered `df_london` —
# confirm that is intended.
grouped = (
    df.groupBy('DISTRICT')
    .agg(F.sum('PRICE').alias('somma'))
)

In [None]:
# Collect the aggregated result to the driver as a pandas DataFrame so the
# notebook renders it as a table.
grouped.toPandas()

In [None]:
# Persist the aggregate to the database over JDBC; "overwrite" replaces whatever
# the target table currently holds.
grouped.write.mode("overwrite").jdbc(url=db_url, table="prova123", properties=db_properties)

In [None]:
# Shut down the Spark session created earlier in the notebook and release its resources.
spark.stop()