# Analyze French real-estate (immo) transactions

In [None]:
from sedona.spark import *
from pyspark.sql import SparkSession, DataFrame
from pathlib import Path
from pyspark.sql.functions import trim, split, expr, col, lit

In [None]:
# Resolve the project root as the parent of the current working directory;
# the jar, data and output paths in this notebook are built relative to it.
project_root_dir = Path.cwd().parent
print(project_root_dir.as_posix())

In [None]:
# Folder holding the Sedona 1.7.2 jars built for Spark 3.5.* and Scala 2.12.
jar_folder = Path(f"{project_root_dir}/jars/sedona-35-212-172")
# Keep every regular file of that folder, as string paths.
jar_list = list(map(str, filter(Path.is_file, jar_folder.iterdir())))
# The jar paths are handed to Spark as one comma-separated string.
jar_path = ",".join(jar_list)

# Build a local Spark session for Sedona 1.7.2 offline, using the local jars.
spark = (
    SparkSession.builder
    .appName("sedona_tutorial")
    .master("local[*]")
    .config("spark.jars", jar_path)
    .getOrCreate()
)

In [None]:
# Create the Sedona context from the Spark session built in the previous cell.
sedona = SedonaContext.create(spark)

In [None]:
sc = spark.sparkContext
# Set the "sedona.global.charset" system property to "utf8" so that UTF-8 is
# used as the default encoding.
sc.setSystemProperty("sedona.global.charset", "utf8")

In [None]:
# Locate the transaction dataset under <project root>/data and load it.
data_dir = project_root_dir.joinpath("data")
fr_immo_transaction_path = data_dir.joinpath("large_ds/fr_immo_transaction.parquet")
fr_immo_transactions_df = spark.read.parquet(
    fr_immo_transaction_path.as_posix()
)

In [None]:
# Columns kept for the analysis; every other column of the raw dataset is dropped.
required_col = [
    "id_transaction",
    "date_transaction",
    "prix",
    "departement",
    "ville",
    "code_postal",
    "adresse",
    "type_batiment",
    "n_pieces",
    "surface_habitable",
    "latitude",
    "longitude",
]
clean_fr_immo_df = fr_immo_transactions_df.select(*required_col)

In [None]:
# Print a preview of the selected columns.
clean_fr_immo_df.show()

In [None]:
# Print the column names and types of the trimmed DataFrame.
clean_fr_immo_df.printSchema()

In [None]:
# Replace the raw longitude/latitude columns with a single point geometry
# column named "geo_coord" (longitude is passed first, then latitude).
fr_immo_geometry_df = (
    clean_fr_immo_df
    .withColumn("geo_coord", ST_Point(col("longitude"), col("latitude")))
    .drop("longitude", "latitude")
)

In [None]:
# Print a preview to check the new geo_coord column.
fr_immo_geometry_df.show()

In [None]:
from pyspark.sql.functions import asc


def get_near_immo_transaction(geo_df: DataFrame, target_loc: str, distance: float) -> DataFrame:
    """
    Return the transactions located within a given distance of a target location.

    A ``distance_meter`` column is added to the result; it holds the value of
    ``ST_DistanceSphere`` between ``target_loc`` and each row's ``geo_coord``.

    :param geo_df: DataFrame containing a ``geo_coord`` geometry column.
    :param target_loc: Target location as a WKT string that ``ST_GeomFromWKT`` can parse.
        NOTE(review): ``geo_coord`` is built in this notebook with longitude first,
        so the WKT is presumably expected as ``POINT (<longitude> <latitude>)`` — confirm.
    :param distance: Maximum allowed value of ``distance_meter`` (inclusive).
    :return: The rows of ``geo_df`` whose ``distance_meter`` is at most ``distance``,
        ordered by ascending ``distance_meter``.
    """
    target_geom = ST_GeomFromWKT(lit(target_loc))
    return (
        geo_df
        .withColumn("distance_meter", ST_DistanceSphere(target_geom, col("geo_coord")))
        # Filter before sorting so that only the matching rows have to be ordered.
        .filter(col("distance_meter") <= distance)
        .orderBy(asc("distance_meter"))
    )

In [None]:
# Keep the transactions whose code_postal is 92120 (Montrouge) and whose
# date_transaction is later than 2021-12-31, then keep only the columns
# needed for the map.
montrouge_immo_df = (
    fr_immo_geometry_df
    .where((col("code_postal") == 92120) & (col("date_transaction") > lit("2021-12-31")))
    .select("adresse", "type_batiment", "n_pieces", "surface_habitable", "prix", "geo_coord")
)
montrouge_immo_df.count()

In [None]:
# Kepler.gl configuration for the Montrouge transaction map: one point layer
# colored by the "prix" field, and the initial camera position.
# NOTE(review): kepler.gl configs are usually wrapped as
# {"version": "v1", "config": {...}} — confirm this bare form is accepted.
map_config = {
    "visState": {
        "layers": [
            {
                "type": "point",
                "config": {
                    # Must be identical to the dataset name passed to
                    # SedonaKepler.create_map; kepler.gl attaches a layer to
                    # its data through this id.
                    "dataId": "montrouge_immo_transaction",
                    "label": "Transactions",
                    "color": [255, 0, 0],
                    "isVisible": True,
                },
                "visualChannels": {
                    "colorField": {"name": "prix", "type": "real"},
                    "colorScale": "quantile"
                }
            }
        ]
    },
    "mapState": {
        "bearing": 0,
        "latitude": 48.816,  # Starting center latitude
        "longitude": 2.313,  # Starting center longitude
        "pitch": 0,
        "zoom": 13  # Starting zoom level
    }
}

In [None]:
# To use the kepler map you must install the kepler extension: pip install apache-sedona[kepler-map]
kepler_map_path = project_root_dir / "tmp/montrouge_immo_map.html"
# Create the output folder when it is missing so the HTML export does not
# fail on a fresh checkout.
kepler_map_path.parent.mkdir(parents=True, exist_ok=True)
montrouge_immo_map = SedonaKepler.create_map(df=montrouge_immo_df, name="montrouge_immo_transaction", config=map_config)
montrouge_immo_map.save_to_html(file_name=kepler_map_path.as_posix())