## Cluster managed
Download and install libraries

In [None]:
# Install the Python packages into the kernel's environment.
# findspark and pyspark are imported by the "Connect to the cluster" cell.
%pip install findspark
%pip install pyspark

# BigQuery Python client library.
# NOTE(review): this package is not the Spark BigQuery connector. The
# spark.read.format("bigquery") call further down presumably relies on the
# connector being provided by the cluster itself — confirm. Nothing in the
# visible cells imports google.cloud.bigquery.
%pip install google-cloud-bigquery

In [None]:
# Google Cloud settings shared by the cells below.
# PROJECT_ID is the first component of the fully qualified BigQuery table name.
PROJECT_ID: str = "adcz-adoki-poc"
# LOCATION_ID is passed as the "location" option of the BigQuery read.
LOCATION_ID: str = "europe-west1"

# Echo the active settings so the notebook output records which target was used.
print(
    f"Project '{PROJECT_ID}' on a Location '{LOCATION_ID}'"
)

## Connect to the cluster

In [None]:
# Connect to the cluster and create (or reuse) the Spark session.
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, col  # used by the filter in the join cell

# Let findspark locate the Spark installation and configure the environment.
findspark.init()

# getOrCreate() returns an already-running session if one exists,
# so re-running this cell does not start a second application.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("demo-wrkbnch-cluster")
    .getOrCreate()
)

# Optional: uncomment to set the Spark log level to INFO.
# spark.sparkContext.setLogLevel("INFO")

# Print the application name and the master, one per line.
print(spark.sparkContext.appName, spark.sparkContext.master, sep="\n")

## Get data from GCS

In [None]:
# Prepare variables
file_location = "gs://adastra-demo-real-estate/Properties list/dbx_property.csv"

# Read the CSV from GCS: the first row is the header and column types are inferred.
df_gcs = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_location)
)

# Alias under the name the join cell uses (same DataFrame object, no copy).
df_property = df_gcs

# Preview the first 5 rows.
df_property.show(5)

## Get data from BigQuery

In [None]:
# Prepare variables
dataset_id = "subs_property_dataset"  # alternative dataset: "demo_real_estate"
table_id = "property_type"

# Fully qualified table reference: <project>.<dataset>.<table>
bq_table = f"{PROJECT_ID}.{dataset_id}.{table_id}"

# Read the table through the "bigquery" Spark data source.
bq_reader = spark.read.format("bigquery")
df_bq_table = bq_reader.options(table=bq_table, location=LOCATION_ID).load()

# Alias under the name the join cell uses (same DataFrame object, no copy).
df_property_type = df_bq_table

# Preview the first 10 rows.
df_property_type.show(10)

### Join tables together

In [None]:
# Inner-join the property types with the properties on their shared key column.
joined_df = df_property_type.join(
    df_property,
    on="property_type_id",
    how="inner",
)

# Keep only rows whose type name equals "byt", compared case-insensitively.
filtered_df = joined_df.where(lower(col("type_name")) == "byt")

# Show the selected columns of the matching rows.
(
    filtered_df
    .select("type_name", "property_name", "source_url")
    .show()
)