## Roadmap : 
- Scraping des données sur le site de « Citya »
- Traitement et écriture des données aux formats Parquet et Delta


## Modules

In [1]:
import findspark
# Runs in the first cell, ahead of the pyspark imports below; presumably this
# makes the local Spark installation importable — TODO confirm against the
# findspark documentation.
findspark.init()

In [13]:
import os
import pyspark
from delta import *
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *
import configparser
import realstate_utils as rs


In [4]:
# Load the runtime settings from config.ini (resolved relative to the current
# working directory).
config = configparser.ConfigParser()
config.read('config.ini')

# Scraper inputs.
path = config.get('PATH', 'PATH_DATA')
url = config.get('URL', 'URL')
page = config.getint('PAGE', 'page')

# Output locations for the Parquet and Delta writers.
output_parquet = config.get('PARQUET_OUTPUT', 'parquet_output')
output_delta = config.get('PARQUET_DELTA', 'delta_output')

# Processing

In [5]:
# Build a Spark session with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Real_state")
    # Raw string: in a normal literal "\b" is the backspace escape, which would
    # silently corrupt the Windows path ("...3.3.5<BS>in" instead of "...\bin").
    .config("spark.hadoop.home", r"C:\hadoop-3.3.5\bin")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip comes from the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [7]:
# Schema of the DataFrame built from the scraped listings; every column is
# declared nullable.
schema = (
    StructType()
    .add("id", StringType(), True)
    .add("title", StringType(), True)
    .add("surface", FloatType(), True)
    .add("price", IntegerType(), True)
    .add("city", StringType(), True)
    .add("postal_code", StringType(), True)
    .add("number_pieces", IntegerType(), True)
    .add("type", StringType(), True)
    .add("description", StringType(), True)
)

In [None]:
# Run the scraper directly. Within the cells visible here `data` is not reused:
# read_realstate() performs its own call to rs.scrapping().
data = rs.scrapping(url,page)

In [15]:
# read_data function

def read_realstate(page: int, spark: SparkSession, url: str, schema: StructType):
    """Scrape the listings and load them into a Spark DataFrame.

    Args:
        page: Forwarded to ``rs.scrapping`` (presumably the number of result
            pages to scrape — confirm in ``realstate_utils``).
        spark: Active session used to create the DataFrame.
        url: Forwarded to ``rs.scrapping`` as the address to scrape.
        schema: Schema applied to the scraped rows.

    Returns:
        The DataFrame created from the scraped rows with ``schema``.
    """
    scraped_rows = rs.scrapping(url, page)
    return spark.createDataFrame(scraped_rows, schema)

In [26]:
# Select a list of columns
def select_columns(df: DataFrame, cols: list):
    """Return ``df`` restricted to the columns listed in ``cols``.

    Args:
        df: Source DataFrame.
        cols: Column names (or column expressions), unpacked into
            ``df.select`` in the given order.

    Returns:
        The DataFrame produced by ``df.select(*cols)``.
    """
    return df.select(*cols)

In [21]:
# Parquet output
def write_parquet(final_df: DataFrame, colspartitionBy: list):
    """Write ``final_df`` as Parquet, replacing any existing output.

    The destination is the module-level ``output_parquet`` path read from the
    configuration; the write uses ``overwrite`` mode.

    Args:
        final_df: DataFrame to persist.
        colspartitionBy: Column names unpacked into ``partitionBy`` to
            partition the output.
    """
    writer = final_df.write.partitionBy(*colspartitionBy).mode("overwrite")
    writer.parquet(output_parquet)

In [33]:

def write_delta(final_df: DataFrame):
    """Write ``final_df`` in Delta format, replacing any existing output.

    The destination is the module-level ``output_delta`` path read from the
    configuration; the write uses ``overwrite`` mode.
    """
    delta_writer = final_df.write.format("delta").mode("overwrite")
    delta_writer.save(output_delta)

In [30]:
# Scrape the listings and load them into a Spark DataFrame using the schema
# defined earlier in the notebook.
df = read_realstate(page,spark,url,schema)




  soup = bs(html, "html")


In [27]:
# Columns kept for the output datasets; "title" and "description" from the
# schema are left out.
col_to_select = [
    "id",
    "type",
    "city",
    "postal_code",
    "number_pieces",
    "surface",
    "price",
]
selected_columns = select_columns(df=df, cols=col_to_select)

In [31]:
# Write the selected columns as Parquet, partitioned on the "type" column.
cols_to_partitions = ["type"]
write_parquet(final_df=selected_columns, colspartitionBy=cols_to_partitions)

In [34]:
# Persist the same selected columns in Delta format.
write_delta(selected_columns)