# Read XML files with PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col

In [None]:
# Build (or reuse) the Spark session for this notebook. The Databricks
# spark-xml reader is requested through `spark.jars.packages` as Maven
# coordinates (group:artifact:version).
# NOTE(review): the `_2.13` suffix in the artifact name presumably has to match
# the Scala version of the installed Spark build -- confirm for your environment.
spark = (
    SparkSession.builder
    .appName("Read XML in PySpark")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.13:0.14.0")
    .getOrCreate()
)

In [None]:
# Relative path of the XML input file
xml_file_path = "./data/xml/city.xml"

# Load the file through the spark-xml data source. The `rowTag` option names
# the XML element ("Placemark") that is used as the row element.
df = (
    spark.read
    .format("com.databricks.spark.xml")
    .option("rowTag", "Placemark")
    .load(xml_file_path)
)



In [None]:
# Inspect the loaded data: print the rows without truncating long cell values
# (n=20 is the default row limit, spelled out here), then print the schema.
df.show(n=20, truncate=False)
df.printSchema()

In [3]:
# `Point.coordinates` is a single comma-separated string; split it once and
# reuse the resulting array expression for each derived column.
# Items 0, 1 and 2 become longitude, latitude and altitude respectively.
# The derived columns are taken straight from the split, with no numeric cast.
coordinate_parts = split(col("Point.coordinates"), ",")

df = (
    df
    .withColumn("longitude", coordinate_parts.getItem(0))
    .withColumn("latitude", coordinate_parts.getItem(1))
    .withColumn("altitude", coordinate_parts.getItem(2))
)

# Display only the descriptive fields and the three derived coordinate columns
df.select("name", "description", "longitude", "latitude", "altitude").show()

+-------------+-------------+----------+---------+--------+
|         name|  description| longitude| latitude|altitude|
+-------------+-------------+----------+---------+--------+
|New York City|New York City|-74.006393|40.714172|       0|
+-------------+-------------+----------+---------+--------+

