# GeoJSON 데이터 로드 ( 제대로 읽어지지 않는 방법과 제대로 읽는 방법 )

https://medium.com/@sabman/loading-geojson-data-in-apache-spark-f7a52390cdc9   

https://databricks.com/blog/2019/12/05/processing-geospatial-data-at-scale-with-databricks.html

## 제대로 읽어지지 않는다
https://medium.com/@sabman/loading-geojson-data-in-apache-spark-f7a52390cdc9

import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .appName("GeoJson")
  .getOrCreate()

In [2]:
val raw = spark.read.json("hdfs://namenode:8020/taxidata/nyc-borough-boundaries-polygon.geojson")
raw.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- @id: string (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |-- type: string (nullable = true)



raw: org.apache.spark.sql.DataFrame = [_corrupt_record: string, geometry: struct<coordinates: array<array<array<double>>>, type: string> ... 3 more fields]


In [4]:
raw.show

+--------------------+--------------------+----+--------------------+-------+
|     _corrupt_record|            geometry|  id|          properties|   type|
+--------------------+--------------------+----+--------------------+-------+
|                   {|                null|null|                null|   null|
|"type": "FeatureC...|                null|null|                null|   null|
|       "features": [|                null|null|                null|   null|
|                null|[[[[-74.050508064...|   0|[http://nyc.pedia...|Feature|
|                   ,|                null|null|                null|   null|
|                null|[[[[-74.053140368...|   1|[http://nyc.pedia...|Feature|
|                   ,|                null|null|                null|   null|
|                null|[[[[-74.159456024...|   2|[http://nyc.pedia...|Feature|
|                   ,|                null|null|                null|   null|
|                null|[[[[-74.082212729...|   3|[http://nyc.pedi

파일이 깨져서 로드 되어서 약간 필터링이 필요하다.

In [7]:
val filteredDF = raw.drop("_corrupt_record").na.drop()
filteredDF.show()

+--------------------+---+--------------------+-------+
|            geometry| id|          properties|   type|
+--------------------+---+--------------------+-------+
|[[[[-74.050508064...|  0|[http://nyc.pedia...|Feature|
|[[[[-74.053140368...|  1|[http://nyc.pedia...|Feature|
|[[[[-74.159456024...|  2|[http://nyc.pedia...|Feature|
|[[[[-74.082212729...|  3|[http://nyc.pedia...|Feature|
|[[[[-73.836682741...|  4|[http://nyc.pedia...|Feature|
|[[[[-73.813396652...|  5|[http://nyc.pedia...|Feature|
|[[[[-73.827182821...|  6|[http://nyc.pedia...|Feature|
|[[[[-73.826074726...|  7|[http://nyc.pedia...|Feature|
|[[[[-73.832442073...|  8|[http://nyc.pedia...|Feature|
|[[[[-73.794201726...|  9|[http://nyc.pedia...|Feature|
|[[[[-73.805097201...| 10|[http://nyc.pedia...|Feature|
|[[[[-73.804991988...| 11|[http://nyc.pedia...|Feature|
|[[[[-73.739558564...| 12|[http://nyc.pedia...|Feature|
|[[[[-73.739443808...| 13|[http://nyc.pedia...|Feature|
|[[[[-73.790549485...| 14|[http://nyc.pedia...|F

filteredDF: org.apache.spark.sql.DataFrame = [geometry: struct<coordinates: array<array<array<double>>>, type: string>, id: bigint ... 2 more fields]


In [10]:
val properties = filteredDF.select(col("properties.@id"), col("properties.borough"), col("properties.boroughCode"))
properties.show(false)

+---------------------------------------------------------+-------------+-----------+
|@id                                                      |borough      |boroughCode|
+---------------------------------------------------------+-------------+-----------+
|http://nyc.pediacities.com/Resource/Borough/Staten_Island|Staten Island|5          |
|http://nyc.pediacities.com/Resource/Borough/Staten_Island|Staten Island|5          |
|http://nyc.pediacities.com/Resource/Borough/Staten_Island|Staten Island|5          |
|http://nyc.pediacities.com/Resource/Borough/Staten_Island|Staten Island|5          |
|http://nyc.pediacities.com/Resource/Borough/Queens       |Queens       |4          |
|http://nyc.pediacities.com/Resource/Borough/Queens       |Queens       |4          |
|http://nyc.pediacities.com/Resource/Borough/Queens       |Queens       |4          |
|http://nyc.pediacities.com/Resource/Borough/Queens       |Queens       |4          |
|http://nyc.pediacities.com/Resource/Borough/Queens   

properties: org.apache.spark.sql.DataFrame = [@id: string, borough: string ... 1 more field]


## 제대로 읽어 보자 GeoJson
https://databricks.com/blog/2019/12/05/processing-geospatial-data-at-scale-with-databricks.html

기본적으로 스파크는 JSON 파일의 모든 레코드가 한 라인으로 완벽하다고 가정한다. 그렇지 않은 데이터에 대해 멀티라인이라는 옵션을 줄 수가 있다.
~~~scala 
option("multiline", "false") // Default
~~~

In [11]:
val raw_json_df = spark.read.option("multiline", "true").json("hdfs://namenode:8020/taxidata/nyc-borough-boundaries-polygon.geojson")

raw_json_df: org.apache.spark.sql.DataFrame = [features: array<struct<geometry:struct<coordinates:array<array<array<double>>>,type:string>,id:bigint,properties:struct<@id:string,borough:string,boroughCode:bigint>,type:string>>, type: string]


In [12]:
raw_json_df.printSchema()

root
 |-- features: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- geometry: struct (nullable = true)
 |    |    |    |-- coordinates: array (nullable = true)
 |    |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |    |    |-- element: double (containsNull = true)
 |    |    |    |-- type: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- properties: struct (nullable = true)
 |    |    |    |-- @id: string (nullable = true)
 |    |    |    |-- borough: string (nullable = true)
 |    |    |    |-- boroughCode: long (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



In [22]:
import spark.implicits._

import spark.implicits._


In [45]:
import org.apache.spark.sql.functions.explode

val explode_properties = raw_json_df.withColumn("properties", explode(col("features.properties")))
val explode_geometry = explode_properties.withColumn("geometry", explode(col("features.geometry")))

import org.apache.spark.sql.functions.explode
explode_properties: org.apache.spark.sql.DataFrame = [features: array<struct<geometry:struct<coordinates:array<array<array<double>>>,type:string>,id:bigint,properties:struct<@id:string,borough:string,boroughCode:bigint>,type:string>>, type: string ... 1 more field]
explode_geometry: org.apache.spark.sql.DataFrame = [features: array<struct<geometry:struct<coordinates:array<array<array<double>>>,type:string>,id:bigint,properties:struct<@id:string,borough:string,boroughCode:bigint>,type:string>>, type: string ... 2 more fields]


In [50]:
val parsed = explode_geometry.drop(col("features"))
parsed.printSchema()

root
 |-- type: string (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- @id: string (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)



parsed: org.apache.spark.sql.DataFrame = [type: string, properties: struct<@id: string, borough: string ... 1 more field> ... 1 more field]


In [52]:
parsed.filter(col("properties.@id") === "http://nyc.pediacities.com/Resource/Borough/Staten_Island").count()

res18: Long = 416


In [62]:
raw_json_df.select("features", explode(col("features")))

<console>: 42: error: overloaded method value select with alternatives: