# What the data looks like
```
station,date,element,value,mflag,qflag,sflag,
AE000041196,20210101,TMAX,278,,,S,
AE000041196,20210101,PRCP,0,D,,S,
AE000041196,20210101,TAVG,214,H,,S,
AEM00041194,20210101,TMAX,266,,,S,
AEM00041194,20210101,TMIN,178,,,S,
```

In [6]:
# imports
from pyspark.sql import SparkSession, functions, types, Row
import sys
assert sys.version_info >= (3, 5)
import re

# Configuration
## DataFrames
spark = SparkSession.builder.appName('Canadian wind').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
# Compare the major version as an integer: comparing the version strings
# directly is lexicographic, so e.g. '10.0' >= '3.0' would evaluate to False.
assert int(spark.version.split('.')[0]) >= 3  # make sure we have Spark 3.0+
## RDDs
sc = spark.sparkContext
assert int(sc.version.split('.')[0]) >= 3

# General variables
# Root directory that every dataset path below is built from.
data_path = "../data.nosync"

## Group the data by state, because we want to find specific regions that are suitable for refugee settlement

In [15]:
# Station metadata, with its "id" column renamed to "station" so it shares a
# join key with the observation data.
stations = (
    spark.read
    .parquet(data_path + "/ghcnd-stations-cleaned")
    .withColumnRenamed("id", "station")
)
#stations.show()

def get_data_schema():
    """Return the eight-column schema for the observation CSV files.

    Every column is a string except ``value``, which is an integer. The
    fields are listed in the order the columns are expected to appear.
    """
    columns = (
        ('station', types.StringType()),
        ('date', types.StringType()),
        ('observation', types.StringType()),
        ('value', types.IntegerType()),
        ('mflag', types.StringType()),
        ('qflag', types.StringType()),
        ('sflag', types.StringType()),
        ('obstime', types.StringType()),
    )
    fields = [types.StructField(name, dtype) for name, dtype in columns]
    return types.StructType(fields)

# Load the observations with an explicit schema (see get_data_schema()).
data = spark.read.csv(data_path + "/cluster-data", schema=get_data_schema())

# Keep only the stations whose id starts with "CA".
cleaned_data = data.where(data["station"].startswith("CA"))
#cleaned_data.show()

# Attach the station's state to every observation and keep only rows that have
# no quality flag. An empty qflag field is read as SQL NULL, so the test must
# be isNull(): the previous `qflag != "null"` compared against the *string*
# "null", which evaluates to NULL for every unflagged row and therefore kept
# only the observations that had FAILED a quality check.
# NOTE(review): assumes the goal is to discard quality-flagged observations
# (GHCN-Daily leaves qflag blank when no check failed) - confirm.
# The filter runs before the projection because qflag is not among the
# selected columns.
merged = (
    stations.join(cleaned_data, "station")
    .where(cleaned_data["qflag"].isNull())
    .select("station", "state", "date", "observation", "value")
)

# only once:
# merged.write.partitionBy("state").parquet(data_path + "/cluster-data-cleaned", mode="overwrite")

## Group the data by observation type so that I can focus on specific elements and read the data efficiently
