# Part 1: Getting to know the data

In [21]:
# imports
from pyspark.sql import SparkSession, functions, types, Row
import sys
assert sys.version_info >= (3, 5)
import re

# Configuration
## DataFrames
spark = SparkSession.builder.appName('Canadian wind').getOrCreate()
spark.sparkContext.setLogLevel('WARN')
assert spark.version >= '3.0'  # make sure we have Spark 3.0+
## RDDs
sc = spark.sparkContext
assert sc.version >= '3.0'

In [22]:
# General variables
data_path = "data.nosync"

## Metadata
### Stations
See [documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
IV. FORMAT OF "ghcnd-stations.txt"

In [23]:
def parse_line(line):
    id = '(\S+)'
    latitude = '([-+]?(?:\d*\.\d+|\d+))'
    longitude = '([-+]?(?:\d*\.\d+|\d+))'
    elevation = '([-+]?(?:\d*\.\d+|\d+))'
    state = '([-a-zA-Z0-9_][-a-zA-Z0-9_])'
    name = '((\S+\s)+)'
    delimiter = '\s+'
    any = ".*"
    line_re = re.compile(r'^'+ id + delimiter + latitude + delimiter + longitude + delimiter + elevation + delimiter + state + delimiter + name + any + '$')
    splitted_line = re.match(line_re, line)
    return Row(splitted_line.group(1), float(splitted_line.group(2)), float(splitted_line.group(3)), float(splitted_line.group(4)), splitted_line.group(5), splitted_line.group(6))

In [24]:
def stations_schema():
    return types.StructType([
        types.StructField("id", types.StringType()),
        types.StructField("latitude", types.FloatType()),
        types.StructField("longitude", types.FloatType()),
        types.StructField("elevation", types.FloatType()),
        types.StructField("state", types.StringType()),
        types.StructField("name", types.StringType()),
        #types.StructField("gsn_flag", types.StringType()),
        #types.StructField("crn_flag", types.StringType()),
        #types.StructField("wmo_id", types.StringType()),
    ])

In [25]:
stations_input = sc.textFile(data_path + "/ghcnd-stations.txt")
formatted_lines = stations_input.filter(lambda line: line.startswith("CA")).map(parse_line)
cleaned_stations = spark.createDataFrame(data=formatted_lines, schema = stations_schema())
cleaned_stations.show()

+-----------+--------+---------+---------+-----+--------------------+
|         id|latitude|longitude|elevation|state|                name|
+-----------+--------+---------+---------+-----+--------------------+
|CA001010066| 48.8667|-123.2833|      4.0|   BC|        ACTIVE PASS |
|CA001010235|    48.4|-123.4833|     17.0|   BC|        ALBERT HEAD |
|CA001010595| 48.5833|-123.5167|     85.0|   BC|BAMBERTON OCEAN C...|
|CA001010720|    48.5|   -124.0|    351.0|   BC|         BEAR CREEK |
|CA001010774|    48.5|  -123.35|     61.0|   BC|        BEAVER LAKE |
|CA001010780| 48.3333|-123.6333|     12.0|   BC|         BECHER BAY |
|CA001010960|    48.6|-123.4667|     38.0|   BC|    BRENTWOOD BAY 2 |
|CA001010961| 48.5667|  -123.45|     31.0|   BC|BRENTWOOD CLARKE ...|
|CA001010965| 48.5667|-123.4333|     91.0|   BC|BRENTWOOD W SAANI...|
|CA001011467| 48.5833|-123.4167|     53.0|   BC|CENTRAL SAANICH V...|
|CA0010114F6| 48.5667|   -123.4|     38.0|   BC|CENTRAL SAANICH I...|
|CA0010114FF|   48.5

In [26]:
cleaned_stations.write.parquet(data_path + "/ghcnd-stations-cleaned", mode="overwrite")

### Countries
See [documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
V. FORMAT OF "ghcnd-countries.txt"

In [27]:
def parse_line(line):
    code = '([A-Z][A-Z])\s'
    name = '(([,A-Za-z\[\]\(\)]+\s*)+)'
    line_re = re.compile(r'^'+ code + name + '\s*$')
    splitted_line = re.match(line_re, line)
    return Row(splitted_line.group(1), splitted_line.group(2))

In [28]:
def countries_schema():
    return types.StructType([
        types.StructField("code", types.StringType()),
        types.StructField("name", types.StringType()),
    ])

In [29]:
countries_input = sc.textFile(data_path + "/ghcnd-countries.txt")
formatted_lines = countries_input.map(parse_line)
cleaned_countries = spark.createDataFrame(data=formatted_lines, schema = countries_schema())
cleaned_countries.show()

+----+--------------------+
|code|                name|
+----+--------------------+
|  AC|Antigua and Barbuda |
|  AE|United Arab Emira...|
|  AF|         Afghanistan|
|  AG|            Algeria |
|  AJ|         Azerbaijan |
|  AL|             Albania|
|  AM|            Armenia |
|  AO|             Angola |
|  AQ|American Samoa [U...|
|  AR|          Argentina |
|  AS|          Australia |
|  AU|            Austria |
|  AY|         Antarctica |
|  BA|            Bahrain |
|  BB|           Barbados |
|  BC|           Botswana |
|  BD|Bermuda [United K...|
|  BE|            Belgium |
|  BF|       Bahamas, The |
|  BG|          Bangladesh|
+----+--------------------+
only showing top 20 rows



### States
See [documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
VI. FORMAT OF "ghcnd-states.txt"

In [30]:
def parse_line(line):
    code = '([A-Z][A-Z])\s'
    name = '(([A-Z]+\s*)+)'
    line_re = re.compile(r'^'+ code + name + '\s*$')
    splitted_line = re.match(line_re, line)
    return Row(splitted_line.group(1), splitted_line.group(2))

In [31]:
def states_schema():
    return types.StructType([
        types.StructField("code", types.StringType()),
        types.StructField("name", types.StringType()),
    ])

In [32]:
states_input = sc.textFile(data_path + "/ghcnd-states.txt")
formatted_lines = states_input.map(parse_line)
cleaned_states = spark.createDataFrame(data=formatted_lines, schema = states_schema())
cleaned_states.show()

+----+--------------------+
|code|                name|
+----+--------------------+
|  AB|             ALBERTA|
|  AK|              ALASKA|
|  AL|             ALABAMA|
|  AR|            ARKANSAS|
|  AS|      AMERICAN SAMOA|
|  AZ|             ARIZONA|
|  BC|    BRITISH COLUMBIA|
|  CA|          CALIFORNIA|
|  CO|            COLORADO|
|  CT|         CONNECTICUT|
|  DC|DISTRICT OF COLUMBIA|
|  DE|            DELAWARE|
|  FL|             FLORIDA|
|  FM|          MICRONESIA|
|  GA|             GEORGIA|
|  GU|                GUAM|
|  HI|              HAWAII|
|  IA|                IOWA|
|  ID|               IDAHO|
|  IL|            ILLINOIS|
+----+--------------------+
only showing top 20 rows



### Inventory
See [documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
VII. FORMAT OF "ghcnd-inventory.txt"

In [33]:
# Cleanup
def toDF(data):
    splitted_data = data.split(" ")
    while "" in splitted_data:
        splitted_data.remove("")

    return Row(splitted_data[0], float(splitted_data[1]), float(splitted_data[2]), splitted_data[3], int(splitted_data[4]), int(splitted_data[5]))

In [34]:
def inventory_schema():
    return types.StructType([
        types.StructField("id", types.StringType()),
        types.StructField("latitude", types.FloatType()),
        types.StructField("longitude", types.FloatType()),
        types.StructField("element", types.StringType()),
        types.StructField("first_year", types.IntegerType()),
        types.StructField("last_year", types.IntegerType()),
    ])

In [35]:
inventory_input = sc.textFile(data_path + "/ghcnd-inventory.txt")
formatted_lines = inventory_input.filter(lambda line: line.startswith("CA")).map(toDF)
cleaned_inventory = spark.createDataFrame(data=formatted_lines, schema = inventory_schema())
cleaned_inventory.show()

[Stage 13:>                                                         (0 + 1) / 1]

22/11/07 11:58:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 13 (TID 16): Attempting to kill Python Worker
+-----------+--------+---------+-------+----------+---------+
|         id|latitude|longitude|element|first_year|last_year|
+-----------+--------+---------+-------+----------+---------+
|CA001010066| 48.8667|-123.2833|   PRCP|      1984|     1996|
|CA001010066| 48.8667|-123.2833|   SNOW|      1984|     1996|
|CA001010066| 48.8667|-123.2833|   SNWD|      1984|     1996|
|CA001010066| 48.8667|-123.2833|   MDPR|      1984|     1996|
|CA001010066| 48.8667|-123.2833|   MDSF|      1984|     1990|
|CA001010235|    48.4|-123.4833|   TMAX|      1976|     1978|
|CA001010235|    48.4|-123.4833|   TMIN|      1976|     1978|
|CA001010235|    48.4|-123.4833|   PRCP|      1971|     1995|
|CA001010235|    48.4|-123.4833|   SNOW|      1971|     1995|
|CA001010235|    48.4|-123.4833|   SNWD|      1991|     1995|
|CA001010235|    48.4|-123.4833|   MDPR|      1971|     19

                                                                                

In [36]:
cleaned_inventory.write.parquet(data_path + "/ghcnd-inventory-cleaned", mode="overwrite")

                                                                                

## Data
### Daily summaries latest


In [37]:
def daily_summaries_schema():
    return types.StructType([
        types.StructField("station", types.StringType()),
        types.StructField("date", types.StringType()),
        types.StructField("latitude", types.StringType()),
        types.StructField("longtitude", types.StringType()),
        types.StructField("elevation", types.StringType()),
        types.StructField("name", types.StringType()),
        types.StructField("prcp", types.StringType()),
        types.StructField("prcp_attributes", types.StringType()),
        types.StructField("snow", types.StringType()),
        types.StructField("snow_attributes", types.StringType()),
        types.StructField("snwd", types.StringType()),
        types.StructField("snwd_attributes", types.StringType()),
        types.StructField("dapr", types.StringType()),
        types.StructField("dapr_attributes", types.StringType()),
        types.StructField("mdpr", types.StringType()),
        types.StructField("mdpr_attributes", types.StringType()),
        types.StructField("wesd", types.StringType()),
        types.StructField("wesd_attributes", types.StringType()),
    ])
# show the data for the station CA1AB000001
daily_summaries_data = spark.read.csv(data_path + "/ghcnd-daily-summaries-latest-canada/CA1AB000001.csv", sep=",", header=True)
daily_summaries_data.show()

+-----------+----------+---------+-----------+---------+--------------------+-----+---------------+-----+---------------+-----+---------------+----+---------------+----+---------------+----+---------------+
|    STATION|      DATE| LATITUDE|  LONGITUDE|ELEVATION|                NAME| PRCP|PRCP_ATTRIBUTES| SNOW|SNOW_ATTRIBUTES| SNWD|SNWD_ATTRIBUTES|DAPR|DAPR_ATTRIBUTES|MDPR|MDPR_ATTRIBUTES|WESD|WESD_ATTRIBUTES|
+-----------+----------+---------+-----------+---------+--------------------+-----+---------------+-----+---------------+-----+---------------+----+---------------+----+---------------+----+---------------+
|CA1AB000001|2014-07-04|53.606907|-113.561926|    686.7|EDMONTON 9.1 NNW,...|    0|            ,,N|    0|            ,,N|    0|            ,,N|null|           null|null|           null|null|           null|
|CA1AB000001|2014-07-05|53.606907|-113.561926|    686.7|EDMONTON 9.1 NNW,...|  241|            ,,N|    0|            ,,N|    0|            ,,N|null|           null|null|   

### GHCND-all
See [documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
III. FORMAT OF DATA FILES (".dly" FILES)

In [84]:
def parse_line(line):
    station = line[:11]
    year = line[11:15]
    month = line[15:17]
    element = line[17:21]
    start_index=21
    data = [station, year, month, element]
    for i in range(31):
        value_i = line[start_index:start_index+5]
        mflag_i = line[start_index+5:start_index+6]
        qflag_i = line[start_index+6:start_index+7]
        sflag_i = line[start_index+7:start_index+8]
        start_index=start_index+8
        data.append(int(value_i))
        data.append(mflag_i)
        data.append(qflag_i)
        data.append(sflag_i)

    return data

def ghcnd_all_columns():
    columns = ["station", "year", "month", "element"]
    for i in range(31):
        value_i = "value" + str(i+1)
        mflag_i = "mflag" + str(i+1)
        qflag_i = "qflag" + str(i+1)
        sflag_i = "sflag" + str(i+1)
        columns.append(value_i)
        columns.append(mflag_i)
        columns.append(qflag_i)
        columns.append(sflag_i)

    return columns

In [87]:
ghcnd_all_input = sc.textFile(data_path + "/ghcnd-all-canada/CA1AB000001.dly")
formatted_lines = ghcnd_all_input.map(parse_line)
cleaned_ghcnd_all = formatted_lines.toDF(ghcnd_all_columns())
cleaned_ghcnd_all.show()

Py4JJavaError: An error occurred while calling o1462.partitions.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/Users/ninjeanne/Documents/mpcs/datastorm/data.nosync/ghcnd-all-canada/CA1AB000001.dly
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:304)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:244)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:332)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:208)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:292)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:292)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:288)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.io.IOException: Input path does not exist: file:/Users/ninjeanne/Documents/mpcs/datastorm/data.nosync/ghcnd-all-canada/CA1AB000001.dly
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:278)
	... 25 more


In [86]:
cleaned_ghcnd_all.write.partitionBy("station").parquet(data_path + "/ghcnd-all-canada-cleaned", mode="overwrite")

22/11/07 12:36:58 ERROR Executor: Exception in task 1.0 in stage 45.0 (TID 50)
java.io.FileNotFoundException: File file:/Users/ninjeanne/Documents/mpcs/datastorm/data.nosync/ghcnd-all-canada/CA1AB000001.dly does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:160)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:372)
	at org.apache.hadoop.fs.ChecksumFileSystem.lambda$openFileWithOptions$0(ChecksumFileSystem.java:896)
	at org.apache.hadoop.util.LambdaUtils.eval(LambdaUtils.java:52)
	at org.apache.hadoop.fs.ChecksumFileSystem.openFileWithOptions

Py4JJavaError: An error occurred while calling o1445.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:638)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:793)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 45.0 failed 1 times, most recent failure: Lost task 1.0 in stage 45.0 (TID 50) (d142-058-087-241.wireless.sfu.ca executor driver): java.io.FileNotFoundException: File file:/Users/ninjeanne/Documents/mpcs/datastorm/data.nosync/ghcnd-all-canada/CA1AB000001.dly does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:160)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:372)
	at org.apache.hadoop.fs.ChecksumFileSystem.lambda$openFileWithOptions$0(ChecksumFileSystem.java:896)
	at org.apache.hadoop.util.LambdaUtils.eval(LambdaUtils.java:52)
	at org.apache.hadoop.fs.ChecksumFileSystem.openFileWithOptions(ChecksumFileSystem.java:894)
	at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4768)
	at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:115)
	at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
	at org.apache.spark.rdd.HadoopRDD$$anon$1.liftedTree1$1(HadoopRDD.scala:288)
	at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:287)
	at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:245)
	at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:97)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:245)
	... 42 more
Caused by: java.io.FileNotFoundException: File file:/Users/ninjeanne/Documents/mpcs/datastorm/data.nosync/ghcnd-all-canada/CA1AB000001.dly does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:160)
	at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:372)
	at org.apache.hadoop.fs.ChecksumFileSystem.lambda$openFileWithOptions$0(ChecksumFileSystem.java:896)
	at org.apache.hadoop.util.LambdaUtils.eval(LambdaUtils.java:52)
	at org.apache.hadoop.fs.ChecksumFileSystem.openFileWithOptions(ChecksumFileSystem.java:894)
	at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4768)
	at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:115)
	at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
	at org.apache.spark.rdd.HadoopRDD$$anon$1.liftedTree1$1(HadoopRDD.scala:288)
	at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:287)
	at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:245)
	at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:97)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more


### Wind

### Temperature

### Weather

# Part 2: Clean up the data
## CSV to Parquet