In [49]:
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.functions import col,isnan,when,count

def main():
    """Join weather observations with station metadata and write the result as CSV.

    Reads the station list and the cleaned weather observations from Google
    Cloud Storage using explicit schemas, left-joins the station columns onto
    every weather row by ``station_id``, discards rows without a usable
    ``station_id`` or ``date``, prints a 10-row preview and saves the joined
    data as CSV with a header row.

    Relies on the module-level ``spark`` session created by the script entry
    point; it is not passed in as a parameter.
    """
    stations_schema = types.StructType([
        types.StructField("Station ID", types.StringType()),
        types.StructField("Latitude", types.FloatType()),
        types.StructField("Longitude", types.FloatType()),
        types.StructField("Elevation", types.FloatType()),
        types.StructField("State", types.StringType()),
    ])

    weather_schema = types.StructType([
        types.StructField('station_id', types.StringType()),
        types.StructField('date', types.StringType()),
        types.StructField('PRCP', types.FloatType()),
        types.StructField('SNOW', types.FloatType()),
        types.StructField('SNWD', types.FloatType()),
        types.StructField('TMAX', types.FloatType()),
        types.StructField('TMIN', types.FloatType()),
    ])

    stations = (
        spark.read.format('csv')
        .schema(stations_schema)
        .load("gs://big-data-1-project-storage/onlyBCStationNames")
        # Rename so the join key has the same name on both sides.
        .withColumnRenamed('Station ID', 'station_id')
    )
    weather = (
        spark.read.format('csv')
        .schema(weather_schema)
        .load("gs://big-data-1-project-storage/cleaned-data/weather_cleaned_1958Onwards.csv")
    )

    # The station table is broadcast to every executor so the join does not
    # need to shuffle the weather data.
    data = weather.join(functions.broadcast(stations), on=['station_id'], how='left')

    # Comparing against the literal string "null" removes rows holding that
    # text; rows holding a real NULL are removed as well, because the
    # comparison then evaluates to NULL and filter() keeps only true rows.
    data = (
        data
        .filter(data.station_id != "null")
        .filter(data.date != "null")
    )

    # show() prints the preview itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    data.show(10)

    data.write.save("gs://big-data-1-project-storage/cleaned-data/weather_stations.csv", format='csv',header=True)


if __name__ == '__main__':
    # Create (or reuse) the Spark session that main() reads as a global.
    session_builder = SparkSession.builder.appName('weather and station join')
    spark = session_builder.getOrCreate()
    # Keep the driver log output down to warnings and errors.
    sc = spark.sparkContext
    sc.setLogLevel('WARN')
    main()


+-----------+--------+----+----+----+----+----+--------+---------+---------+-----+
| station_id|    date|PRCP|SNOW|SNWD|TMAX|TMIN|Latitude|Longitude|Elevation|State|
+-----------+--------+----+----+----+----+----+--------+---------+---------+-----+
|CA001010235|19730707|null|null| 0.0| 0.0|null|    48.4|-123.4833|     17.0|   BC|
|CA001010235|19780330|14.0| 8.0| 0.0| 0.0|null|    48.4|-123.4833|     17.0|   BC|
|CA001010595|19690607|null|null| 0.0| 0.0|null| 48.5833|-123.5167|     85.0|   BC|
|CA001010720|19670815|28.9|12.8| 0.0| 0.0|null|    48.5|   -124.0|    351.0|   BC|
|CA001010960|19910618|null|null| 0.0| 0.0| 0.0|    48.6|-123.4667|     38.0|   BC|
|CA001010960|19960115|null|null|10.1| 0.0| 0.0|    48.6|-123.4667|     38.0|   BC|
|CA001010961|19751130|null|null|10.9|15.0|null| 48.5667|  -123.45|     31.0|   BC|
|CA001011467|19750419|null|null| 0.5| 0.0|null| 48.5833|-123.4167|     53.0|   BC|
|CA001011467|19870719|null|null| 0.0| 0.0| 0.0| 48.5833|-123.4167|     53.0|   BC|
|CA0