# Setup Weather Notebook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.cm
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr

import matplotlib.lines as mlines
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import time

import findspark
findspark.init()

from geopy.distance import geodesic

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, sum, avg, udf, to_timestamp, date_trunc
from pyspark.sql.functions import year, month, hour, dayofweek
from pyspark.sql.functions import round, concat, col, lit

from pyspark.sql.types import FloatType, StructType, IntegerType, StringType, DoubleType, StructField, TimestampType, DateType
from pyspark.sql.types import TimestampType

import random

import os
import datetime as dt

# Get the active Spark session, or create one with the application name "CB".
spark1 = SparkSession.builder.appName("CB").getOrCreate()

print("modules imported")

# Seed value kept for downstream cells; nothing in this cell applies it to an RNG.
randomSeed = 1984

# Project root. The default is the original author's local folder; set the
# WEATHER_PROJECT_DIR environment variable to run the notebook from another
# location without editing this cell.
pathProject = os.environ.get("WEATHER_PROJECT_DIR", "/users/sajudson/Dropbox/WPI/DS504/project/")
if not pathProject.endswith("/"):
    # Later cells build file names by plain string concatenation, so every
    # path must end with a separator.
    pathProject += "/"

pathWeather = pathProject + "weather/"
pathData = pathProject + "data/"
pathFigure = pathProject + "figures/"

plt.style.use('ggplot')

modules imported


In [2]:
# Both data sets use the NYC weather data, so every weather artifact name
# carries the "NYC" prefix.
weather_file_type = 'csv'
weatherRaw, weatherFeatures = [
    "NYC"+suffix for suffix in ('weatherRaw', 'weatherFeatures')
]


# Weather

 - Define the schema of the weather data file
 - Load the weather data into a dataframe
 - Convert the date string to a timestamp
 - Remove unneeded columns



In [5]:
t0 = time.time()
weatherDataFileName = 'weather_nyc_metblue'
weatherDataFileExt = ".csv"

# Schema of the raw weather file: the observation time arrives split into
# five integer columns, followed by the measurement columns.
weatherSchema1 = StructType([
    StructField('year', IntegerType(), False),
    StructField('month', IntegerType(), False),
    StructField('day', IntegerType(), False),
    StructField('hour', IntegerType(), False),
    StructField('minute', IntegerType(), False),
    StructField('temp', DoubleType(), False),
    StructField('humidity', DoubleType(), True),
    StructField('total_precip', DoubleType(), True),
    StructField('snow', DoubleType(), True),
    StructField('cloud_cover', DoubleType(), True),
    StructField('wind_speed', DoubleType(), True),
    StructField('wind_direction', DoubleType(), True),
    StructField('wind_gust', DoubleType(), True),
])

# CSV options
infer_schema = "false"         # the explicit schema above is used instead
first_row_is_header = "true"
delimiter = ";"                # the raw file is read as semicolon-separated

# The applied options are for CSV files. For other file types, these will be ignored.
weather = (
    spark1.read.format(weather_file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(weatherSchema1)
    .load(pathWeather + weatherDataFileName + weatherDataFileExt)
)
print("weather file loaded",time.time()-t0)


@udf('string')
def trimDateTimeUTC(d):
    """Return ``d`` without its last 10 characters; a null input stays null.

    Not used by this cell. Presumably it strips a trailing UTC suffix from a
    ``dt_iso``-style string of an earlier data format -- TODO confirm before
    relying on it.
    """
    if d is None:
        return None
    return d[:-10]


# Rebuild a timestamp string from the integer parts. Concatenating integers
# gives unpadded fields (e.g. "2015-10-1 0:0:00"), so the parse pattern uses
# single-letter fields, which accept one- or two-digit values. A fixed-width
# pattern such as "yyyy-MM-dd HH:mm:ss" is rejected by Spark 3's strict
# datetime parser for these strings.
dateConcat = concat(col("year"), lit("-"), col("month"), lit("-"), col('day'))
timeConcat = concat(col("hour"), lit(":"), col("minute"), lit(':00'))
datetimeConcat = concat(dateConcat, lit(" "), timeConcat)

# to_timestamp already yields a timestamp column, so no extra cast is needed.
weather = weather.withColumn("datetime", to_timestamp(datetimeConcat, "yyyy-M-d H:m:ss"))

# NOTE: temp is kept exactly as delivered in the file; a Kelvin -> Celsius
# conversion (temp - 273.15) was previously sketched here but is not applied.

# Remove extraneous weather features: the date/time parts are now folded into
# `datetime`, and the wind and snow measurements are not kept.
weatherDropFeatures1 = ['year', 'month', 'day', 'hour', 'minute',
                        'wind_gust', 'wind_speed', 'snow', 'wind_direction']

weather = weather.drop(*weatherDropFeatures1)
print("time stamps converted, extra columns dropped",time.time()-t0)
weather.show()
# Mark the trimmed frame for caching so later actions on `weather` can reuse it.
weather.cache()


weather file loaded 2.4327681064605713
time stamps converted, extra columns dropped 3.6369149684906006
+-----+--------+------------+-----------+-------------------+
| temp|humidity|total_precip|cloud_cover|           datetime|
+-----+--------+------------+-----------+-------------------+
|61.12|    70.0|         0.0|      100.0|2015-10-01 00:00:00|
|60.11|    69.0|         0.0|      100.0|2015-10-01 01:00:00|
|58.99|    71.0|         0.0|      100.0|2015-10-01 02:00:00|
|57.68|    74.0|         0.0|      100.0|2015-10-01 03:00:00|
|56.46|    75.0|         0.0|      100.0|2015-10-01 04:00:00|
|55.48|    76.0|         0.0|      100.0|2015-10-01 05:00:00|
|54.69|    75.0|         0.0|      100.0|2015-10-01 06:00:00|
|54.62|    64.0|         0.0|      100.0|2015-10-01 07:00:00|
| 54.8|    63.0|         0.0|      100.0|2015-10-01 08:00:00|
|55.72|    62.0|         0.0|      100.0|2015-10-01 09:00:00|
|57.07|    60.0|         0.0|      100.0|2015-10-01 10:00:00|
|58.15|    59.0|         0.0|

DataFrame[temp: double, humidity: double, total_precip: double, cloud_cover: double, datetime: timestamp]

In [9]:
# Name of the feature output: "<weatherFeatures>.<weather_file_type>".
weatherFilename = ".".join([weatherFeatures, weather_file_type])

# Write the feature frame as comma-separated CSV with a header row, replacing
# any earlier output at the same path, and print the elapsed seconds.
t0 = time.time()
(
    weather.write
    .mode("overwrite")
    .option("header", True)
    .option("sep", ",")
    .csv(pathWeather + weatherFilename)
)
print(time.time()-t0)

# Summary statistics of the columns that were just written.
weather.describe().show()

0.6012887954711914
+-------+------------------+-----------------+-------------------+------------------+
|summary|              temp|         humidity|       total_precip|       cloud_cover|
+-------+------------------+-----------------+-------------------+------------------+
|  count|             26304|            26304|              26304|             26304|
|   mean| 55.70662902980582|69.73696015815085|0.11632831508515679| 53.49331660583943|
| stddev|18.188485165737887|18.09230770784709| 0.5428439737579306|45.858469289829046|
|    min|             -1.11|             22.0|                0.0|               0.0|
|    max|             98.41|            100.0|               14.3|             100.0|
+-------+------------------+-----------------+-------------------+------------------+



In [12]:
# Column layout used to read the weather feature file back in: four
# measurement columns followed by the observation timestamp.
weatherFeatureSchema = (
    StructType()
    .add('temp', DoubleType(), False)
    .add('humidity', DoubleType(), True)
    .add('total_precip', DoubleType(), True)
    .add('cloud_cover', DoubleType(), True)
    .add('datetime', TimestampType(), True)
)

# CSV options (the delimiter here is a comma)
infer_schema, first_row_is_header, delimiter = "false", "true", ","

# These reader options only matter for CSV sources; other formats ignore them.
weather2 = (
    spark1.read
    .format(weather_file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .schema(weatherFeatureSchema)
    .load(pathWeather + weatherFilename)
)

# Summary statistics of the reloaded frame.
weather2.describe().show()

+-------+------------------+-----------------+-------------------+------------------+
|summary|              temp|         humidity|       total_precip|       cloud_cover|
+-------+------------------+-----------------+-------------------+------------------+
|  count|             26304|            26304|              26304|             26304|
|   mean| 55.70662902980582|69.73696015815085|0.11632831508515679| 53.49331660583943|
| stddev|18.188485165737887|18.09230770784709| 0.5428439737579306|45.858469289829046|
|    min|             -1.11|             22.0|                0.0|               0.0|
|    max|             98.41|            100.0|               14.3|             100.0|
+-------+------------------+-----------------+-------------------+------------------+

