# Read CSV File to DataFrame

In [None]:
# imports
import pandas as pd

# data path
# Raw string literal: the Windows path is full of backslashes (\m, \D, \S,
# \W) that a normal string literal would treat as (invalid) escape
# sequences, which triggers a DeprecationWarning/SyntaxWarning on modern
# Python. The resulting string value is unchanged.
dataPath = r'D:\mattp\Documents\School\SeniorYear\DataAnalysis\WaterStreetPA_WthrMETAR_SNOW-2018.csv'

# read in data; header=0 tells pandas to take the column names from the
# first row of the file
data = pd.read_csv(dataPath, header=0)

# Investigate Each Feature

## Row Number

In [None]:
# Preprocessing info to list for this feature: type/processing method,
# null values, null value type, number of null values.
# The 'Unnamed: 0' column is pulled out and copied into a plain Python list
# (presumably this is the CSV's unlabeled row-index column -- the section
# heading calls it the row number).
row_num_df = data.get('Unnamed: 0')
row_num_list = row_num_df.tolist()

# Summary stats to list for this feature: min, max, mean, median, std dev.
import numpy

# Smallest and largest row number, reported as a single range.
row_min, row_max = min(row_num_list), max(row_num_list)
print('Row Range: {}-{}'.format(row_min, row_max))

## Datetime

In [None]:
import datetime as dt

# Timestamps from the 'datetime' column, copied into a plain Python list.
datetime_df = data.get('datetime')
datetime_list = datetime_df.tolist()

# Parse the first entry with the expected layout; strptime raises if the
# value does not match. The parsed object is not used further in this cell.
first_entry_layout = '%Y-%m-%d %H:%M:%S'
dt_obj = dt.datetime.strptime(datetime_list[0], first_entry_layout)

# Datetime range. min/max compare the list entries exactly as stored in the
# list, not the parsed datetime objects.
datetime_min, datetime_max = min(datetime_list), max(datetime_list)
print('Datetime Range: {} to {}'.format(datetime_min, datetime_max))

# TODO: plot histogram of time deltas

## PM 2.5

In [None]:
# PM 2.5 readings from the 'PM25' column.
pm_df = data.get('PM25')

# Report whether any reading is missing.
is_null_values = pm_df.isnull().values.any()
print('Null Values: {}\n'.format(is_null_values))


import matplotlib.pyplot as plt

# Summary statistics: min, max, mean, median and standard deviation.
df_min, df_max = pm_df.min(), pm_df.max()
df_mean, df_median = pm_df.mean(), pm_df.median()
df_stddev = pm_df.std()
pm_stats_template = 'PM 2.5 Summary Stats- \nmin:{}\nmax:{}\nmean:{}\nmedian:{}\nstd dev:{}\n'
print(pm_stats_template.format(df_min, df_max, df_mean, df_median, df_stddev))

# Most frequent reading (entry 0 of mode()) and how many times it occurs.
# NOTE(review): .get(0, 1) falls back to the literal value 1 when mode()
# has no entry at label 0 -- confirm that fallback is intended.
df_mode = pm_df.mode().get(0, 1)
pm_value_counts = pm_df.value_counts()
num_mode = pm_value_counts[df_mode]
print('Mode Value: {}'.format(df_mode))
print('Mode Occurences: {}'.format(num_mode))

# Histograms: the full 0-1400 span first, then three zoomed-in windows.
pm_hist_settings = (
    (140, (0, 1400)),
    (50, (0, 50)),
    (50, (50, 100)),
    (13, (100, 1400)),
)
for pm_bin_count, pm_value_window in pm_hist_settings:
    plt.hist(pm_df, bins=pm_bin_count, range=pm_value_window)
    plt.show()

# TODO: Plot histogram of the highest PM 2.5 reading of each day

## Temperature

In [None]:
# Temperature readings from the 'temp' column, plus a plain-list copy.
temp_df = data.get('temp')
temp_list = temp_df.tolist()

# Report whether any reading is missing.
is_null_values = temp_df.isnull().values.any()
print('Null Values: {}\n'.format(is_null_values))

# Summary statistics: min, max, mean, median and standard deviation.
df_min, df_max = temp_df.min(), temp_df.max()
df_mean, df_median = temp_df.mean(), temp_df.median()
df_stddev = temp_df.std()
temp_stats_template = 'Temperature Summary Stats- \nmin:{}\nmax:{}\nmean:{}\nmedian:{}\nstd dev:{}\n'
print(temp_stats_template.format(df_min, df_max, df_mean, df_median, df_stddev))

# Most frequent value (entry 0 of mode()) and how many times it occurs.
# NOTE(review): .get(0, 1) falls back to the literal value 1 when mode()
# has no entry at label 0 -- confirm that fallback is intended.
df_mode = temp_df.mode().get(0, 1)
temp_value_counts = temp_df.value_counts()
num_mode = temp_value_counts[df_mode]
print('Mode Value: {}'.format(df_mode))
print('Mode Occurences: {}\n'.format(num_mode))

# Full frequency table of the observed values.
print('Value counts:\n{}'.format(temp_value_counts))

# Histogram of all temperature readings.
plt.hist(temp_df, bins=100)
plt.show()

## Dewpoint

In [None]:
# Dewpoint readings from the 'dewpoint' column, plus a plain-list copy.
dewpoint_df = data.get('dewpoint')
dewpoint_list = dewpoint_df.tolist()

# Report whether any reading is missing.
is_null_values = dewpoint_df.isnull().values.any()
print('Null Values: {}\n'.format(is_null_values))

# Summary statistics: min, max, mean, median and standard deviation.
df_min, df_max = dewpoint_df.min(), dewpoint_df.max()
df_mean, df_median = dewpoint_df.mean(), dewpoint_df.median()
df_stddev = dewpoint_df.std()
dewpoint_stats_template = 'Dewpoint Summary Stats- \nmin:{}\nmax:{}\nmean:{}\nmedian:{}\nstd dev:{}\n'
print(dewpoint_stats_template.format(df_min, df_max, df_mean, df_median, df_stddev))

# Most frequent value (entry 0 of mode()) and how many times it occurs.
# NOTE(review): .get(0, 1) falls back to the literal value 1 when mode()
# has no entry at label 0 -- confirm that fallback is intended.
df_mode = dewpoint_df.mode().get(0, 1)
dewpoint_value_counts = dewpoint_df.value_counts()
num_mode = dewpoint_value_counts[df_mode]
print('Mode Value: {}'.format(df_mode))
print('Mode Occurences: {}\n'.format(num_mode))

# Full frequency table of the observed values.
print('Value counts:\n{}'.format(dewpoint_value_counts))

# Histogram of all dewpoint readings.
plt.hist(dewpoint_df, bins=100)
plt.show()

## Relative Humidity

In [None]:
# Relative humidity readings from the 'RH' column, plus a plain-list copy.
rh_df = data.get('RH')
rh_list = rh_df.tolist()

# Report whether any reading is missing.
is_null_values = rh_df.isnull().values.any()
print('Null Values: {}\n'.format(is_null_values))

# Summary statistics: min, max, mean, median and standard deviation.
df_min, df_max = rh_df.min(), rh_df.max()
df_mean, df_median = rh_df.mean(), rh_df.median()
df_stddev = rh_df.std()
rh_stats_template = 'Relative Humidity Summary Stats- \nmin:{}\nmax:{}\nmean:{}\nmedian:{}\nstd dev:{}\n'
print(rh_stats_template.format(df_min, df_max, df_mean, df_median, df_stddev))

# Most frequent value (entry 0 of mode()) and how many times it occurs.
# NOTE(review): .get(0, 1) falls back to the literal value 1 when mode()
# has no entry at label 0 -- confirm that fallback is intended.
df_mode = rh_df.mode().get(0, 1)
rh_value_counts = rh_df.value_counts()
num_mode = rh_value_counts[df_mode]
print('Mode Value: {}'.format(df_mode))
print('Mode Occurences: {}\n'.format(num_mode))

# Full frequency table of the observed values.
print('Value counts:\n{}'.format(rh_value_counts))

# Histogram of all relative humidity readings.
plt.hist(rh_df, bins=100)
plt.show()

## Wind Direction

## Wind Speed

In [None]:
# Wind speed readings from the 'windMPH' column, plus a plain-list copy.
wind_speed_df = data.get('windMPH')
wind_speed_list = wind_speed_df.tolist()

# Report whether any reading is missing.
is_null_values = wind_speed_df.isnull().values.any()
print('Null Values: {}\n'.format(is_null_values))

# Wind speed summary statistics: min, max, mean, median and standard deviation.
df_min, df_max = wind_speed_df.min(), wind_speed_df.max()
df_mean, df_median = wind_speed_df.mean(), wind_speed_df.median()
df_stddev = wind_speed_df.std()
wind_stats_template = 'Wind Speed Summary Stats- \nmin:{}\nmax:{}\nmean:{}\nmedian:{}\nstd dev:{}\n'
print(wind_stats_template.format(df_min, df_max, df_mean, df_median, df_stddev))

# Most frequent value (entry 0 of mode()) and how many times it occurs.
# NOTE(review): .get(0, 1) falls back to the literal value 1 when mode()
# has no entry at label 0 -- confirm that fallback is intended.
df_mode = wind_speed_df.mode().get(0, 1)
wind_value_counts = wind_speed_df.value_counts()
num_mode = wind_value_counts[df_mode]
print('Mode Value: {}'.format(df_mode))
print('Mode Occurences: {}\n'.format(num_mode))

# Full frequency table of the observed values.
print('Value counts:\n{}'.format(wind_value_counts))

# Histograms: the 0-20 window first, then the 20-35 window.
wind_hist_settings = (
    (40, (0, 20)),
    (30, (20, 35)),
)
for wind_bin_count, wind_value_window in wind_hist_settings:
    plt.hist(wind_speed_df, bins=wind_bin_count, range=wind_value_window)
    plt.show()

## Precipitation

## Mean Sea Level Pressure (MSLP)

## Visibility

## Gust

## Weather Code (WX Code)
https://graphical.weather.gov/definitions/defineWxNoTable.html

In [None]:
# Weather codes from the 'wxcodes' column, plus a plain-list copy.
wx_code_df = data.get('wxcodes')
wx_code_list = wx_code_df.tolist()

# Three probes for missing entries, printed in order: pandas' own null
# check, a literal None in the list, and an empty string in the list.
wx_missing_probes = (
    wx_code_df.isnull().values.any(),
    None in wx_code_list,
    "" in wx_code_list,
)
for is_null_values in wx_missing_probes:
    print(is_null_values)

# Number of entries in the column.
len_list = len(wx_code_list)
print(len_list)

# Show the first entry and its Python type.
first_wx_code = wx_code_list[0]
print(first_wx_code)

print(type(first_wx_code))

## Snow Depth

## Snow Temperature

## Snow Density

## Forecasted from 0 UTC

## Few Cloud Layer (FEW)
Few cloud layer: between 0/8 and 2/8 of the sky is obscured by cloud.

## Scattered Cloud Layer (SCT)
Scattered cloud layer: 3/8 to 4/8 of the sky is covered by cloud.
http://www.moratech.com/aviation/metaf-abbrev.html

## Broken Cloud Layer (BKN)
Broken cloud layer: 5/8 to 7/8 of the sky is covered by cloud.

## Overcast Cloud Layer (OVC)
Overcast cloud layer: 8/8 of the sky is covered by cloud.

## Vertical Visibility (VV)
Vertical Visibility, indefinite ceiling

## Cloud Rating
TODO: Determine meaning