# Data from NOAA and climod2
* https://www.ncdc.noaa.gov/cdo-web/
* http://climod2.nrcc.cornell.edu/
* Note:
    * T: Trace amount - less than 0.01" precipitation; less than 0.1" snowfall; less than 1" snow depth.
    * S: Subsequent - indicates the observation is missing, but is included in a subsequent value.
    * A: Accumulated - indicates an accumulated value which includes the current day and any immediately preceding missing days (starting with a day flagged by "S").
    * M: Missing.

___

# Importing Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import os
sns.set()

In [2]:
# Load the daily-observation CSV (presumably the NOAA CDO export linked at the top
# of the notebook). skipinitialspace=True discards blanks that follow a delimiter.
df = pd.read_csv(
    'Bradley_1_8_1942_2_8_2023.csv',
    skipinitialspace=True,
)

In [5]:
# Remove the STATION column; every remaining column is kept unchanged.
df = df.drop(columns=['STATION'])

In [6]:
# Remove the NAME column; every remaining column is kept unchanged.
df = df.drop(columns=['NAME'])

In [7]:
# Derive calendar columns from DATE. The column is parsed into a DatetimeIndex
# once and reused, instead of re-parsing every row for each of the three parts.
noaa_dates = pd.DatetimeIndex(df['DATE'])
df['Year'] = noaa_dates.year
df['Month'] = noaa_dates.month
df['Day'] = noaa_dates.day

In [12]:
# Print, for each column, the share of rows that are NaN.
# The value is a fraction between 0 and 1 (NaN count / row count), even though
# the printed label says "percent".
row_count = len(df)
for col in df.columns:
    nan_count = df[col].isna().sum()
    print(col, 'percent NaN:', nan_count / row_count)

DATE percent NaN: 0.0
ACMH percent NaN: 0.5988958824149935
ACSH percent NaN: 0.5987200675129224
AWND percent NaN: 0.4979078026653539
FMTM percent NaN: 0.6431660747564963
FRGT percent NaN: 0.9998593480783431
GAHT percent NaN: 0.9998945110587574
PGTM percent NaN: 0.5575090544674567
PRCP percent NaN: 0.0483842610499666
PSUN percent NaN: 0.7560040789057281
SNOW percent NaN: 0.12792292274693204
SNWD percent NaN: 0.30440592144590173
TAVG percent NaN: 0.7311086887724604
TMAX percent NaN: 0.04901719469742255
TMIN percent NaN: 0.04898203171700834
TSUN percent NaN: 0.5607088856851506
WDF1 percent NaN: 0.8460916347269595
WDF2 percent NaN: 0.6551566510777453
WDF5 percent NaN: 0.655754421744787
WDFG percent NaN: 0.7296670065754773
WDFM percent NaN: 0.7532965294138331
WESD percent NaN: 0.9402229332958262
WSF1 percent NaN: 0.846021308766131
WSF2 percent NaN: 0.6551566510777453
WSF5 percent NaN: 0.655754421744787
WSFG percent NaN: 0.729456028692992
WSFM percent NaN: 0.7532965294138331
WT01 percent NaN

# Drop
* ['ACMH','ACSH','AWND','FMTM','FRGT','GAHT','PGTM','PSUN','TSUN','WDF1','WDF2','WDF5','WDFG','WDFM','WESD','WSF1','WSF2','WSF5','WSFG','WSFM','WT01','WT02','WT03','WT04','WT05','WT06','WT07','WT08','WT09','WT11','WT12','WT13','WT14','WT15','WT16','WT17','WT18','WT19','WT21','WT22','WV01','WV03']

# Keep
* DATE,ACMH,ACSH,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TSUN,WDF1,WDF2,WDF5,WDFG,WDFM,
WESD,WSF1,WSF2,WSF5,WSFG,WSFM,WT01,WT08,WT16,Year,Month,Day

In [14]:
# Drop the columns that are not carried forward in the analysis.
# Each label is listed exactly once; a label missing from the frame still raises
# KeyError (pandas' default), so a typo here is not silently ignored.
df = df.drop(columns=[
    'ACMH', 'ACSH', 'AWND', 'FMTM', 'FRGT', 'GAHT', 'PGTM', 'PSUN', 'TSUN',
    'WDF1', 'WDF2', 'WDF5', 'WDFG', 'WDFM', 'WESD',
    'WSF1', 'WSF2', 'WSF5', 'WSFG', 'WSFM',
    'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT07', 'WT08', 'WT09',
    'WT11', 'WT12', 'WT13', 'WT14', 'WT15', 'WT16', 'WT17', 'WT18', 'WT19',
    'WT21', 'WT22', 'WV01', 'WV03',
])

In [15]:
# Display (number of rows, number of columns) of df at this point in the notebook.
df.shape

(28439, 10)

In [16]:
# Repeat the per-column NaN report on the current (trimmed) frame.
# Printed values are fractions in [0, 1], not percentages.
n_rows = len(df)
for col in df.columns:
    n_nan = df[col].isna().sum()
    print(col, 'percent NaN:', n_nan / n_rows)

DATE percent NaN: 0.0
PRCP percent NaN: 0.0483842610499666
SNOW percent NaN: 0.12792292274693204
SNWD percent NaN: 0.30440592144590173
TAVG percent NaN: 0.7311086887724604
TMAX percent NaN: 0.04901719469742255
TMIN percent NaN: 0.04898203171700834
Year percent NaN: 0.0
Month percent NaN: 0.0
Day percent NaN: 0.0


In [17]:
# Load the second data file (presumably the climod2 / Cornell export linked at the
# top of the notebook): comma-separated, column names taken from the first row.
# skipinitialspace is not set here, so string values keep any blank that follows
# the delimiter (the ' M' value displayed further down shows this).
df_cornell = pd.read_csv(
    'Bradley_Cornell_1_1_1940_2_13_2023.txt',
    header=0,
    sep=',',
)

In [18]:
# Remove every blank from the header labels (leading, trailing and inner), using a
# literal (non-regex) replacement on the column Index.
df_cornell.columns = df_cornell.columns.str.replace(' ', '', regex=False)

In [19]:
# Show the first row as a Series (one entry per column) to inspect the raw values.
df_cornell.iloc[0]

Date                       1940-01-01
MaxTemperature                      M
MaxTemperatureNormal             36.6
MaxTemperatureDeparture             M
MinTemperature                      M
MinTemperatureNormal             20.8
MinTemperatureDeparture             M
AvgTemperature                      M
AvgTemperatureNormal             28.7
AvgTemperatureDeparture             M
AtObsTemperature                    M
Precipitation                       M
PrecipitationNormal              0.11
PrecipitationDeparture              M
Snowfall                            M
SnowfallNormal                    0.4
SnowfallDeparture                   M
SnowDepth                           M
HDD                                 M
HDDNormal                        36.3
HDDDeparture                        M
CDD                                 M
CDDNormal                         0.0
CDDDeparture                        M
GDD                                 M
Name: 0, dtype: object

In [22]:
# Derive calendar columns from Date. The column is parsed into a DatetimeIndex
# once and reused, instead of re-parsing every row for each of the three parts.
cornell_dates = pd.DatetimeIndex(df_cornell['Date'])
df_cornell['Year'] = cornell_dates.year
df_cornell['Month'] = cornell_dates.month
df_cornell['Day'] = cornell_dates.day

In [23]:
# Per-column share of NaN rows in df_cornell (a fraction in [0, 1]).
# Only real NaN values are counted here; flag strings such as ' M' are ordinary
# strings to pandas and are not treated as missing by isna().
cornell_rows = len(df_cornell)
for col in df_cornell.columns:
    nan_rows = df_cornell[col].isna().sum()
    print(col, 'percent NaN:', nan_rows / cornell_rows)

Date percent NaN: 0.0
MaxTemperature percent NaN: 0.0
MaxTemperatureNormal percent NaN: 0.0
MaxTemperatureDeparture percent NaN: 0.0
MinTemperature percent NaN: 0.0
MinTemperatureNormal percent NaN: 0.0
MinTemperatureDeparture percent NaN: 0.0
AvgTemperature percent NaN: 0.0
AvgTemperatureNormal percent NaN: 0.0
AvgTemperatureDeparture percent NaN: 0.0
AtObsTemperature percent NaN: 0.0
Precipitation percent NaN: 0.0
PrecipitationNormal percent NaN: 0.0
PrecipitationDeparture percent NaN: 0.0
Snowfall percent NaN: 0.0
SnowfallNormal percent NaN: 0.0
SnowfallDeparture percent NaN: 0.0
SnowDepth percent NaN: 0.0
HDD percent NaN: 0.0
HDDNormal percent NaN: 0.0
HDDDeparture percent NaN: 0.0
CDD percent NaN: 0.0
CDDNormal percent NaN: 0.0
CDDDeparture percent NaN: 0.0
GDD percent NaN: 0.0
Year percent NaN: 0.0
Month percent NaN: 0.0
Day percent NaN: 0.0


In [33]:
# Display the first row of df_cornell as a one-row table.
df_cornell.head(1)

Unnamed: 0,Date,MaxTemperature,MaxTemperatureNormal,MaxTemperatureDeparture,MinTemperature,MinTemperatureNormal,MinTemperatureDeparture,AvgTemperature,AvgTemperatureNormal,AvgTemperatureDeparture,...,HDD,HDDNormal,HDDDeparture,CDD,CDDNormal,CDDDeparture,GDD,Year,Month,Day
0,1940-01-01,M,36.6,M,M,20.8,M,M,28.7,M,...,M,36.3,M,M,0.0,M,M,1940,1,1


In [35]:
# Inspect one raw scalar (row 0, column position 1) to see its exact string form,
# including any leading blank.
df_cornell.iat[0, 1]

' M'

In [36]:
# Per-column share of rows whose value is exactly ' M' (the "Missing" flag from the
# legend at the top of the notebook). The match includes the leading blank because
# that is how the raw value appears in the cell displayed just above.
total_rows = len(df_cornell)
for col in df_cornell.columns:
    is_missing_flag = df_cornell[col].isin([' M'])
    # int(...) keeps the division in plain Python numbers.
    print(col, int(is_missing_flag.sum()) / total_rows)

Date 0.0
MaxTemperature 0.10912384716732543
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.10912384716732543
MinTemperature 0.10909090909090909
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.10909090909090909
AvgTemperature 0.10922266139657444
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.10922266139657444
AtObsTemperature 1.0
Precipitation 0.10843214756258235
PrecipitationNormal 0.000691699604743083
PrecipitationDeparture 0.10902503293807642
Snowfall 0.18158761528326745
SnowfallNormal 0.000691699604743083
SnowfallDeparture 0.18214756258234518
SnowDepth 0.2326086956521739
HDD 0.10922266139657444
HDDNormal 0.0
HDDDeparture 0.10922266139657444
CDD 0.10922266139657444
CDDNormal 0.0
CDDDeparture 0.10922266139657444
GDD 0.10922266139657444
Year 0.0
Month 0.0
Day 0.0


In [37]:
# Per-column share of rows whose value is exactly ' T' (the "Trace amount" flag
# from the legend at the top of the notebook), leading blank included.
total_rows = len(df_cornell)
for col in df_cornell.columns:
    is_trace_flag = df_cornell[col].isin([' T'])
    # int(...) keeps the division in plain Python numbers.
    print(col, int(is_trace_flag.sum()) / total_rows)

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.12262845849802372
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.056357048748353095
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.03593544137022398
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0


In [41]:
# Per-column share of rows whose value is exactly ' S' (the "Subsequent" flag from
# the legend at the top of the notebook), leading blank included.
# NOTE(review): this is an exact match; a flag attached to other text in the same
# cell would not be counted - confirm how the export encodes S.
total_rows = len(df_cornell)
for col in df_cornell.columns:
    is_subsequent_flag = df_cornell[col].isin([' S'])
    # int(...) keeps the division in plain Python numbers.
    print(col, int(is_subsequent_flag.sum()) / total_rows)

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.0
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.0
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.0
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0


In [42]:
# Per-column share of rows whose value is exactly ' A' (the "Accumulated" flag from
# the legend at the top of the notebook), leading blank included.
# NOTE(review): this is an exact match; if the export attaches the flag to the
# accumulated number (e.g. '0.50A') those rows are not counted - confirm.
total_rows = len(df_cornell)
for col in df_cornell.columns:
    is_accumulated_flag = df_cornell[col].isin([' A'])
    # int(...) keeps the division in plain Python numbers.
    print(col, int(is_accumulated_flag.sum()) / total_rows)

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.0
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.0
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.0
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0
