# Data from NOAA and climod2
* https://www.ncdc.noaa.gov/cdo-web/
* http://climod2.nrcc.cornell.edu/
* Note:
    * T: Trace amount- Less than 0.01" precipitation; less than 0.1" snowfall; less than 1" snow depth.
    * S: Subsequent- Indicates the observation is missing, but is included in a subsequent value.
    * A: Accumulated- Indicates an accumulated value which includes the current day and any immediately preceeding missing days (starting with a day flagged by "S").
    * M: Missing.

___

# Importing Libraries and Data

In [255]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import os
sns.set()

# Import NOAA data and grab useful columns

In [256]:
df_noaa_1_8_2942_2_8_2023 = pd.read_csv('Bradley_1_8_1942_2_8_2023.csv', skipinitialspace = True)

In [257]:
df_noaa_2023 = pd.read_csv('NOAA_1_1_2023_12_31_2023.csv', skipinitialspace = True)
df_noaa_2023 = df_noaa_2023.loc[df_noaa_2023['DATE']> '2023-02-08']

In [258]:
df = pd.concat([df_noaa_1_8_2942_2_8_2023, df_noaa_2023], ignore_index = True)

In [259]:
df = df.drop(['STATION'], axis = 1)

In [260]:
df = df.drop(['NAME'], axis = 1)

In [261]:
df['Year'] = pd.DatetimeIndex(df['DATE']).year
df['Month'] = pd.DatetimeIndex(df['DATE']).month
df['Day'] = pd.DatetimeIndex(df['DATE']).day

In [262]:
df = df.sort_values(by=['DATE'])

In [263]:
for col in df.columns:
    print(col,'percent NaN:', df[col].isna().sum()/len(df))

DATE percent NaN: 0.0
ACMH percent NaN: 0.6034416826003824
ACSH percent NaN: 0.6032678602468278
AWND percent NaN: 0.4922649052668173
FMTM percent NaN: 0.6472101512254476
FRGT percent NaN: 0.9998609421171563
GAHT percent NaN: 0.9998957065878672
PGTM percent NaN: 0.5625239005736138
PRCP percent NaN: 0.047835911698244395
PSUN percent NaN: 0.758769337736833
SNOW percent NaN: 0.1264731444463758
SNWD percent NaN: 0.30095602294455065
TAVG percent NaN: 0.7228228750217278
TMAX percent NaN: 0.0484616721710412
TMIN percent NaN: 0.048426907700330264
TSUN percent NaN: 0.5656874674083087
WDF1 percent NaN: 0.8478359116982443
WDF2 percent NaN: 0.6477316182861116
WDF5 percent NaN: 0.6483226142881975
WDFG percent NaN: 0.7327307491743438
WDFM percent NaN: 0.7560924734920911
WESD percent NaN: 0.9409003997914132
WSF1 percent NaN: 0.8477663827568225
WSF2 percent NaN: 0.6477316182861116
WSF5 percent NaN: 0.6483226142881975
WSFG percent NaN: 0.7325221623500782
WSFM percent NaN: 0.7560924734920911
WT01 percent

# Drop
* ['ACMH ','ACSH ','AWND','FMTM','FRGT','GAHT','PGTM','PSUN','TSUN','WDF1','WDF2','WDF5','WDFG', 'WDFM','WESD','WSF1','WSF2','WSF5','WDFG','WDFM','WESD','WSF1','WSF2','WSF5','WSFG','WSFM', 'WT01','WT02','WT03','WT04','WT05','WT06','WT07','WT08','WT09', 'WT11','WT12','WT13','WT14','WT15','WT16','WT17','WT18','WT19', 'WT21','WT22','WV01','WV03']

# Keep
* DATE,ACMH,ACSH,AWND,FMTM,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TSUN,WDF1,WDF2,WDF5,WDFG,WDFM,
WESD,WSF1,WSF2,WSF5,WSFG,WSFM,WT01,WT08,WT16,Year,Month,Da

In [264]:
df = df.drop(['ACMH','ACSH','AWND','FMTM','FRGT','GAHT','PGTM','PSUN','TSUN','WDF1',
              'WDF2','WDF5','WDFG', 'WDFM','WESD','WSF1','WSF2','WSF5','WDFG','WDFM',
              'WESD','WSF1','WSF2','WSF5','WSFG','WSFM', 'WT01','WT02','WT03','WT04',
              'WT05','WT06','WT07','WT08','WT09', 'WT11','WT12','WT13','WT14','WT15',
              'WT16','WT17','WT18','WT19', 'WT21','WT22','WV01','WV03'], axis = 1).copy()

In [265]:
df.shape

(28765, 10)

In [266]:
# convert noaa columns to float
df['PRCP'] = pd.to_numeric(df['PRCP'], errors='coerce')
df['SNOW'] = pd.to_numeric(df['SNOW'], errors='coerce')
df['SNWD'] = pd.to_numeric(df['SNWD'], errors='coerce')
df['TAVG'] = pd.to_numeric(df['TAVG'], errors='coerce')
df['TMAX'] = pd.to_numeric(df['TMAX'], errors='coerce')
df['TMIN'] = pd.to_numeric(df['TMIN'], errors='coerce')
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Month'] = pd.to_numeric(df['Month'], errors='coerce')
df['Day'] = pd.to_numeric(df['Day'], errors='coerce')

### Any missing data in 2023
* No so I'll just use NOAA

In [267]:
df_2023 = df[df['DATE']>= '2023-01-01']

In [268]:
df_2023.head(1)

Unnamed: 0,DATE,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,Year,Month,Day
28400,2023-01-01,0.03,0.0,0.0,47.0,51.0,33.0,2023,1,1


In [269]:
print(df_2023[df_2023['PRCP'].isna()].shape)
print(df_2023[df_2023['SNOW'].isna()].shape)
print(df_2023[df_2023['SNWD'].isna()].shape)
print(df_2023[df_2023['TAVG'].isna()].shape)
print(df_2023[df_2023['TMAX'].isna()].shape)
print(df_2023[df_2023['TMIN'].isna()].shape)

(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)


In [270]:
for col in df.columns:
    print(col,'percent NaN:', df[col].isna().sum()/len(df))

DATE percent NaN: 0.0
PRCP percent NaN: 0.047835911698244395
SNOW percent NaN: 0.1264731444463758
SNWD percent NaN: 0.30095602294455065
TAVG percent NaN: 0.7228228750217278
TMAX percent NaN: 0.0484616721710412
TMIN percent NaN: 0.048426907700330264
Year percent NaN: 0.0
Month percent NaN: 0.0
Day percent NaN: 0.0


In [271]:
# lowercase all columns
df.columns = [x.lower() for x in df.columns]

# End Getting NOAA Data

___

# Import Cornell data and get useful columns

# NOTE: AvgTemperature is just average of min and max

In [272]:
df_cornell = pd.read_csv('Bradley_Cornell_1_1_1940_2_13_2023.txt', sep=',', header=0, skipinitialspace=True)

In [273]:
df_cornell.columns = [i.replace(' ','') for i in df_cornell.columns]

In [274]:
df_cornell = df_cornell.sort_values(by=['Date'])

In [275]:
df_cornell.iloc[0,]

Date                       1940-01-01
MaxTemperature                      M
MaxTemperatureNormal             36.6
MaxTemperatureDeparture             M
MinTemperature                      M
MinTemperatureNormal             20.8
MinTemperatureDeparture             M
AvgTemperature                      M
AvgTemperatureNormal             28.7
AvgTemperatureDeparture             M
AtObsTemperature                    M
Precipitation                       M
PrecipitationNormal              0.11
PrecipitationDeparture              M
Snowfall                            M
SnowfallNormal                    0.4
SnowfallDeparture                   M
SnowDepth                           M
HDD                                 M
HDDNormal                        36.3
HDDDeparture                        M
CDD                                 M
CDDNormal                         0.0
CDDDeparture                        M
GDD                                 M
Name: 0, dtype: object

In [276]:
df_cornell.iloc[0,1]

'M'

In [277]:
df_cornell['Year'] = pd.DatetimeIndex(df_cornell['Date']).year
df_cornell['Month'] = pd.DatetimeIndex(df_cornell['Date']).month
df_cornell['Day'] = pd.DatetimeIndex(df_cornell['Date']).day

In [278]:
for col in df_cornell.columns:
    print(col,'percent NaN:', df_cornell[col].isna().sum()/len(df_cornell))

Date percent NaN: 0.0
MaxTemperature percent NaN: 0.0
MaxTemperatureNormal percent NaN: 0.0
MaxTemperatureDeparture percent NaN: 0.0
MinTemperature percent NaN: 0.0
MinTemperatureNormal percent NaN: 0.0
MinTemperatureDeparture percent NaN: 0.0
AvgTemperature percent NaN: 0.0
AvgTemperatureNormal percent NaN: 0.0
AvgTemperatureDeparture percent NaN: 0.0
AtObsTemperature percent NaN: 0.0
Precipitation percent NaN: 0.0
PrecipitationNormal percent NaN: 0.0
PrecipitationDeparture percent NaN: 0.0
Snowfall percent NaN: 0.0
SnowfallNormal percent NaN: 0.0
SnowfallDeparture percent NaN: 0.0
SnowDepth percent NaN: 0.0
HDD percent NaN: 0.0
HDDNormal percent NaN: 0.0
HDDDeparture percent NaN: 0.0
CDD percent NaN: 0.0
CDDNormal percent NaN: 0.0
CDDDeparture percent NaN: 0.0
GDD percent NaN: 0.0
Year percent NaN: 0.0
Month percent NaN: 0.0
Day percent NaN: 0.0


In [279]:
df_cornell.head(1)

Unnamed: 0,Date,MaxTemperature,MaxTemperatureNormal,MaxTemperatureDeparture,MinTemperature,MinTemperatureNormal,MinTemperatureDeparture,AvgTemperature,AvgTemperatureNormal,AvgTemperatureDeparture,...,HDD,HDDNormal,HDDDeparture,CDD,CDDNormal,CDDDeparture,GDD,Year,Month,Day
0,1940-01-01,M,36.6,M,M,20.8,M,M,28.7,M,...,M,36.3,M,M,0.0,M,M,1940,1,1


In [280]:
for col in df_cornell.columns:
    print(col, len(df_cornell[df_cornell[col].isin(['M'])])/len(df_cornell))

Date 0.0
MaxTemperature 0.10912384716732543
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.10912384716732543
MinTemperature 0.10909090909090909
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.10909090909090909
AvgTemperature 0.10922266139657444
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.10922266139657444
AtObsTemperature 1.0
Precipitation 0.10843214756258235
PrecipitationNormal 0.000691699604743083
PrecipitationDeparture 0.10902503293807642
Snowfall 0.18158761528326745
SnowfallNormal 0.000691699604743083
SnowfallDeparture 0.18214756258234518
SnowDepth 0.2326086956521739
HDD 0.10922266139657444
HDDNormal 0.0
HDDDeparture 0.10922266139657444
CDD 0.10922266139657444
CDDNormal 0.0
CDDDeparture 0.10922266139657444
GDD 0.10922266139657444
Year 0.0
Month 0.0
Day 0.0


In [281]:
for col in df_cornell.columns:
    print(col, len(df_cornell[df_cornell[col].isin(['T'])])/len(df_cornell))

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.12262845849802372
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.056357048748353095
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.03593544137022398
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0


In [282]:
for col in df_cornell.columns:
    print(col, len(df_cornell[df_cornell[col].isin(['S'])])/len(df_cornell))

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.0
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.0
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.0
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0


In [283]:
for col in df_cornell.columns:
    print(col, len(df_cornell[df_cornell[col].isin(['A'])])/len(df_cornell))

Date 0.0
MaxTemperature 0.0
MaxTemperatureNormal 0.0
MaxTemperatureDeparture 0.0
MinTemperature 0.0
MinTemperatureNormal 0.0
MinTemperatureDeparture 0.0
AvgTemperature 0.0
AvgTemperatureNormal 0.0
AvgTemperatureDeparture 0.0
AtObsTemperature 0.0
Precipitation 0.0
PrecipitationNormal 0.0
PrecipitationDeparture 0.0
Snowfall 0.0
SnowfallNormal 0.0
SnowfallDeparture 0.0
SnowDepth 0.0
HDD 0.0
HDDNormal 0.0
HDDDeparture 0.0
CDD 0.0
CDDNormal 0.0
CDDDeparture 0.0
GDD 0.0
Year 0.0
Month 0.0
Day 0.0


# Keep:
* Date, MaxTemperature, MinTemperature, AvgTemperature, AtObsTemperature, Precipitation, Snowfall, SnowDepth,
* Year, Month, Day
# Drop: 
* MaxTemperatureNormal, MaxTemperatureDeparture, MinTemperatureNormal, MinTemperatureDeparture, AvgTemperatureNormal
* AvgTemperatureDeparture, PrecipitationNormal, PrecipitationDeparture, SnowfallNormal, SnowfallDeparture
* HDD, HDDNormal, HDDDeparture, CDD, CDDNormal, CDDDeparture, GDD

In [284]:
df_cornell.shape

(30360, 28)

In [285]:
df_cornell.columns

Index(['Date', 'MaxTemperature', 'MaxTemperatureNormal',
       'MaxTemperatureDeparture', 'MinTemperature', 'MinTemperatureNormal',
       'MinTemperatureDeparture', 'AvgTemperature', 'AvgTemperatureNormal',
       'AvgTemperatureDeparture', 'AtObsTemperature', 'Precipitation',
       'PrecipitationNormal', 'PrecipitationDeparture', 'Snowfall',
       'SnowfallNormal', 'SnowfallDeparture', 'SnowDepth', 'HDD', 'HDDNormal',
       'HDDDeparture', 'CDD', 'CDDNormal', 'CDDDeparture', 'GDD', 'Year',
       'Month', 'Day'],
      dtype='object')

In [286]:
df_cornell = df_cornell.drop(['MaxTemperatureNormal','MaxTemperatureDeparture','MinTemperatureNormal',
                             'MinTemperatureDeparture','AvgTemperatureNormal','AvgTemperatureDeparture',
                             'PrecipitationNormal', 'PrecipitationDeparture','SnowfallNormal', 'SnowfallDeparture',
                             'HDD', 'HDDNormal','HDDDeparture', 'CDD', 'CDDNormal', 'CDDDeparture', 'GDD'], axis = 1).copy()

In [287]:
df_cornell.shape

(30360, 11)

In [288]:
# lowercase all columns
df_cornell.columns = [x.lower() for x in df_cornell.columns]

In [289]:
# T: Trace amount- Less than 0.01" precipitation; less than 0.1" snowfall; less than 1" snow depth.

In [290]:
len(df_cornell[df_cornell['precipitation'] == 'T'])/len(df_cornell)*100

12.262845849802371

In [291]:
df_cornell['precipitation'] = df_cornell['precipitation'].replace('T',0.009)

In [292]:
df_cornell[df_cornell['precipitation'] == 'T'].shape

(0, 11)

In [293]:
len(df_cornell[df_cornell['snowfall'] == 'T'])/len(df_cornell)*100

5.63570487483531

In [294]:
df_cornell['snowfall'] = df_cornell['snowfall'].replace('T',0.09)

In [295]:
df_cornell[df_cornell['snowfall'] == 'T'].shape

(0, 11)

In [296]:
len(df_cornell[df_cornell['snowdepth'] == 'T'])/len(df_cornell)*100

3.5935441370223984

In [297]:
df_cornell['snowdepth'] = df_cornell['snowdepth'].replace('T',0.09)

In [298]:
df_cornell[df_cornell['snowdepth'] == 'T'].shape

(0, 11)

In [299]:
df_cornell.columns

Index(['date', 'maxtemperature', 'mintemperature', 'avgtemperature',
       'atobstemperature', 'precipitation', 'snowfall', 'snowdepth', 'year',
       'month', 'day'],
      dtype='object')

In [300]:
# convert cornell columns to float
df_cornell['precipitation'] = pd.to_numeric(df_cornell['precipitation'], errors='coerce')
df_cornell['snowfall'] = pd.to_numeric(df_cornell['snowfall'], errors='coerce')
df_cornell['snowdepth'] = pd.to_numeric(df_cornell['snowdepth'], errors='coerce')
df_cornell['avgtemperature'] = pd.to_numeric(df_cornell['avgtemperature'], errors='coerce')
df_cornell['maxtemperature'] = pd.to_numeric(df_cornell['maxtemperature'], errors='coerce')
df_cornell['mintemperature'] = pd.to_numeric(df_cornell['mintemperature'], errors='coerce')
df_cornell['year'] = pd.to_numeric(df_cornell['year'], errors='coerce')
df_cornell['month'] = pd.to_numeric(df_cornell['month'], errors='coerce')
df_cornell['day'] = pd.to_numeric(df_cornell['day'], errors='coerce')

# End Getting Cornell Data

___

# Check date range and make sure the same

In [301]:
df_cornell.iloc[0,0]

'1940-01-01'

In [302]:
type(df_cornell.iloc[0,0])

str

In [303]:
df.iloc[0,0]

'1942-01-08'

In [304]:
type(df.iloc[0,0])

str

In [305]:
print(df.date.min())
print(df.date.max())
print(df_cornell.date.min())
print(df_cornell.date.max())

1942-01-08
2023-12-31
1940-01-01
2023-02-13


# Get rid of 2023 data because one df is missing data

In [306]:
df_2023 = df[df['date'] >= '2023-01-01'].copy()
df = df[(df['date'] >= '1943-01-01') & (df['date'] < '2023-01-01')].copy()

In [307]:
df_cornell = df_cornell[(df_cornell['date'] >= '1943-01-01') & (df_cornell['date'] < '2023-01-01')].copy()

In [308]:
print(df.shape)
print(df_cornell.shape)

(28044, 10)
(29220, 11)


### NOAA is missing data or Cornell has duplicate dates

In [309]:
for noaa, cornell in zip(df.date, df_cornell.date):
    if noaa != cornell:
        print(noaa, cornell)
        break

1949-01-01 1945-10-15


In [310]:
print(len(set(df.date)))
print(len(set(df_cornell.date)))

28044
29220


In [311]:
date_diff = set(df_cornell.date).difference(set(df.date))
print(len(date_diff))
print(len(set(df_cornell.date))-len(set(df.date)))

1176
1176


In [312]:
df_cornell[(df_cornell['date']>='1945-10-12') & (df_cornell['date']<'1945-10-22')]

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
2111,1945-10-12,,,,M,,,,1945,10,12
2112,1945-10-13,,,,M,,,,1945,10,13
2113,1945-10-14,,,,M,,,,1945,10,14
2114,1945-10-15,,,,M,,,,1945,10,15
2115,1945-10-16,,,,M,,,,1945,10,16
2116,1945-10-17,,,,M,,,,1945,10,17
2117,1945-10-18,,,,M,,,,1945,10,18
2118,1945-10-19,,,,M,,,,1945,10,19
2119,1945-10-20,,,,M,,,,1945,10,20
2120,1945-10-21,,,,M,,,,1945,10,21


In [313]:
df_cornell.tail(2)

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
30314,2022-12-30,64.0,32.0,48.0,M,0.0,0.0,0.0,2022,12,30
30315,2022-12-31,53.0,35.0,44.0,M,0.32,0.0,0.0,2022,12,31


In [314]:
df[(df['date']>='1945-10-12') & (df['date']<'1947-10-30')]

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
1371,1945-10-12,,,,49.0,,,1945,10,12
1372,1945-10-13,,,,53.0,,,1945,10,13
1373,1945-10-14,,,,48.0,,,1945,10,14


In [315]:
df.tail(2)

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
28398,2022-12-30,0.0,0.0,0.0,45.0,64.0,32.0,2022,12,30
28399,2022-12-31,0.32,0.0,0.0,46.0,53.0,35.0,2022,12,31


In [316]:
# looks like some years are missing from NOAA
diff_years = set(df_cornell.year).difference(set(df.year))
print(diff_years)

{1946, 1947, 1948}


In [317]:
cor_46_47_48 = df_cornell[(df_cornell['date'] >= '1946-01-01') & (df_cornell['date'] < '1949-01-01')].copy()

In [318]:
cor_46_47_48

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
2192,1946-01-01,,,,M,,,,1946,1,1
2193,1946-01-02,,,,M,,,,1946,1,2
2194,1946-01-03,,,,M,,,,1946,1,3
2195,1946-01-04,,,,M,,,,1946,1,4
2196,1946-01-05,,,,M,,,,1946,1,5
...,...,...,...,...,...,...,...,...,...,...,...
3283,1948-12-27,,,,M,,,,1948,12,27
3284,1948-12-28,,,,M,,,,1948,12,28
3285,1948-12-29,,,,M,,,,1948,12,29
3286,1948-12-30,,,,M,,,,1948,12,30


# Looks like there is a lot of missing value so let's check when actual data comes in

___

# Merge all the data and add NOAA's 2023 data and input the same values for cornell and work backwards

In [319]:
## Take 2023 data from NOAA and input the same values for 2023 cornell

In [320]:
df = pd.concat([df, df_2023], ignore_index = True)

In [321]:
df_cornell_2023 = pd.DataFrame(df_2023)

In [322]:
df_cornell_2023['atobstemperature'] = 'M'

In [323]:
df_cornell_2023.head(1)

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day,atobstemperature
28400,2023-01-01,0.03,0.0,0.0,47.0,51.0,33.0,2023,1,1,M


In [324]:
# create a dictionary
# key = old name
# value = new name
dict = {'prcp': 'precipitation',
        'snow': 'snowfall',
        'snwd': 'snowdepth',
        'tavg': 'avgtemperature',
        'tmax': 'maxtemperature',
        'tmin': 'mintemperature'}
 
# call rename () method
df_cornell_2023.rename(columns=dict,
          inplace=True)

In [325]:
df_cornell_2023.head(1)

Unnamed: 0,date,precipitation,snowfall,snowdepth,avgtemperature,maxtemperature,mintemperature,year,month,day,atobstemperature
28400,2023-01-01,0.03,0.0,0.0,47.0,51.0,33.0,2023,1,1,M


In [326]:
list(df_cornell.columns)

['date',
 'maxtemperature',
 'mintemperature',
 'avgtemperature',
 'atobstemperature',
 'precipitation',
 'snowfall',
 'snowdepth',
 'year',
 'month',
 'day']

In [327]:
df_cornell_2023 = df_cornell_2023[list(df_cornell.columns)]

In [328]:
df_cornell_2023.head(1)

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
28400,2023-01-01,51.0,33.0,47.0,M,0.03,0.0,0.0,2023,1,1


In [329]:
df_cornell.head(1)

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
1096,1943-01-01,,,,M,,,,1943,1,1


In [330]:
df_cornell = pd.concat([df_cornell, df_cornell_2023], ignore_index = True)

In [331]:
print(df_cornell[df_cornell['date']>='2023-01-01'].shape)
print(df[df['date']>='2023-01-01'].shape)

(365, 11)
(365, 10)


In [332]:
### Based on analyes below
df_cornell.loc[(df_cornell['month'].isin([5,6,7,8,9])) & (df_cornell['snowfall'] == '0.0'), 'snowdepth'] = 0.0
df.loc[(df['month'].isin([5,6,7,8,9])) & (df['snow'] == 0.0), 'snwd'] = 0.0

In [333]:
df2 = pd.merge(df_cornell, df, how = 'left', on = 'date')

In [334]:
print(df.shape)
print(df_cornell.shape)
print(df2.shape)

(28409, 10)
(29585, 11)
(29585, 20)


In [335]:
df2[df2['date']>= '2023-01-01']

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year_x,month_x,day_x,prcp,snow,snwd,tavg,tmax,tmin,year_y,month_y,day_y
29220,2023-01-01,51.0,33.0,47.0,M,0.03,0.0,0.0,2023,1,1,0.03,0.0,0.0,47.0,51.0,33.0,2023.0,1.0,1.0
29221,2023-01-02,47.0,28.0,37.0,M,0.00,0.0,0.0,2023,1,2,0.00,0.0,0.0,37.0,47.0,28.0,2023.0,1.0,2.0
29222,2023-01-03,40.0,29.0,35.0,M,0.42,0.0,0.0,2023,1,3,0.42,0.0,0.0,35.0,40.0,29.0,2023.0,1.0,3.0
29223,2023-01-04,47.0,39.0,41.0,M,0.26,0.0,0.0,2023,1,4,0.26,0.0,0.0,41.0,47.0,39.0,2023.0,1.0,4.0
29224,2023-01-05,46.0,39.0,44.0,M,0.03,0.0,0.0,2023,1,5,0.03,0.0,0.0,44.0,46.0,39.0,2023.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29580,2023-12-27,46.0,40.0,42.0,M,0.13,0.0,0.0,2023,12,27,0.13,0.0,0.0,42.0,46.0,40.0,2023.0,12.0,27.0
29581,2023-12-28,49.0,44.0,46.0,M,1.70,0.0,0.0,2023,12,28,1.70,0.0,0.0,46.0,49.0,44.0,2023.0,12.0,28.0
29582,2023-12-29,47.0,42.0,45.0,M,0.05,0.0,0.0,2023,12,29,0.05,0.0,0.0,45.0,47.0,42.0,2023.0,12.0,29.0
29583,2023-12-30,46.0,35.0,44.0,M,0.01,0.0,0.0,2023,12,30,0.01,0.0,0.0,44.0,46.0,35.0,2023.0,12.0,30.0


___

# Need to make sure I have all the data

In [336]:
leap_years = []
for i in range(1944,2024,4):
    leap_years.append(i)

In [337]:
non_leap_years = []
for i in range(1943,2024):
    if i not in leap_years:
        non_leap_years.append(i)

In [338]:
for year in leap_years:
    print(year,df2[df2['year_x'] == year].shape)

1944 (366, 20)
1948 (366, 20)
1952 (366, 20)
1956 (366, 20)
1960 (366, 20)
1964 (366, 20)
1968 (366, 20)
1972 (366, 20)
1976 (366, 20)
1980 (366, 20)
1984 (366, 20)
1988 (366, 20)
1992 (366, 20)
1996 (366, 20)
2000 (366, 20)
2004 (366, 20)
2008 (366, 20)
2012 (366, 20)
2016 (366, 20)
2020 (366, 20)


In [339]:
for year in non_leap_years:
    if len(df2[df2['year_x'] == year]) != 365:
        print(year,df2[df2['year_x'] == year].shape)

# End

___

In [340]:
df2 = df2.drop(['year_y', 'month_y', 'day_y'], axis = 1)

In [341]:
df2 = df2.rename(columns = {'year_x':'year','month_x':'month','day_x':'day'})

# Fill in data:
## If it's 5/6/7/8 or 9 and precpitation is 0 then make snow and snowfall 0

In [342]:
# avgtemperature, tavg

In [343]:
# df2.loc[(df2['date']== '2010-01-01'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-01'), 'snowdepth'] = 0.0

In [344]:
print(df2[df2['snow'].isna()].shape)
print(df2[df2['snowfall'].isna()].shape)

(4458, 17)
(4417, 17)


In [345]:
df2.loc[(df2['month'].isin([5,6,7,8,9])) & (df2['prcp'] == 0.0) | (df2['precipitation'] == 0.0) , 'snow'] = 0.0
df2.loc[(df2['month'].isin([5,6,7,8,9])) & (df2['prcp'] == 0.0) | (df2['precipitation'] == 0.0) , 'snowfall'] = 0.0

In [346]:
print(df2[df2['snow'].isna()].shape)
print(df2[df2['snowfall'].isna()].shape)

(3175, 17)
(3151, 17)


# Fill in data:
## If it's 4/5/6/7/8 or 9 and snowfall is 0 and avg temp is >= 42.0 then make snowdepth 0

In [347]:
print(df2[df2['snwd'].isna()].shape)
print(df2[df2['snowdepth'].isna()].shape)

(7203, 17)
(5966, 17)


In [348]:
df2.loc[(df2['month'].isin([4,5,6,7,8,9])) & 
        (df2['snow'] == 0.0) | (df2['snowfall'] == 0.0) & 
        (df2['avgtemperature'] >= 42.0) | (df2['tavg'] >= 42.0), 'snwd'] = 0.0

In [349]:
df2.loc[(df2['month'].isin([4,5,6,7,8,9])) & 
        (df2['snow'] == 0.0) | (df2['snowfall'] == 0.0) & 
        (df2['avgtemperature'] >= 42.0) | (df2['tavg'] >= 42.0), 'snowdepth'] = 0.0

In [350]:
print(df2[df2['snwd'].isna()].shape)
print(df2[df2['snowdepth'].isna()].shape)

(3749, 17)
(3078, 17)


In [351]:
# Fill in data
## If it's April 20 to April 30 and min temp >= 45 and snowfall is 0 then make snowdepth 0

# 2020s
### Overall could use either because there isn't any missing but NOAA is better
* Drop Cornell's avgtemperature
    * Cornell's avgtemperature is must the average of max and min so don't use that because NOAAs is over the entire day
* Drop Cornell's atobstemperature
    * All 'M'
* Drop Cornell's precipitation
    * I filled in T (trace) so not 100% accurate
* Drop Cornell's snowfall
    * I filled in T (trace) so not 100% accurate
* Drop Cornell's snowdepth
    * I filled in T (trace) so not 100% accurate

In [352]:
df2_2020s = df2[df2['date']>= '2020-01-01'].copy()

In [353]:
def same_val(a, b):
    if float(a) == float(b):
        return 1
    else:
        return 0

In [354]:
df2_2020s['max_temp_same'] = df2_2020s[['maxtemperature','tmax']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.max_temp_same.sum()/len(df2_2020s))

1.0


In [355]:
df2_2020s['min_temp_same'] = df2_2020s[['mintemperature','tmin']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.min_temp_same.sum()/len(df2_2020s))

1.0


In [356]:
df2_2020s['avg_temp_same'] = df2_2020s[['avgtemperature','tavg']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.avg_temp_same.sum()/len(df2_2020s))

0.3278576317590691


In [357]:
# Cornell's avgtemperature is must the average of max and min so don't use that because NOAAs is over the entire day
df2_2020s[df2_2020s['tavg'].isna()].shape

(0, 20)

In [358]:
len(df2_2020s[df2_2020s['atobstemperature']=='M'])/len(df2_2020s)

1.0

In [359]:
df2_2020s['prcp_same'] = df2_2020s[['precipitation','prcp']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.prcp_same.sum()/len(df2_2020s))

0.8966461327857632


In [360]:
df2_2020s['snow_same'] = df2_2020s[['snowfall','snow']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.snow_same.sum()/len(df2_2020s))

0.9568788501026694


In [361]:
df2_2020s['snow_depth_same'] = df2_2020s[['snowdepth','snwd']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2020s.snow_depth_same.sum()/len(df2_2020s))

0.9541409993155373


In [362]:
df2_2020s.columns

Index(['date', 'maxtemperature', 'mintemperature', 'avgtemperature',
       'atobstemperature', 'precipitation', 'snowfall', 'snowdepth', 'year',
       'month', 'day', 'prcp', 'snow', 'snwd', 'tavg', 'tmax', 'tmin',
       'max_temp_same', 'min_temp_same', 'avg_temp_same', 'prcp_same',
       'snow_same', 'snow_depth_same'],
      dtype='object')

# End 2020s

___

# Breaking the rest up into decades is too much work so instead focus on the missing data

# 2010s
* avgtemperature/tavg
    * Drop Cornell's avgtemperature
        * Cornell's avgtemperature is must the average of max and min so don't use that because NOAAs is over the entire day
    * Use NOAA's
* maxtemperature/tmax
    * Can use either
* mintemperature/tmin
    * Can use either
* precipitation/prcp
    * Use NOAAs because it's probably more accurate
* snowfall/snow
    * Use NOAAs because it's probably more accurate
* snowdepth/snwd
* atobstemperature
    * Drop: All missing

In [363]:
df2_2010s = df2[(df2['date']>= '2010-01-01') & (df2['date']< '2020-01-01')].copy()

In [364]:
df2_2010s['max_temp_same'] = df2_2010s[['maxtemperature','tmax']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2010s.max_temp_same.sum()/len(df2_2010s))

1.0


In [365]:
df2_2010s['min_temp_same'] = df2_2010s[['mintemperature','tmin']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2010s.min_temp_same.sum()/len(df2_2010s))

1.0


In [366]:
df2_2010s['avg_temp_same'] = df2_2010s[['avgtemperature','tavg']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2010s.avg_temp_same.sum()/len(df2_2010s))

0.07064622124863089


In [367]:
df2_2010s['prcp_same'] = df2_2010s[['precipitation','prcp']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2010s.prcp_same.sum()/len(df2_2010s))

0.8682913472070098


In [368]:
print(df2_2010s[df2_2010s['precipitation'].isna()].shape)
print(df2_2010s[df2_2010s['precipitation']=='M'].shape)
print(df2_2010s[df2_2010s['prcp'].isna()].shape)
print(df2_2010s[df2_2010s['prcp']=='M'].shape)

(0, 21)
(0, 21)
(0, 21)
(0, 21)


In [369]:
df2_2010s['snow_same'] = df2_2010s[['snowfall','snow']].apply(lambda x: same_val(*x), axis = 1)
print(df2_2010s.snow_same.sum()/len(df2_2010s))

0.9422234392113911


In [370]:
print(df2_2010s[df2_2010s['snowfall'].isna()].shape)
print(df2_2010s[df2_2010s['snowfall']=='M'].shape)
print(df2_2010s[df2_2010s['snow'].isna()].shape)
print(df2_2010s[df2_2010s['snow']=='M'].shape)

(0, 22)
(0, 22)
(0, 22)
(0, 22)


In [371]:
print(df2_2010s[df2_2010s['atobstemperature'].isna()].shape)
print(len(df2_2010s[df2_2010s['atobstemperature']=='M'])/len(df2_2010s))

(0, 22)
1.0


# 2010s Missing snowdepth/snwd
* Use this site to get acurate info:
    * https://weatherspark.com/h/d/147224/2010/1/1/Historical-Weather-on-Friday-January-1-2010-at-Bradley-International-Airport-Connecticut-United-States#Figures-Temperature

In [372]:
# ### Based on analyes below
# df_cornell.loc[(df_cornell['month'].isin([5,6,7,8,9])) & (df_cornell['snowfall'] == '0.0'), 'snowdepth'] = 0.0
# df.loc[(df['month'].isin([5,6,7,8,9])) & (df['snow'] == 0.0), 'snwd'] = 0.0

# df2.loc[(df2['date']== '2010-01-01'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-01'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-02'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-02'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-03'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-03'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-04'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-04'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-05'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-05'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-06'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-06'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-07'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-07'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-08'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-08'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-09'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-09'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-10'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-10'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-11'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-11'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-11'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-11'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-12'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-12'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-13'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-13'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-13'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-13'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-15'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-15'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-17'), 'snwd'] = 0.01
# df2.loc[(df2['date']== '2010-01-17'), 'snowdepth'] = 0.01
# df2.loc[(df2['date']== '2010-01-19'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-19'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-20'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-20'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-21'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-21'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-22'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-22'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-23'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-23'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-24'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-01-24'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-01-29'), 'snwd'] = 4.7
# df2.loc[(df2['date']== '2010-01-29'), 'snowdepth'] = 4.7
# df2.loc[(df2['date']== '2010-01-30'), 'snwd'] = 4.7
# df2.loc[(df2['date']== '2010-01-30'), 'snowdepth'] = 4.7
# df2.loc[(df2['date']== '2010-01-31'), 'snwd'] = 4.7
# df2.loc[(df2['date']== '2010-01-31'), 'snowdepth'] = 4.7
# df2.loc[(df2['date']== '2010-02-01'), 'snwd'] = 4.7
# df2.loc[(df2['date']== '2010-02-01'), 'snowdepth'] = 4.7
# df2.loc[(df2['date']== '2010-02-02'), 'snwd'] = 4.0
# df2.loc[(df2['date']== '2010-02-02'), 'snowdepth'] = 4.0
# df2.loc[(df2['date']== '2010-02-03'), 'snwd'] = 4.0
# df2.loc[(df2['date']== '2010-02-03'), 'snowdepth'] = 4.0
# df2.loc[(df2['date']== '2010-02-04'), 'snwd'] = 4.0
# df2.loc[(df2['date']== '2010-02-04'), 'snowdepth'] = 4.0
# df2.loc[(df2['date']== '2010-02-05'), 'snwd'] = 3.0
# df2.loc[(df2['date']== '2010-02-05'), 'snowdepth'] = 3.0
# df2.loc[(df2['date']== '2010-02-06'), 'snwd'] = 3.0
# df2.loc[(df2['date']== '2010-02-06'), 'snowdepth'] = 3.0
# df2.loc[(df2['date']== '2010-02-07'), 'snwd'] = 2.0
# df2.loc[(df2['date']== '2010-02-07'), 'snowdepth'] = 2.0
# df2.loc[(df2['date']== '2010-02-08'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-09'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-09'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-10'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-10'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-11'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-12'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-13'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-14'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-15'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-15'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-16'), 'snwd'] = 4.0
# df2.loc[(df2['date']== '2010-02-16'), 'snowdepth'] = 4.0
# df2.loc[(df2['date']== '2010-02-17'), 'snwd'] = 4.0
# df2.loc[(df2['date']== '2010-02-17'), 'snowdepth'] = 4.0
# df2.loc[(df2['date']== '2010-02-18'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-18'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-19'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-19'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-20'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-20'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-21'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-21'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-23'), 'snwd'] = 0.3
# df2.loc[(df2['date']== '2010-02-23'), 'snowdepth'] = 0.3
# df2.loc[(df2['date']== '2010-02-24'), 'snwd'] = 2.6
# df2.loc[(df2['date']== '2010-02-24'), 'snowdepth'] = 2.6
# df2.loc[(df2['date']== '2010-02-25'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-02-25'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-02-26'), 'snwd'] = 0.6
# df2.loc[(df2['date']== '2010-02-26'), 'snowdepth'] = 0.6
# df2.loc[(df2['date']== '2010-02-27'), 'snwd'] = 0.6
# df2.loc[(df2['date']== '2010-02-27'), 'snowdepth'] = 0.6
# df2.loc[(df2['date']== '2010-02-28'), 'snwd'] = 0.1
# df2.loc[(df2['date']== '2010-02-28'), 'snowdepth'] = 0.1
# df2.loc[(df2['date']== '2010-03-01'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-01'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-03-02'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-03'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-04'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-05'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-06'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-06'), 'snowdepth'] = 0.0
# df2.loc[(df2['date']== '2010-03-15'), 'snwd'] = 0.0
# df2.loc[(df2['date']== '2010-03-15'), 'snowdepth'] = 0.0

In [377]:
df2_2010s[(df2_2010s['date']>='2010-12-10')&(df2_2010s['date']<='2010-12-31')]

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,...,snow,snwd,tavg,tmax,tmin,max_temp_same,min_temp_same,avg_temp_same,prcp_same,snow_same
24815,2010-12-10,25.0,6.0,15.5,M,0.009,0.09,0.0,2010,12,...,0.0,,,25.0,6.0,1,1,0,0,0
24816,2010-12-11,42.0,16.0,29.0,M,0.0,0.0,0.0,2010,12,...,0.0,,,42.0,16.0,1,1,0,1,1
24817,2010-12-12,56.0,22.0,39.0,M,3.34,0.0,0.0,2010,12,...,0.0,,,56.0,22.0,1,1,0,1,1
24818,2010-12-13,50.0,24.0,37.0,M,0.13,0.3,0.09,2010,12,...,0.3,,,50.0,24.0,1,1,0,1,1
24819,2010-12-14,24.0,14.0,19.0,M,0.009,0.09,0.0,2010,12,...,0.0,,,24.0,14.0,1,1,0,0,0
24820,2010-12-15,23.0,11.0,17.0,M,0.0,0.0,0.0,2010,12,...,0.0,,,23.0,11.0,1,1,0,1,1
24821,2010-12-16,34.0,14.0,24.0,M,0.009,0.09,0.0,2010,12,...,0.0,,,34.0,14.0,1,1,0,0,0
24822,2010-12-17,34.0,10.0,22.0,M,0.0,0.0,0.0,2010,12,...,0.0,,,34.0,10.0,1,1,0,1,1
24823,2010-12-18,38.0,22.0,30.0,M,0.0,0.0,0.0,2010,12,...,0.0,,,38.0,22.0,1,1,0,1,1
24824,2010-12-19,34.0,19.0,26.5,M,0.0,0.0,0.0,2010,12,...,0.0,,,34.0,19.0,1,1,0,1,1


# 1940s

In [101]:
df_40s_dates = ['1940-01-01','1941-01-01','1942-01-01','1943-01-01','1944-01-01','1945-01-01',
               '1946-01-01','1947-01-01','1948-01-01','1949-01-01','1950-01-01']

In [102]:
for start, end in zip(df_40s_dates[0:-1],df_40s_dates[1:]):
    df_40s = df[(df['date'] >= start) & (df['date'] < end)].copy()
    print(start)
    for col in df_40s.columns:
        try:
            if df_40s[col].isna().sum()/len(df_40s) > 0:
                print(col,'percent na: ',df_40s[col].isna().sum()/len(df_40s))
        except:
            continue

1940-01-01
1941-01-01
1942-01-01
1943-01-01
prcp percent na:  1.0
snow percent na:  1.0
snwd percent na:  1.0
tmax percent na:  1.0
tmin percent na:  1.0
1944-01-01
prcp percent na:  1.0
snow percent na:  1.0
snwd percent na:  1.0
tmax percent na:  1.0
tmin percent na:  1.0
1945-01-01
prcp percent na:  1.0
snow percent na:  1.0
snwd percent na:  1.0
tmax percent na:  1.0
tmin percent na:  1.0
1946-01-01
1947-01-01
1948-01-01
1949-01-01
tavg percent na:  1.0
tmax percent na:  0.0027397260273972603


  if df_40s[col].isna().sum()/len(df_40s) > 0:


# NOAA
* Dropped Data for 1940 to 1942
* No data for 1946, 1947, 1948,
* 1943:
    * prcp percent na:  1.0
    * snow percent na:  1.0
    * snwd percent na:  1.0
    * tmax percent na:  1.0
    * tmin percent na:  1.0
* 1944
    * prcp percent na:  1.0
    * snow percent na:  1.0
    * snwd percent na:  1.0
    * tmax percent na:  1.0
    * tmin percent na:  1.0
* 1945
    * prcp percent na:  1.0
    * snow percent na:  1.0
    * snwd percent na:  1.0
    * tmax percent na:  1.0
    * tmin percent na:  1.0
* 1949
    * tavg percent na:  1.0
    * tmax percent na:  0.0027397260273972603

In [103]:
for start, end in zip(df_40s_dates[0:-1],df_40s_dates[1:]):
    df_40s = df_cornell[(df_cornell['date'] >= start) & (df_cornell['date'] < end)].copy()
    print(start)
    for col in df_40s.columns:
        try:
            if df_40s[col].isna().sum()/len(df_40s) > 0:
                print(col,'percent na: ',df_40s[col].isna().sum()/len(df_40s))
        except:
            continue

1940-01-01
1941-01-01
1942-01-01
1943-01-01
1944-01-01
1945-01-01
1946-01-01
1947-01-01
1948-01-01
1949-01-01


  if df_40s[col].isna().sum()/len(df_40s) > 0:


In [104]:
bad_chars = [' S',' M',' T',' A','S','M','T','A']
for start, end in zip(df_40s_dates[0:-1],df_40s_dates[1:]):
    print(start)
    df_40s_cornell = df_cornell[(df_cornell['date'] >= start) & (df_cornell['date'] < end)].copy()
    for col in df_40s_cornell.columns:
        count = 0
        for val in df_40s_cornell[col].to_numpy():
            try:
                val = val.upper()
            except:
                continue
            if any(x in bad_chars for x in val):
                count += 1
        if count > 0:
            print(col,'percent bad:', round(count/len(df_40s_cornell)*100,4))
    del df_40s_cornell

1940-01-01
1941-01-01
1942-01-01
1943-01-01
maxtemperature percent bad: 100.0
mintemperature percent bad: 100.0
avgtemperature percent bad: 100.0
atobstemperature percent bad: 100.0
precipitation percent bad: 100.0
snowfall percent bad: 100.0
snowdepth percent bad: 100.0
1944-01-01
maxtemperature percent bad: 100.0
mintemperature percent bad: 100.0
avgtemperature percent bad: 100.0
atobstemperature percent bad: 100.0
precipitation percent bad: 100.0
snowfall percent bad: 100.0
snowdepth percent bad: 100.0
1945-01-01
maxtemperature percent bad: 100.0
mintemperature percent bad: 100.0
avgtemperature percent bad: 100.0
atobstemperature percent bad: 100.0
precipitation percent bad: 100.0
snowfall percent bad: 100.0
snowdepth percent bad: 100.0
1946-01-01
maxtemperature percent bad: 100.0
mintemperature percent bad: 100.0
avgtemperature percent bad: 100.0
atobstemperature percent bad: 100.0
precipitation percent bad: 100.0
snowfall percent bad: 100.0
snowdepth percent bad: 100.0
1947-01-01


# Cornell
* Dropped Data for 1940 to 1942
* 1943
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1944
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1945
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1946
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1947
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1948
    * maxtemperature percent bad: 100.0
    * mintemperature percent bad: 100.0
    * avgtemperature percent bad: 100.0
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 100.0
    * snowfall percent bad: 100.0
    * snowdepth percent bad: 100.0
* 1949
    * maxtemperature percent bad: 0.8219
    * mintemperature percent bad: 0.5479
    * avgtemperature percent bad: 1.3699
    * atobstemperature percent bad: 100.0
    * precipitation percent bad: 16.4384
    * snowfall percent bad: 5.7534
    * snowdepth percent bad: 2.4658

In [105]:
df.head(1)

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
356,1943-01-01,,,,30.0,,,1943,1,1


In [106]:
df_cornell.head(1)

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
1096,1943-01-01,M,M,M,M,M,M,M,1943,1,1


* NOAA
* Dropped Data for 1940 to 1942
* No data for 1946, 1947, 1948,
* NOAA (tavg)
    * 1943:
        * prcp percent na:  1.0
        * snow percent na:  1.0
        * snwd percent na:  1.0
        * tmax percent na:  1.0
        * tmin percent na:  1.0
* cornell (all bad)
    * 1943
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA (tavg)
    * 1944
        * prcp percent na:  1.0
        * snow percent na:  1.0
        * snwd percent na:  1.0
        * tmax percent na:  1.0
        * tmin percent na:  1.0
* Cornell (all bad)
    * 1944
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA (tavg)
    * 1945
        * prcp percent na:  1.0
        * snow percent na:  1.0
        * snwd percent na:  1.0
        * tmax percent na:  1.0
        * tmin percent na:  1.0
* cornell (all bad)
    * 1945
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA
    * 1946: NO DATA
* cornell (all bad)
    * 1946
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA
    * 1947: NO DATA
* Cornell (all bad)
    * 1947
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA
    * 1948: NO DATA
* Cornell (all bad)
    * 1948
        * maxtemperature percent bad: 100.0
        * mintemperature percent bad: 100.0
        * avgtemperature percent bad: 100.0
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 100.0
        * snowfall percent bad: 100.0
        * snowdepth percent bad: 100.0
* NOAA (prcp,snow,snwd,tmin)
    * 1949
        * tavg percent na:  1.0
        * tmax percent na:  0.0027397260273972603
* cornell
    * 1949
        * maxtemperature percent bad: 0.8219
        * mintemperature percent bad: 0.5479
        * avgtemperature percent bad: 1.3699
        * atobstemperature percent bad: 100.0
        * precipitation percent bad: 16.4384
        * snowfall percent bad: 5.7534
        * snowdepth percent bad: 2.4658

In [108]:
df[(df['tavg'].isna()) & (df['date']>='1940-01-01') & (df['date']<'1950-01-01')].shape

(365, 10)

In [109]:
df[(df['tmax'].isna()) & (df['date']>='1949-01-01') & (df['date']<'1950-01-01')]

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
1395,1949-01-22,0.27,2.0,1.0,,,29.0,1949,1,22


In [110]:
df[(df['tmin'].isna()) & (df['date']>='1949-01-01') & (df['date']<'1950-01-01')].shape

(0, 10)

In [111]:
df[(df['tavg'].isna()) & (df['date']>='1949-01-01') & (df['date']<'1950-01-01')].shape

(365, 10)

In [112]:
df_cornell[(df_cornell['avgtemperature'].isin([' M'])) & (df_cornell['date']>='1949-01-01') & (df_cornell['date']<'1950-01-01')]

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
3309,1949-01-22,M,29,M,M,0.27,2.0,1,1949,1,22
3591,1949-10-31,64,M,M,M,0.5,0.0,0,1949,10,31
3592,1949-11-01,M,40,M,M,0.01,0.0,0,1949,11,1
3616,1949-11-25,48,M,M,M,0.35,0.0,0,1949,11,25
3617,1949-11-26,M,17,M,M,0.0,0.0,0,1949,11,26


In [134]:
df[df['date']=='1949-01-22']

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
1395,1949-01-22,0.27,2.0,1.0,,,29.0,1949,1,22


In [133]:
df_cornell[df_cornell['date']=='1949-01-22']

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
3309,1949-01-22,M,29,M,M,0.27,2.0,1,1949,1,22


In [135]:
(39.9+29)/2

34.45

## 1/29/1949
* Based on this site:
    * https://weatherspark.com/h/d/147224/1949/1/22/Historical-Weather-on-Saturday-January-22-1949-at-Bradley-International-Airport-United-States#Figures-Temperature
    * Max Temp was 39.9
* Both NOAA and Cornell have min at 29
* Therefore Average: (39.9+29)/2 = 34.45

In [151]:
# noaa tmax
df.iloc[1039,5] = 39.9

In [155]:
# noaa tavg
df.iloc[1039,4] = (39.9+29)/2

In [170]:
# cornell maxtemperature
df_cornell.iloc[2213,1]= 39.9

In [177]:
df_cornell.iloc[2213,3] = (39.9+29)/2

## Best Data for 1940s

* 1949 ONLY      
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: No missing
    * tavg (avgtemperature): Need to see if they don't have the same day missing
        * They both have min temp but no max temp to get average for 1/22/1949
        * Updated based on info above
        * No Missing Data
    * tmax (maxtemperature): Need to see if they don't have the same day missing
        * They are both missing data for max temp on 1/22/1949
        * Updated based on info above
        * No Missing Data
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%

# 1950s

In [113]:
df_50s_dates = ['1950-01-01','1951-01-01','1952-01-01','1953-01-01','1954-01-01','1955-01-01',
               '1956-01-01','1957-01-01','1958-01-01','1959-01-01','1960-01-01']

In [114]:
# NOAA
for start, end in zip(df_50s_dates[0:-1],df_50s_dates[1:]):
    df_50s = df[(df['date'] >= start) & (df['date'] < end)].copy()
    print(start)
    for col in df_50s.columns:
        try:
            if df_50s[col].isna().sum()/len(df_50s) > 0:
                print(col,'percent na: ',df_50s[col].isna().sum()/len(df_50s))
        except:
            continue

1950-01-01
snwd percent na:  0.005479452054794521
tavg percent na:  1.0
1951-01-01
tavg percent na:  1.0
1952-01-01
tavg percent na:  1.0
1953-01-01
snwd percent na:  0.0136986301369863
tavg percent na:  1.0
tmax percent na:  0.005479452054794521
tmin percent na:  0.005479452054794521
1954-01-01
tavg percent na:  1.0
1955-01-01
tavg percent na:  1.0
1956-01-01
snow percent na:  0.00273224043715847
snwd percent na:  0.00273224043715847
tavg percent na:  1.0
1957-01-01
snwd percent na:  0.005494505494505495
tavg percent na:  1.0
1958-01-01
tavg percent na:  1.0
1959-01-01
snwd percent na:  0.0027397260273972603
tavg percent na:  1.0


In [115]:
for start, end in zip(df_50s_dates[0:-1],df_50s_dates[1:]):
    df_50s = df_cornell[(df_cornell['date'] >= start) & (df_cornell['date'] < end)].copy()
    print(start)
    for col in df_50s.columns:
        try:
            if df_50s[col].isna().sum()/len(df_50s) > 0:
                print(col,'percent na: ',df_50s[col].isna().sum()/len(df_40s))
        except:
            continue

1950-01-01
1951-01-01
1952-01-01
1953-01-01
1954-01-01
1955-01-01
1956-01-01
1957-01-01
1958-01-01
1959-01-01


In [116]:
bad_chars = [' S',' M',' T',' A','S','M','T','A']
for start, end in zip(df_50s_dates[0:-1],df_50s_dates[1:]):
    print(start)
    df_50s_cornell = df_cornell[(df_cornell['date'] >= start) & (df_cornell['date'] < end)].copy()
    for col in df_50s_cornell.columns:
        count = 0
        for val in df_50s_cornell[col].to_numpy():
            try:
                val = val.upper()
            except:
                continue
            if any(x in bad_chars for x in val):
                count += 1
        if count > 0:
            print(col,'percent bad:', round(count/len(df_50s_cornell)*100,4))
    del df_50s_cornell

1950-01-01
atobstemperature percent bad: 100.0
precipitation percent bad: 16.4384
snowfall percent bad: 10.6849
snowdepth percent bad: 2.1918
1951-01-01
maxtemperature percent bad: 0.274
mintemperature percent bad: 0.274
avgtemperature percent bad: 0.5479
atobstemperature percent bad: 100.0
precipitation percent bad: 18.0822
snowfall percent bad: 7.1233
snowdepth percent bad: 1.6438
1952-01-01
atobstemperature percent bad: 100.0
precipitation percent bad: 15.3005
snowfall percent bad: 7.6503
snowdepth percent bad: 4.0984
1953-01-01
maxtemperature percent bad: 0.5479
mintemperature percent bad: 0.5479
avgtemperature percent bad: 0.5479
atobstemperature percent bad: 100.0
precipitation percent bad: 13.6986
snowfall percent bad: 10.6849
snowdepth percent bad: 3.0137
1954-01-01
atobstemperature percent bad: 100.0
precipitation percent bad: 17.8082
snowfall percent bad: 6.8493
snowdepth percent bad: 6.0274
1955-01-01
atobstemperature percent bad: 100.0
precipitation percent bad: 12.3288
sno

In [117]:
df_cornell.shape

(29220, 11)

* 1950
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.00548% missing Cornell: 2.2% bad
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): Need to see if they don't have the same day missing
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1951
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.00548% missing Cornell: 2.2% bad
    * tavg (avgtemperature): Cornell: 0.5479 missing
    * tmax (maxtemperature): Need to see if they don't have the same day missing
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1952
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: No missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): Cornell: No missing
    * tmin (mintemperature): Cornell: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1953
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.0136% missing, Cornell: 3% missing
    * tavg (avgtemperature): Cornell: 0.5479% missing, NOAA: All missing
    * tmax (maxtemperature): Cornell: 0.5479% missing, NOAA: 0.0055% missing
    * tmin (mintemperature): Cornell: 0.5479% missing, NOAA: 0.0055% missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1954
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.0136% missing, Cornell: 3% missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): Cornell: No missing, NOAA: No missing
    * tmin (mintemperature): Cornell: No missing, NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1955
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.0136% missing, Cornell: 3% missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): Cornell: No missing, NOAA: No missing
    * tmin (mintemperature): Cornell: No missing, NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1956
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: 0.0027% missing, Cornell: 6.0109% missing
    * snwd (snowdepth): NOAA: 0.0027% missing, Cornell: 9.2896% missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): Cornell: No missing, NOAA: No missing
    * tmin (mintemperature): Cornell: No missing, NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1957
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 5.7534% missing, Cornell: 0.005% missing
    * tavg (avgtemperature): Cornell: 0.274% missing, NOAA: All missing
        * Use NOAA max,min to get average for missing days if overlap doesn't work
    * tmax (maxtemperature): NOAA: No missing
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1958
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: No missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): NOAA: No missing
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%
* 1959
    * prcp (precipitation): NOAA: No missing
    * snow (snowfall): NOAA: No missing
    * snwd (snowdepth): NOAA: 0.0027% missing, Cornell: 5.7534% missing
    * tavg (avgtemperature): Cornell: No missing
    * tmax (maxtemperature): NOAA: No missing
    * tmin (mintemperature): NOAA: No missing
    * atobstemperature: cornell: atobstemperature percent bad: 100%

In [121]:
df[(df['snwd'].isna()) & (df['date'] >= '1950-01-01') & (df['date'] < '1960-01-01')].shape

(11, 10)

In [197]:
df[(df['snwd'].isna()) & (df['date'] >= '1950-01-01') & (df['date'] < '1960-01-01')]

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
1973,1950-08-23,0.01,0.0,,,80.0,59.0,1950,8,23
3049,1953-08-03,0.0,0.0,,,77.0,50.0,1953,8,3
3051,1953-08-05,0.0,0.0,,,81.0,60.0,1953,8,5
3078,1953-09-01,0.0,0.0,,,92.0,63.0,1953,9,1
3082,1953-09-05,0.0,0.0,,,87.0,73.0,1953,9,5
3086,1953-09-09,0.0,0.0,,,71.0,49.0,1953,9,9
4057,1956-05-07,0.04,,,,53.0,43.0,1956,5,7
4530,1957-08-23,0.0,0.0,,,79.0,46.0,1957,8,23
4605,1957-11-07,0.0,0.0,,,60.0,31.0,1957,11,7
5361,1959-12-03,0.0,0.0,,,50.0,25.0,1959,12,3


In [204]:
df[df['date'] == '1957-10-03']

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
4570,1957-10-03,0.0,0.0,0.0,,66.0,37.0,1957,10,3


In [127]:
df_cornell[(df_cornell['snowdepth'].isin([' M'])) & (df_cornell['date'] >= '1950-01-01') & (df_cornell['date'] < '1960-01-01')].shape

(12, 11)

In [128]:
df_cornell[(df_cornell['snowdepth'].isin([' M'])) & (df_cornell['date'] >= '1950-01-01') & (df_cornell['date'] < '1960-01-01')]

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
3760,1950-04-18,76,41,58.5,M,0.00,0.0,M,1950,4,18
3887,1950-08-23,80,59,69.5,M,0.01,0.0,M,1950,8,23
4963,1953-08-03,77,50,63.5,M,0.00,0.0,M,1953,8,3
4965,1953-08-05,81,60,70.5,M,0.00,0.0,M,1953,8,5
4992,1953-09-01,92,63,77.5,M,0.00,0.0,M,1953,9,1
4996,1953-09-05,87,73,80.0,M,0.00,0.0,M,1953,9,5
5000,1953-09-09,71,49,60.0,M,0.00,0.0,M,1953,9,9
5971,1956-05-07,53,43,48.0,M,0.04,M,M,1956,5,7
6444,1957-08-23,79,46,62.5,M,0.00,0.0,M,1957,8,23
6484,1957-10-02,M,M,M,M,M,M,M,1957,10,2


In [None]:
### 1950-04-18 Snow depth
* Both datasets show no or Trace snow depth, snow or precipitation 3 days before 4/18/1950
* Both datasets show no or Trace snow depth, snow or precipitation 1 day after 4/18/1950
* Therefore snowdepth will be set to 0

In [179]:
#### 1950-04-18
df[(df['date'] >= '1950-04-11') & (df['date'] <= '1950-04-24')]

Unnamed: 0,date,prcp,snow,snwd,tavg,tmax,tmin,year,month,day
1839,1950-04-11,0.0,0.0,0.0,,53.0,27.0,1950,4,11
1840,1950-04-12,0.0,0.0,0.0,,52.0,29.0,1950,4,12
1841,1950-04-13,0.3,1.0,1.0,,37.0,29.0,1950,4,13
1842,1950-04-14,0.19,1.0,0.0,,50.0,29.0,1950,4,14
1843,1950-04-15,0.0,0.0,0.0,,49.0,28.0,1950,4,15
1844,1950-04-16,0.0,0.0,0.0,,61.0,32.0,1950,4,16
1845,1950-04-17,0.0,0.0,0.0,,69.0,29.0,1950,4,17
1846,1950-04-18,0.0,0.0,,,76.0,41.0,1950,4,18
1847,1950-04-19,0.0,0.0,0.0,,75.0,41.0,1950,4,19
1848,1950-04-20,1.0,0.0,0.0,,56.0,42.0,1950,4,20


In [180]:
#### 1950-04-18
df_cornell[(df_cornell['date'] >= '1950-04-11') & (df_cornell['date'] <= '1950-04-24')]

Unnamed: 0,date,maxtemperature,mintemperature,avgtemperature,atobstemperature,precipitation,snowfall,snowdepth,year,month,day
3753,1950-04-11,53,27,40.0,M,T,0.0,0,1950,4,11
3754,1950-04-12,52,29,40.5,M,T,0.0,0,1950,4,12
3755,1950-04-13,37,29,33.0,M,0.30,1.0,1,1950,4,13
3756,1950-04-14,50,29,39.5,M,0.19,1.0,0,1950,4,14
3757,1950-04-15,49,28,38.5,M,T,T,0,1950,4,15
3758,1950-04-16,61,32,46.5,M,0.00,0.0,0,1950,4,16
3759,1950-04-17,69,29,49.0,M,T,0.0,0,1950,4,17
3760,1950-04-18,76,41,58.5,M,0.00,0.0,M,1950,4,18
3761,1950-04-19,75,41,58.0,M,T,0.0,0,1950,4,19
3762,1950-04-20,56,42,49.0,M,1.00,0.0,0,1950,4,20


In [184]:
df.iloc[1490,3] = 0.0

In [191]:
df_cornell.iloc[2664,7] = 0.0

In [192]:
df_cornell.iloc[2664]

date                1950-04-18
maxtemperature              76
mintemperature              41
avgtemperature            58.5
atobstemperature             M
precipitation             0.00
snowfall                   0.0
snowdepth                  0.0
year                      1950
month                        4
day                         18
Name: 3760, dtype: object