In [22]:
import pandas as pd
import numpy as np


In [23]:
# Load the daily weather data file for the Oklahoma Mesonet.
# A relative path can be handed straight to read_csv, so no separate
# path-navigation module is needed; this is only practical when the data
# directory sits close to the notebook.
mesonet_csv = '../Datasets/ok_mesonet_8_apr_2019.csv'
df = pd.read_csv(mesonet_csv)


In [24]:
# Preview the first five rows of the dataset.
# Some cells show up blank; those need to be converted to NaN before use.
df.head(n=5)

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
0,ACME,Acme,OK,34.81,-98.02,2019,4,15,15,20,...,,,,,,,,,,
1,ADAX,Ada,OK,34.8,-96.67,2019,4,15,15,20,...,40.0,,,S,12.0,20.0,1011.13,78.0,48.0,
2,ALTU,Altus,OK,34.59,-99.34,2019,4,15,15,20,...,39.0,,82.0,SSW,19.0,26.0,1007.86,82.0,45.0,
3,ALV2,Alva,OK,36.71,-98.71,2019,4,15,15,20,...,32.0,,82.0,S,20.0,26.0,1004.65,84.0,40.0,
4,ANT2,Antlers,OK,34.25,-95.67,2019,4,15,15,20,...,35.0,,,S,11.0,20.0,1013.64,78.0,38.0,


In [25]:
# Before replacing anything, look at the raw content of one of the
# blank-looking cells (row label 0, column 'RAIN').
df.at[0, 'RAIN']

' '

In [14]:
# Replace the single-space placeholder cells (' ') with NaN.
# replace() returns a new DataFrame, which is bound back to `df`. The result
# is the same as passing inplace=True, but the frame that earlier cells
# displayed is not modified in place, which is the style pandas recommends.
df = df.replace(' ', np.nan)
df

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
0,ACME,Acme,OK,34.81,-98.02,2019,4,15,15,20,...,,,,,,,,,,
1,ADAX,Ada,OK,34.80,-96.67,2019,4,15,15,20,...,40,,,S,12,20,1011.13,78,48,
2,ALTU,Altus,OK,34.59,-99.34,2019,4,15,15,20,...,39,,82,SSW,19,26,1007.86,82,45,
3,ALV2,Alva,OK,36.71,-98.71,2019,4,15,15,20,...,32,,82,S,20,26,1004.65,84,40,
4,ANT2,Antlers,OK,34.25,-95.67,2019,4,15,15,20,...,35,,,S,11,20,1013.64,78,38,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,WILB,Wilburton,OK,34.90,-95.35,2019,4,15,15,20,...,39,,,S,19,29,1012.43,77,42,
116,WIST,Wister,OK,34.98,-94.69,2019,4,15,15,20,...,39,,,S,12,23,1013.15,77,36,
117,WOOD,Woodward,OK,36.42,-99.42,2019,4,15,15,20,...,23,,83,S,25,32,1004.84,86,46,
118,WYNO,Wynona,OK,36.52,-96.34,2019,4,15,15,20,...,36,,,SSW,24,33,1007.98,80,46,


## Important note

You can replace missing values with some other value, but unless you have a sound method for estimating the missing data, it is better to mark them as `NaN`. Never replace missing rainfall data with `0`. A value of zero means that the rain gauge recorded no rainfall. A missing value means that the rain gauge was not working that day, so we don't know whether the rainfall was 0 or not.

In [26]:
# Build a boolean mask that is True only where the station ID equals 'ACME',
# then use it to pull out the matching row(s).
idx_acme = df.loc[:, 'STID'] == 'ACME'
df.loc[idx_acme]

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
0,ACME,Acme,OK,34.81,-98.02,2019,4,15,15,20,...,,,,,,,,,,


In [27]:
# Find all stations whose ID contains the letter A anywhere in it.
# NOTE: despite its name, this mask is a "contains" match, not a
# "starts with" match; the name is kept so that any other cell referring
# to it keeps working.
# regex=False treats 'A' as a literal substring, and na=False maps missing
# IDs to False so the mask is strictly boolean and safe to index with.
idx_starts_with_A = df['STID'].str.contains('A', regex=False, na=False)
df[idx_starts_with_A]

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
0,ACME,Acme,OK,34.81,-98.02,2019,4,15,15,20,...,,,,,,,,,,
1,ADAX,Ada,OK,34.8,-96.67,2019,4,15,15,20,...,40.0,,,S,12.0,20.0,1011.13,78.0,48.0,
2,ALTU,Altus,OK,34.59,-99.34,2019,4,15,15,20,...,39.0,,82.0,SSW,19.0,26.0,1007.86,82.0,45.0,
3,ALV2,Alva,OK,36.71,-98.71,2019,4,15,15,20,...,32.0,,82.0,S,20.0,26.0,1004.65,84.0,40.0,
4,ANT2,Antlers,OK,34.25,-95.67,2019,4,15,15,20,...,35.0,,,S,11.0,20.0,1013.64,78.0,38.0,
5,APAC,Apache,OK,34.91,-98.29,2019,4,15,15,20,...,41.0,,,S,23.0,29.0,1008.9,80.0,49.0,
6,ARD2,Ardmore,OK,34.19,-97.09,2019,4,15,15,20,...,41.0,,,S,18.0,26.0,1011.43,77.0,50.0,
7,ARNE,Arnett,OK,36.07,-99.9,2019,4,15,15,20,...,10.0,,85.0,SW,22.0,32.0,1005.13,,,
8,BEAV,Beaver,OK,36.8,-100.53,2019,4,15,15,20,...,9.0,,84.0,SW,17.0,26.0,1003.9,91.0,34.0,
11,BLAC,Blackwell,OK,36.75,-97.25,2019,4,15,15,20,...,38.0,,,SSW,15.0,23.0,1007.02,80.0,44.0,


In [40]:
# Boolean indexing: keep stations whose name contains BOTH 'Blackwell' AND 'Lake'.
# regex=False matches the words literally; na=False turns missing names into
# False so the combined mask contains no NaN and can be used for indexing.
idx = (
    df['NAME'].str.contains('Blackwell', regex=False, na=False)
    & df['NAME'].str.contains('Lake', regex=False, na=False)
)
df[idx]

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
22,CARL,Lake Carl Blackwell,OK,36.15,-97.29,2019,4,15,15,20,...,36,,80,S,17,25,1007.56,80,50,


In [42]:
# Boolean indexing: keep stations whose name contains EITHER 'Blackwell' OR 'Lake'.
# regex=False matches the words literally; na=False turns missing names into
# False so the combined mask contains no NaN and can be used for indexing.
idx = (
    df['NAME'].str.contains('Blackwell', regex=False, na=False)
    | df['NAME'].str.contains('Lake', regex=False, na=False)
)
df[idx]

Unnamed: 0,STID,NAME,ST,LAT,LON,YR,MO,DA,HR,MI,...,RELH,CHIL,HEAT,WDIR,WSPD,WMAX,PRES,TMAX,TMIN,RAIN
11,BLAC,Blackwell,OK,36.75,-97.25,2019,4,15,15,20,...,38,,,SSW,15,23,1007.02,80,44,
22,CARL,Lake Carl Blackwell,OK,36.15,-97.29,2019,4,15,15,20,...,36,,80.0,S,17,25,1007.56,80,50,
