# Average PM2.5 for each satellite overpass window

**Terra flyover:** approx. 17-19Z -> 18Z and 19Z in DEQ discrete data

**Aqua flyover:** approx. 19-21Z -> 20Z and 21Z in DEQ discrete data

**Note:** the 'date' column is in UTC, whereas the 'hour' column is in local time (MDT/MST). Datetime is unreliable, so the UTC hour used below is parsed from the 'date' column.

In [1]:
import pandas as pd

In [3]:
# Read the raw observations, loading only the columns this notebook uses.
data = pd.read_csv(
    'Raw_data_12-17.csv',
    usecols=['date', 'rawvalue', 'sitename', 'longitude', 'latitude'],
)

# Preview the first three rows.
data.head(3)

Unnamed: 0,date,latitude,rawvalue,sitename,longitude
0,2012/01/01 17:00:00+00,46.842301,14.995,Missoula,-114.020582
1,2012/01/01 17:00:00+00,47.012907,10.366,Frenchtown,-114.224273
2,2012/01/01 17:00:00+00,46.658762,6.248,Helena,-112.013089


In [17]:
# Keep only rows with a non-negative reading. The comparison evaluates to
# False for missing values, so those rows are dropped along with negatives.
data = data.loc[data.rawvalue.ge(0)]

In [6]:
# separate date column into date and time
def getTime(datetime):
    """Return the hour of a 'date time' timestamp as an integer.

    The value is converted to a string and split on single spaces; the second
    field is treated as the clock time, and the text before its first ':' is
    parsed with int(), e.g. '2012/01/01 17:00:00+00' -> 17.
    """
    fields = str(datetime).split(' ')
    hour_text, _, _ = fields[1].partition(':')
    return int(hour_text)

def getDate(datetime):
    """Return the date part of a 'date time' timestamp with '-' separators.

    The value is converted to a string, everything before the first space is
    kept, and each '/' in it is replaced by '-', e.g.
    '2012/01/01 17:00:00+00' -> '2012-01-01'.
    """
    day, _, _ = str(datetime).partition(' ')
    return '-'.join(day.split('/'))
 
# Derive both columns from the original timestamp strings: 'time' receives the
# integer hour, then 'date' is overwritten with the date-only string.
timestamps = data['date']
data['time'] = timestamps.apply(getTime)
data['date'] = timestamps.apply(getDate)

In [7]:
# distinct date strings found in the table
unique_dates = data['date'].unique()

In [8]:
# get unique ground stations
unique_sitenames = data.sitename.unique()

# Station coordinates keyed by station name. Each station's coordinates are
# taken from its first row in the table. drop_duplicates keeps exactly that
# first row per station, so one pass over the table replaces the repeated
# full-table boolean scans that a per-station lookup would need.
first_rows = data.drop_duplicates(subset='sitename')
lats = dict(zip(first_rows['sitename'], first_rows['latitude']))
longs = dict(zip(first_rows['sitename'], first_rows['longitude']))

In [9]:
# Hours of the 'time' column averaged for each satellite pass, paired with the
# time label appended to the date string. Per the notes at the top of this
# notebook, 18Z/19Z correspond to the Terra flyover and 20Z/21Z to Aqua.
pass_windows = (
    (' 18:00', [18, 19]),
    (' 20:00', [20, 21]),
)

# empty lists for holding averaged data
datetimes = []
sitenames = []
rawvalues = []

# Visit every (date, station) pair once. sort=False yields groups in order of
# first appearance, which matches iterating over .unique() values, without
# rebuilding a full-table boolean mask for every date and every station.
# (groupby skips rows whose key is missing.)
for date, day_rows in data.groupby('date', sort=False):
    for site, subtable in day_rows.groupby('sitename', sort=False):
        for label, hours in pass_windows:
            window_values = subtable.loc[subtable.time.isin(hours), 'rawvalue']
            # mean() of an empty selection is NaN; such rows are still
            # recorded here so every pair contributes one row per window
            rawvalues.append(window_values.mean())
            sitenames.append(site)
            datetimes.append(date + label)

# put data back into dataframe
output_data = pd.DataFrame({'datetime': datetimes,
                            'sitename': sitenames,
                            'rawvalue': rawvalues})

In [10]:
# Look up each row's station coordinates by name and attach them as columns.
station_names = output_data['sitename']
output_data['latitude'] = station_names.map(lats)
output_data['longitude'] = station_names.map(longs)

In [13]:
# A window in which a station reported no observations has a NaN average
# (e.g. when the station has data in only one of the two time windows on a
# given date); drop those rows.
output_data = output_data.loc[output_data.rawvalue.notna()]

In [None]:
# Write the averaged table to disk as CSV.
output_data.to_csv(path_or_buf='PM_averaged.csv')