# Arctic Lake Bathymetry
#### Filtering by data variables - ICESat ATL13
#### Melanie Frost
#### 4/18/2023

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import geopy.distance

In [4]:
df = pd.read_csv('data/ICESat_AKNS.csv')
df.head()

Unnamed: 0,cycle,id,size,type,cloud_flag,bkgrd_flag,shallow_flag,wind_flag,rgt,seg_lat,...,qual0,qual1,qual2,qual3,orientation,beg_lat,beg_lon,end_lat,end_lon,sc_orient
0,4,4009051,0,7,0,2,-1,0,183,70.71245,...,109,0,0,0,,70.712665,-160.126315,70.712665,-160.126492,0.0
1,4,4009051,0,7,0,2,-1,0,183,70.711955,...,117,0,0,0,,70.712209,-160.126492,70.712209,-160.126707,0.0
2,4,4009051,0,7,0,2,-1,0,183,70.711497,...,106,0,0,0,,70.711663,-160.126707,70.711663,-160.126835,0.0
3,4,4009051,0,7,0,2,-1,0,183,70.711161,...,110,0,0,0,,70.711338,-160.126835,70.711338,-160.126988,0.0
4,4,4009051,0,7,0,2,-1,0,183,70.710705,...,117,0,0,0,,70.710951,-160.126988,70.710951,-160.127163,0.0


In [5]:
df.shape

(1282216, 27)

In [6]:
#Filter type: lake = 1
df = df.loc[df['type'] == 1]
df.shape

(482262, 27)

In [7]:
#Filter 1-July - 15-Sept
df['start_date'] = pd.to_datetime(df['start_date'])
# df['month'] = df['start_date'].dt.month
# df['day'] = df['start_date'].dt.day
df = df.loc[(df['start_date'].dt.month == 7) | (df['start_date'].dt.month == 8) |
            ((df['start_date'].dt.month == 9) & (df['start_date'].dt.day <= 15))]
df.shape

(358966, 27)

In [8]:
#Filter size (does not = 7)
df = df[df['size'] != 7]
df.shape

(358966, 27)

In [9]:
#Filter cloud (does not = 1)
df = df[df['cloud_flag'] != 1]
df.shape

(268638, 27)

In [10]:
#filter background (does not = 6)
df = df[df['bkgrd_flag'] != 6]
df['bkgrd_flag'] = np.where(df['bkgrd_flag'] > 100, np.NaN, df['bkgrd_flag'])
df.shape

(268638, 27)

In [11]:
#filter shallow <3, >-3
df = df[df['shallow_flag'] != 3]
df = df[df['shallow_flag'] != -3]
df.shape

(268548, 27)

In [12]:
#filter wind <3, >-3
df = df[df['wind_flag'] != 3]
df = df[df['wind_flag'] != -3]
df.shape

(268428, 27)

In [13]:
#filter snow_ice flag 2 (snow), 3 (ice)
#for some reason, vast majority of points are labelled as 1 = "snow-free land"
df = df[df['snow_ice_flag'] < 2]
df.shape

(263303, 27)

In [14]:
#filter wave height > 2

df = df[df['wave_flag'] < 2]
df.shape

(248439, 27)

In [15]:
#filter bottom surface anomalies
#= 1 = Subsurface anomaly due to bottom likely
#= 2 = Subsurface signal may indicate bottom or otheranomaly
#= 3 = Possible subsurface anomaly 

#I am not going to filter as majority of points are 3 or NaN
# df = df[df['anomalies'] != 3]

#set NaN
df['anomalies'] = np.where(df['anomalies'] > 12.5, np.NaN, df['anomalies'])
# df.shape

In [16]:
#filter depth > 40m
##This dropped a lot of points, including NaN points
df = df[df['depth'] < 40]
df.shape

(115655, 27)

In [17]:
#quality flag - >90% of points must be nominal
df['quality_metric'] = df['qual0']/(df['qual0'] + df['qual1'] + df['qual2'] + df['qual3'])
df = df[df['quality_metric'] > .90]
df.shape

(100414, 28)

In [18]:
#beam, forward orientation: L = weak, R = strong, backward orientation: R = strong, L = weak
# 0='backward', 1 = 'forward', 2 = 'transition'])

#Beams Flying backwards until Sep 7 '19, forwards to May 14 '20, backwards until Jan 15 '21, 
#forwards until Oct 2 '21, backwards until June 9 '22, forwards until Feb 9 '23

#In our time period this means that:
 #July 1, 2019 - Sep 7, 2019 backward: L = strong, R = Weak
 #Sep 7, 2019 - Sep 15 2019 forward L = weak, R = Strong
 #July 1, 2020 - Sep 15, 2020 backward L = strong, R = Weak
 #July 1, 2021 - Sep 15, 2021 forward L = weak, R = Strong
 #July 1, 2022 - Sep 15, 2022 forward L = weak, R = Strong

#Filter out transistion
df = df[df['sc_orient'] <2]
df.shape
    
#split left and right beams
df_l = df.loc[df['beam'].str.endswith('l')]
df_r = df.loc[df['beam'].str.endswith('r')] 

df_r['beam_strength'] = np.where(df_r['sc_orient'] == 0, 'Strong', 'Weak')
df_l['beam_strength'] = np.where(df_l['sc_orient'] == 0, 'Weak', 'Strong')

df = pd.concat([df_r,df_l])
df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_r['beam_strength'] = np.where(df_r['sc_orient'] == 0, 'Strong', 'Weak')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_l['beam_strength'] = np.where(df_l['sc_orient'] == 0, 'Weak', 'Strong')


(100414, 29)

In [20]:
df.columns

Index(['cycle', 'id', 'size', 'type', 'cloud_flag', 'bkgrd_flag',
       'shallow_flag', 'wind_flag', 'rgt', 'seg_lat', 'seg_lon',
       'snow_ice_flag', 'wave_flag', 'depth', 'anomalies', 'beam',
       'start_date', 'qual0', 'qual1', 'qual2', 'qual3', 'orientation',
       'beg_lat', 'beg_lon', 'end_lat', 'end_lon', 'sc_orient',
       'quality_metric', 'beam_strength'],
      dtype='object')

In [54]:
##I've spent a bunch of time trying to get this to work
# #short segment distance
# from math import radians, cos, sin, asin, sqrt

# def haversine(lat1, lon1, lat2, lon2):

#       R = 3959.87433 # this is in miles.  For Earth radius in kilometers use 6372.8 km

#       dLat = radians(lat2 - lat1)
#       dLon = radians(lon2 - lon1)
#       lat1 = radians(lat1)
#       lat2 = radians(lat2)

#       a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
#       c = 2*asin(sqrt(a))

#       return R * c

# # Usage
# lon1 = -103.548851
# lat1 = 32.0004311
# lon2 = -103.6041946
# lat2 = 33.374939

# df['seg_dist'] = (haversine(df['beg_lat'], df['beg_lon'], df['end_lat'], df['end_lon']))


print(geopy.distance.distance((df['beg_lat'].iloc[0], df['beg_lon'].iloc[0]), (df['end_lat'].iloc[0], df['end_lon'].iloc[0])).km)
# df.head()

TypeError: 'method' object is not subscriptable

In [None]:
df.to_csv('data/lakes_clean.csv', sep=',', index = False)