In [1]:
#The goal is to enable interact to check different beam-series combinations, et al
#Interact variables: beam_name, series_name, start_date, end_date, quantile_cutoff
#Read '../data/out/df.pkl' file as input and matplotlib graphs as output

In [2]:
%matplotlib widget

In [3]:
%matplotlib widget
# %matplotlib inline
# %matplotlib qt 

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import folium
import geopandas as gpd
from shapely.geometry import shape, GeometryCollection
from pyproj import CRS
from tqdm.notebook import tqdm
from tqdm import trange
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

import ee
try:
    ee.Initialize()
except Exception as e:
    ee.Authenticate()
    ee.Initialize()  
    
#import geemap.eefolium as emap
import geemap as emap
create_map = lambda: emap.Map()

In [4]:
import seaborn as sb

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
from collections import Counter

In [5]:
df = pd.read_pickle('../data/out/df.pkl')
df2 = pd.read_pickle('../data/out/df2.pkl')
df3 = pd.read_pickle('../data/out/df3.pkl')

In [6]:
start_date = '2018-09-01'
end_date = '2019-09-01'
quantile_cutoff = 0.05

In [7]:
df

Unnamed: 0,track_id,date,beam,series,lon,lat,h,water_occurrence,filter
0,180,2019-01-09,gt3r,Noise,89.622678,23.727912,3.698254,-0.050000,"180, gt3r, Noise"
1,180,2019-01-09,gt3r,Noise,89.622680,23.727912,-30.436510,-0.050000,"180, gt3r, Noise"
2,180,2019-01-09,gt3r,Noise,89.622685,23.727913,-103.526760,-0.050000,"180, gt3r, Noise"
3,180,2019-01-09,gt3r,Noise,89.622679,23.727918,-19.456028,-0.050000,"180, gt3r, Noise"
4,180,2019-01-09,gt3r,Noise,89.622678,23.727918,-9.291720,-0.050000,"180, gt3r, Noise"
...,...,...,...,...,...,...,...,...,...
4150038,1239,2020-03-16,gt1l,High,89.751855,23.721827,-47.611958,-0.049445,"1239, gt1l, High"
4150039,1239,2020-03-16,gt1l,High,89.751852,23.721802,-47.792526,-0.049445,"1239, gt1l, High"
4150040,1239,2020-03-16,gt1l,High,89.751844,23.721725,-47.442990,-0.049841,"1239, gt1l, High"
4150041,1239,2020-03-16,gt1l,High,89.751837,23.721668,-47.488422,-0.049955,"1239, gt1l, High"


In [8]:
df.beam.unique()

array(['gt3r', nan, 'gt3l', 'gt2r', 'gt2l', 'gt1r', 'gt1l'], dtype=object)

In [9]:
df.series.unique()

array(['Noise', nan, 'Buffer', 'Low', 'Medium', 'High'], dtype=object)

In [10]:
def reg(beam_name = 'gt3r', series_name = 'High', start_date = '2018-09-01', end_date = '2019-09-01', quantile_cutoff = 0.05):
    columns = ['track_id','beam', 'series', 'm', 'c', 'R2', 'mse']
    result_df = pd.DataFrame(columns = columns)
    dict_df ={}
    for b in df[df.beam == beam_name].beam.unique():
        new_df = df[df.series != 'Noise']
        for s in new_df[new_df.series == series_name].series.unique():

            #Select only track_idd 622 (Directly passing through the Jamuna river)
            #for track_id in df[df.track_id == '622'].track_id.unique():

            #Run this for a more general analysis    
            for track_id in df.track_id.unique():



                #Select particular beam, series and track_id
                loop_df = df[(df.beam == b) & (df.series == s) & (df.track_id == track_id)]
                if loop_df.empty: continue

                #Filter dates
                loop_df = loop_df[(loop_df['date'] >= start_date) & (loop_df['date'] < end_date)]
                if loop_df.empty: continue

                #Remove water occurance <0
                #loop_df = loop_df[loop_df.water_occurrence >= 0]
                #if loop_df.empty: continue

                #Filter off extreme quantile data
                h_lowlim = loop_df.h.quantile(quantile_cutoff)
                h_uplim = loop_df.h.quantile(1 - quantile_cutoff)
                if(h_uplim < h_lowlim): h_lowlim, h_uplim = h_uplim, h_lowlim
                water_lowlim = loop_df.water_occurrence.quantile(quantile_cutoff)
                water_uplim = loop_df.water_occurrence.quantile(1 - quantile_cutoff)
                if(water_uplim < water_lowlim): water_lowlim, water_uplim = water_uplim, water_lowlim

                loop_df = loop_df[(loop_df['h'] > h_lowlim) & (loop_df['h'] < h_uplim)]
                loop_df = loop_df[(loop_df['water_occurrence'] > water_lowlim) & (loop_df['water_occurrence'] < water_uplim)]
                #Run linear regression model
                linreg = LinearRegression()
                X = loop_df[['h']]
                y = loop_df[['water_occurrence']]
                linreg.fit(X,y)
                y_pred = linreg.predict(X)
                mse = sklearn.metrics.mean_squared_error(y,y_pred)


                dict_df = {'track_id': track_id, 'beam': b, 'series': s, 'm': linreg.coef_, 'c':linreg.intercept_ , 'R2': linreg.score(X,y), 'mse': mse}
                print(dict_df)

                print(b, s)
                %matplotlib inline
                plt.scatter(X,y)
                plt.plot(X,y_pred, color='red')
                plt.show()

                print(linreg.coef_)
                print(linreg.intercept_)
                print(linreg.score(X,y))
                print(mse)
                print("")
                print("")

                result_df = result_df.append(dict_df, ignore_index = True)
                
    result_df.to_pickle('../data/out/regression_beam_series_1819_phase2.pkl')
        
        
    return result_df

In [11]:
beam_name = df.beam.unique()

In [12]:
series_name = df.series.unique()

In [13]:
result_df = interact(reg, beam_name=beam_name, series_name = series_name, start_date = '2018-09-01', end_date = '2019-09-01', quantile_cutoff = (0.05, 0.30));

interactive(children=(Dropdown(description='beam_name', options=('gt3r', nan, 'gt3l', 'gt2r', 'gt2l', 'gt1r', …

In [14]:
result_df = pd.read_pickle('../data/out/regression_beam_series_1819_phase2.pkl')

In [15]:
result_df.sort_values(by = 'R2', ascending = False).head()

Unnamed: 0,track_id,beam,series,m,c,R2,mse
1,622,gt3r,High,[[-0.02142038181292886]],[-0.9813992657636514],0.596464,0.003401
0,180,gt3r,High,[[-0.02715907747399894]],[-1.2931768341015952],0.378171,0.003171
3,1239,gt3r,High,[[-0.005256167142818094]],[-0.27460992767532544],0.171013,0.000859
2,797,gt3r,High,[[-0.0012660662660418278]],[-0.10273856749350488],0.069981,5.6e-05


In [16]:
#result_df.to_pickle('../data/out/regression_beam_series_1819_phase2.pkl')

In [17]:
df[(df.track_id == '622') & (df.series != 'Noise')].describe()

Unnamed: 0,lon,lat,h,water_occurrence
count,614736.0,614736.0,614736.0,614736.0
mean,89.75616,24.143898,-46.481287,0.029811
std,0.080926,0.255725,11.328714,0.089228
min,89.591613,23.721186,-322.58496,-0.050001
25%,89.686804,23.93197,-49.100185,-0.049842
50%,89.75864,24.150286,-47.259292,-0.029745
75%,89.816936,24.341411,-43.795147,0.149189
max,89.946789,24.618351,341.48462,0.15


In [18]:
result_df.sort_values(by = 'mse', ascending = True).head()

Unnamed: 0,track_id,beam,series,m,c,R2,mse
2,797,gt3r,High,[[-0.0012660662660418278]],[-0.10273856749350488],0.069981,5.6e-05
3,1239,gt3r,High,[[-0.005256167142818094]],[-0.27460992767532544],0.171013,0.000859
0,180,gt3r,High,[[-0.02715907747399894]],[-1.2931768341015952],0.378171,0.003171
1,622,gt3r,High,[[-0.02142038181292886]],[-0.9813992657636514],0.596464,0.003401
