# Groundwater level data exploration

The aim here is to explore the raw data of the groundwater levels in the Lower Saxony State in Germany. 

In [281]:
#Import libraries
import geopandas as gpd
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import os
from functions import *

from IPython.display import Markdown as md

In [282]:
rpath="C:/Users/GomezOspina.M/MGO/" #Root path

## Selection criteria
There is a pre-selection of well stations based on the "climatic hydrograph" method (agreement between the theoretical and observed hydrograph) and a proposal list from previous projects (KIT-BGR). The assessment contains subjective components, based on past analyses. This selection exclude wells under strong anthropogenic incluences such as pumping, allowing to assure a dependencz between the climatic input variables and the groundwater data.

In [283]:
#Import Groundwatwer stations
GW_CD_ID= gpd.read_file(rpath+"/data/GIS/SHP/GWL_CDID.shp")
GW_CD_sel=GW_CD_ID[GW_CD_ID.KLIGL_GRUP.isin([1,12,13])] # selection of stations in good agreement with climatic variables

total_wells= GW_CD_ID.shape[0]

In [284]:
total_wells= GW_CD_ID.shape[0]
sel_wells= GW_CD_sel.shape[0]
md(f"The data consists of a total of {total_wells} wells. \
   <br> After excluding the wells under anthropogenic influence:  {sel_wells} wells.")

The data consists of a total of 962 wells.    <br> After excluding the wells under anthropogenic influence:  745 wells.

### Load groundwater level data according to criteria

In [285]:
gwlist=[]
gidlist=[]
for gid in GW_CD_sel.MEST_ID : # Load data from selected IDs
    gw=readGWdata(gid,rpath +"/data/Grundwasserstandsdaten/Einzelmessstellen/")
    gwlist.append(gw)
    gidlist.append(gid)
    nonan=gw.GW_NN.count()
    index=GW_CD_sel.loc[gw_seld.MEST_ID==gid].index.values[0]
    GW_CD_sel.loc[index, "nonan"]=nonan
gwdata_dic={"wellid":gidlist, "data":gwlist}
gwdata=pd.DataFrame(gwdata_dic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


##  Time range

This is used as a spatial overview of the data length

In [286]:
#Convert monthly measurements to annual assuming no gaps to check the time series length
GW_CD_sel["nonan_yr"]=GW_CD_sel["nonan"]/12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [287]:
#Plot GW stations with data information
germany_states = gpd.read_file(rpath+"/data/GIS/SHP/DEU_adm1.shp")
NS=germany_states[germany_states.NAME_1== "Niedersachsen"]
GW_CD_ID=GW_CD_ID.to_crs(epsg=4326)
GW_CD_sel=GW_CD_sel.to_crs(epsg=4326)
NSmap= NS.boundary.plot( figsize=(8, 8), alpha=0.5, edgecolor='k', linewidth=1)
#GW_CD_ID.plot(ax=NSmap,marker='*', color='c', markersize=8, label="GW")
GW_CD_sel.plot(ax=NSmap,column='nonan_yr',scheme="Quantiles", markersize=GW_CD_sel.nonan_yr.values/5,
         legend=True, label="GW_sel")



<IPython.core.display.Javascript object>

<AxesSubplot:>

## Gaps length

Check the gap lengths to define the interpolation method

In [288]:
gaps_list=[]
count=[]
for i in range(sel_wells):
    gwdatam=gwdata.data[i].dropna(subset='GW_NN', inplace=False)#Drop nan values 
    deltas=gwdatam["DATUM"].diff()[1:]
    gaps = deltas[deltas > timedelta(days=31)]
    gaps_list.append(gaps)
    count.append(len(gaps))
    
gwdata["count"]=count
gwdata["gaps"]=gaps_list
gwdata["max_gap"]=gwdata.gaps.apply(lambda x: x.max().days)

In [289]:
#Estimate gap interval in days and months - 2 months means 1 missing data
gwdata["max_gap"]=gwdata.gaps.apply(lambda x: x.max().days)
gwdata["max_gap_months"]=gwdata.gaps.apply(lambda x: x.max().days/30)

In [290]:
#Plot the maximum gap (in months) 
Fig= plt.figure(figsize=(10,5))
gwdata.max_gap_months.plot(marker='.', color="c", linewidth=0.6)
plt.ylabel("Maximum months of missing data per well")
plt.grid(True, alpha=0.2)

<IPython.core.display.Javascript object>

In [291]:
#Descriptive statistics
gwdata.max_gap_months.describe()

count    504.000000
mean       6.831878
std       16.190440
min        1.966667
25%        2.066667
50%        2.066667
75%        4.100000
max      220.200000
Name: max_gap_months, dtype: float64

In [292]:
gwwell=gwdata.data[35]
Fig= plt.figure(figsize=(10,2))
plt.plot(gwwell["DATUM"], gwwell["GW_NN"])
plt.grid(True, alpha=0.2)

<IPython.core.display.Javascript object>

### Filter data with gaps longer than 4 months

In [293]:
gwdata[gwdata["max_gap"]<30*4]

Unnamed: 0,wellid,data,count,gaps,max_gap,max_gap_months
0,9610009,MEST_ID JAHR MONAT DATUM HJAHR H...,8,761 61 days 763 61 days 766 61 days 776 ...,62.0,2.066667
1,9610709,MEST_ID JAHR MONAT DATUM HJAHR H...,9,332 92 days 350 59 days 763 61 days 776 ...,92.0,3.066667
2,9610749,MEST_ID JAHR MONAT DATUM HJAHR H...,3,763 61 days 776 62 days 799 61 days Name...,62.0,2.066667
5,9610875,MEST_ID JAHR MONAT DATUM HJAHR H...,5,758 59 days 763 61 days 788 62 days 790 ...,62.0,2.066667
6,9610879,MEST_ID JAHR MONAT DATUM HJAHR H...,8,612 92 days 623 91 days 760 61 days 763 ...,92.0,3.066667
...,...,...,...,...,...,...
711,40000327,MEST_ID JAHR MONAT DATUM HJAHR ...,2,"631 61 days 633 61 days Name: DATUM, dtype...",61.0,2.033333
719,400060421,MEST_ID JAHR MONAT DATUM HJAHR ...,4,422 90 days 819 59 days 848 62 days 855 ...,90.0,3.000000
732,9700203,MEST_ID JAHR MONAT DATUM HJAHR H...,2,"349 62 days 814 61 days Name: DATUM, dtype...",62.0,2.066667
733,9700208,MEST_ID JAHR MONAT DATUM HJAHR H...,4,793 62 days 811 61 days 814 61 days 816 ...,62.0,2.066667
