### The acquired raw data has gaps and outliers. Every station is plotted against time in order to pick an interval in which all of the stations have a satisfactory number of data points, while keeping in mind the trade-off with the minimum data length needed for later analysis. The data is later used in kriging to make predictions from predictors such as latitude, longitude, distance to the coast, height, etc.

In [None]:
import geopandas as gpd
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr

In [None]:
# Root folder of the analysis data. The trailing slash matters because file
# names are appended to this string directly.
base = '/Users/lisac/Documents/Data_analysis/'
fp_csv = f'{base}Chile data/cr2_daily_precep_2017.csv'
fp_metadata = f'{base}Chile data/meta_for_big_data.xlsx'

# Load the metadata sheet without a header row, so rows and columns are
# addressed by integer labels, then preview the first seven rows.
metadata = pd.read_excel(fp_metadata, header=None)
metadata.head(7)

### Bounding box to extract rainfall stations

In [None]:
# Boolean mask with one entry per metadata column. Column 0 is always kept;
# every other column is kept only when its row-5 value lies strictly between
# -42 and -37 and its row-6 value lies strictly between -74 and -72.
# NOTE(review): rows 5 and 6 presumably hold latitude and longitude -- confirm
# against the metadata sheet.
box_coords = [True]
for col in range(1, len(metadata.columns)):
    station = metadata[col]
    # `and` short-circuits, so row 6 is only parsed when row 5 is in range.
    inside = (-42. < float(station[5]) < -37.) and (-74. < float(station[6]) < -72.)
    box_coords.append(inside)

In [None]:
# Translate the boolean mask into the labels of the selected columns of the
# metadata table.
cols_boxindex = metadata.columns[np.asarray(box_coords, dtype=bool)]

In [None]:
# Re-read the metadata sheet, this time loading only the columns selected by
# the bounding-box mask.
metadata1 = pd.read_excel(
    fp_metadata,
    header=None,
    usecols=cols_boxindex,
)

In [None]:
# Read the semicolon-separated rainfall table, loading only the columns
# selected by the bounding-box mask. The first selected column becomes the
# index and the string '-9999' is treated as a missing value.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; as before, malformed rows are skipped and
# a warning is emitted for each of them.
rain_raw = pd.read_csv(
    fp_csv,
    header=None,
    on_bad_lines='warn',
    sep=';',
    skiprows=0,
    usecols=cols_boxindex,
    index_col=0,
    na_values='-9999',
    low_memory=False,
)
# Replace every comma with a point (literal match, no regex) in all columns.
# NOTE(review): presumably the file uses decimal commas -- confirm; the
# `.str` accessor requires every column to hold strings.
rain_raw = rain_raw.apply(lambda col: col.str.replace(',', '.', regex=False))

### Preparing for plotting

In [None]:
# Skip the first 15 rows and keep an independent copy of the remainder.
# NOTE(review): the first 15 rows are presumably per-station header/metadata
# rows that precede the daily values -- confirm against the CSV.
just_rain_raw = rain_raw.iloc[15:].copy()

In [None]:
# Keep only the columns (stations) that have at least 1000 non-missing values.
just_rain_raw_drop = just_rain_raw.dropna(axis=1, thresh=1000)

# Remember the original column labels, then number the remaining stations 1..N.
names = just_rain_raw_drop.columns
n_stations = len(names)
just_rain_raw_drop.columns = list(range(1, n_stations + 1))

# Convert every column to a numeric dtype.
just_rain_raw_drop = just_rain_raw_drop.apply(pd.to_numeric)

# Replace the index with consecutive calendar days starting on 1900-01-01.
# NOTE(review): this assumes the remaining rows are one per day with no gaps,
# starting on that date; rows skipped while reading the CSV would shift every
# later date -- confirm.
n_days = len(just_rain_raw_drop)
just_rain_raw_drop.index = pd.date_range(start='1900-01-01', periods=n_days, freq='D')

### Plotting of timeseries durations

In [None]:
# Separate deep copy used only for plotting, so that overwriting its values
# leaves the numeric rainfall table untouched.
just_rain_raw_drop_plotting = just_rain_raw_drop.copy(deep=True)

In [None]:
# Replace every available measurement with the station's running number
# (1..N), so that each station can be drawn as a row of dots at its own
# y-value; missing values stay NaN.
# A single `.loc` assignment is used instead of chained indexing
# (`df[i][mask] = i`), which is not guaranteed to write to the frame and is a
# silent no-op when pandas Copy-on-Write is active.
for number, column in enumerate(just_rain_raw_drop_plotting.columns, start=1):
    has_data = just_rain_raw_drop_plotting[column].notna()
    just_rain_raw_drop_plotting.loc[has_data, column] = number

In [None]:
# Draw every column as black dots from 1935 onwards on one large figure.
ax = just_rain_raw_drop_plotting.loc['1935':].plot(
    legend=False,
    style='.',
    figsize=(20, 12),
    color='k',
    grid=True,
)

# Mark the three picked days with vertical blue lines.
xposition = [pd.to_datetime(day) for day in ('2014-09-23', '2014-09-24', '2014-09-25')]
for picked_day in xposition:
    ax.axvline(x=picked_day, color='b', linestyle='-')

ax.set_xlabel('time', size=20)
ax.set_ylabel('station number', size=20)

# Label for the marked days, placed to the right of the lines at y=74 inside
# a rounded, light-red box.
label_box = dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8))
ax.text(x='2014-11-30', y=74, s='picked days', size=16, color='b', bbox=label_box)

# Save the figure as a PNG under the kriging figures folder.
fname = f'{base}Chile data/kriging/bottom_catchments/daily_interp/figures/timeseries_durations.png'
ax.figure.savefig(fname, dpi=100, facecolor='w', edgecolor='w', orientation='portrait')