## Reading CSV files with pandas and some simple DataFrame manipulations

In [None]:
%matplotlib inline

### pathlib
import pathlib

### scipy
import numpy as np
import pandas as pd

### plotting
from matplotlib import pyplot as plt
import matplotlib

In [None]:
import pathlib

In [None]:
# Notebook-wide plotting defaults: figure size (inches), base font size and
# line width, applied in a single call.
plt.rcParams.update({
    'figure.figsize': (10,5),
    'font.size': 14,
    'lines.linewidth': 2,
})

#### Daily temperature data for 30 NZ stations from https://data.mfe.govt.nz/table/105056-daily-temperature-1909-2019/ 

In [None]:
# Directory holding the station temperature CSV read in the next cell.
# NOTE: this is an absolute, machine-specific path -- point it at your own
# copy of the data before running the notebook.
dpath = pathlib.Path('/media/nicolasf/END19101/data/NZ_station_data/temperature/')

#### Read the data from the CSV file, setting the index (the labels along the rows) to be the first column (indexing starts at 0 in Python)

In [None]:
# Load the daily temperature table; index_col=0 makes the first CSV column
# the row index.
data = pd.read_csv(dpath / 'daily-temperature-1909-2019.csv', index_col=0)

In [None]:
# Preview the first rows of the raw table to check columns and index.
data.head()

In [None]:
# Distinct labels found in the row index (one per station/location, which is
# how the index is used when selecting a location further down).
location_names = data.index.unique()

In [None]:
# Display the available location labels.
location_names

In [None]:
# Number of distinct locations in the file.
len(location_names)

### Select a location (index label) and a statistic (a value of the `statistic` column)

In [None]:
# Index label of the station to extract; must be one of `location_names`.
location = 'Hamilton'

In [None]:
# Value to match in the `statistic` column when filtering the station rows.
statistic = 'Maximum'

In [None]:
# Keep the rows of the chosen station (index label) whose `statistic` column
# equals the requested statistic.
# `@statistic` lets `query` read the Python variable directly instead of
# splicing its value into the expression string, so a value containing a
# quote character can no longer break (or alter) the query.
df = data.loc[location, :].query("statistic == @statistic")

In [None]:
# Preview the filtered rows for the selected location and statistic.
df.head()

In [None]:
# Summary of the columns, their dtypes and non-null counts before re-indexing.
df.info()

In [None]:
# Replace the (constant) location index with the `date` column, so that rows
# are labelled by date from here on.
df = df.set_index('date')

In [None]:
# Check the result: `date` should now appear as the index.
df.head()

In [None]:
# Inspect the index type and remaining columns after set_index.
df.info()

In [None]:
# Show the index as read from the CSV, before any datetime conversion
# (presumably still plain strings at this point -- check the dtype shown).
df.index

In [None]:
# Parse the index labels into timestamps; a datetime index is what the
# resampling and year-based slicing further down operate on.
df.index = pd.to_datetime(df.index)

In [None]:
# Show the index again to confirm the conversion done by pd.to_datetime.
df.index

In [None]:
# Keep only the `temperature` column; the double brackets select a list of
# columns, so the result stays a one-column DataFrame rather than a Series.
df = df[['temperature']]

In [None]:
# Preview the cleaned daily series (datetime index, single temperature column).
df.head()

In [None]:
# Quick-look line plot of the daily temperature values against the date index.
df.plot()

### Resampling: take the maximum of the daily temperatures within each calendar month

In [None]:
# Group the daily rows into one-month bins and keep the largest value of
# each bin.
# NOTE(review): recent pandas releases deprecate the 'M' frequency alias in
# favour of 'ME' (month end) -- confirm against the pandas version in use
# before changing it.
df_monthly = df.resample('1M').max()

In [None]:
# Plot the monthly maxima.
df_monthly.plot()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the monthly maxima.
df_monthly.describe()

In [None]:
# Same summary for the daily series, for comparison with the monthly one.
df.describe()

### Calculate the climatological 90th percentile for each calendar month (the value exceeded only 10% of the time)

In [None]:
# Reference ("climatological") period used to derive the thresholds.
# Slicing a datetime index with year strings is label-based and keeps every
# row from the start of 1990 through the end of 2020 that is present.
clim = df_monthly.loc['1990':'2020',:]

In [None]:
# Group the reference-period rows by the month number of their timestamp and
# take the 0.90 quantile of each group: one threshold row per calendar month.
clim_q90 = clim.groupby(clim.index.month).quantile(0.90)

In [None]:
# Display the per-month thresholds (indexed by month number).
clim_q90

In [None]:
# Plot the thresholds against month number to see their annual cycle.
clim_q90.plot()

### Repeat these monthly values along the whole time series

In [None]:
# For every timestamp of df_monthly, look up the threshold row of its month
# number; the result has one row per row of df_monthly, in the same order.
# NOTE: this rebinds `clim_q90`, so the cell is not safe to re-run on its
# own -- re-run the groupby/quantile cell first, otherwise the lookup is done
# on the already-expanded frame.
clim_q90 = clim_q90.loc[df_monthly.index.month,:]

In [None]:
# Preview the expanded thresholds (still indexed by month number).
clim_q90.head()

In [None]:
# Row count of the expanded thresholds; compare with len(df_monthly) below.
len(clim_q90)

In [None]:
# Row count of the monthly series; it must match the expanded thresholds for
# the positional assignment that follows.
len(df_monthly)

In [None]:
# Attach the thresholds as a new `q90` column. The raw values are used
# (not the Series itself) because clim_q90 is indexed by month number rather
# than by date, so the assignment has to be positional, row for row.
df_monthly['q90'] = clim_q90['temperature'].values

In [None]:
# Check that each month now carries its temperature and its q90 threshold.
df_monthly.head()

### we can now do some comparisons 

In [None]:
# Element-wise comparison: a boolean Series that is True for the months whose
# temperature is at or above the threshold of their calendar month.
df_monthly.loc[:,'temperature'] >= df_monthly.loc[:,'q90']

In [None]:
# Temperatures of the months that reach or exceed their monthly threshold;
# the boolean mask picks the rows, 'temperature' picks the column.
hot_months = df_monthly.loc[df_monthly['q90'] <= df_monthly['temperature'], 'temperature']

In [None]:
# Preview the selected months (only rows that passed the comparison remain).
hot_months.head()

In [None]:
# Number of months at or above their q90 threshold.
len(hot_months)

### Use the `reindex` method to create a continuous time series (NaN, i.e. Not a Number, will be inserted for the months that are missing)

In [None]:
# Put the selected months back on the full monthly index; months that were
# filtered out get NaN, so the series has one entry per month again.
hot_months = hot_months.reindex(df_monthly.index)

In [None]:
# Draw markers only (linewidth=0): one dot per month that met the threshold,
# with no connecting line across the NaN gaps.
hot_months.plot(marker='o', linewidth=0)