# Data munging

Munging and transformation of the data.

In [None]:
%matplotlib notebook
# from collections import defaultdict
import csv
import json
import pathlib
# import re
from typing import NamedTuple

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Load the station details

In [None]:
# Read the station details from disk. The encoding is passed explicitly
# because JSON text is UTF-8, whereas open() would otherwise fall back to
# the platform's locale encoding.
with open('data/stations.json', 'rt', encoding='utf-8') as f:
    station_details = json.load(f)

# peek at a single entry: index 3 of the list stored under 'Monaghan'
station_details['Monaghan'][3]

Let's load details for a single station programmatically.

In [None]:
# pick the station entry and build the path to its CSV file
emyvale = station_details['Monaghan'][3]
csv_directory = pathlib.Path('data/daily/csvs')
emyvale_filepath = csv_directory / emyvale['filename']

# skip the lines that precede the header row and keep only the two
# columns used in this notebook
# NOTE(review): assumes `header_line_num` is the 1-based line of the header
df = pd.read_csv(
    emyvale_filepath,
    usecols=['date', 'rain'],
    skiprows=emyvale["header_line_num"] - 1,
)

# parse the dates, then derive year, month, and day-of-year columns
df['date'] = pd.to_datetime(df['date'])
df = df.assign(
    year=df['date'].dt.year,
    month=df['date'].dt.month,
    day=df['date'].dt.dayofyear,
)
df.head()

In [None]:
# quick look: the largest rainfall value seen for each day of the year
df.groupby('day')['rain'].max().plot()

We want to look at data from the last decade separately.

In [None]:
# partition the rows at 2008: earlier years vs. 2008 onwards
historic = df.loc[df['year'] < 2008]
last_decade = df.loc[df['year'] >= 2008]
# show the three shapes side by side as a sanity check on the split
last_decade.shape, historic.shape, df.shape

In [None]:
def min_max_mean_rainfall(df, group='day', column='rain'):
    """Return the minimum, maximum and mean of `column` per `group` value.

    This is the name the rest of the notebook calls, and `record_high`
    expects the 'min' column that it produces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the `group` and `column` columns.
    group : str
        Column to group by (default 'day').
    column : str
        Column to aggregate (default 'rain').

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct `group` values, with the columns
        'min', 'max' and 'mean'.
    """
    return df.groupby(group)[column].agg(['min', 'max', 'mean'])


def mean_max_rainfall(df, group='day', column='rain'):
    """Return only the maximum and mean of `column` per `group` value.

    Kept under its original name for backward compatibility; it takes the
    same arguments as `min_max_mean_rainfall` but returns just the 'max'
    and 'mean' columns.
    """
    return df.groupby(group)[column].agg(['max', 'mean'])

In [None]:
# rainfall statistics for each day of the year, years before 2008 only
historic_agg = min_max_mean_rainfall(historic, group='day', column='rain')
historic_agg.head()

In [None]:
# the same per-day statistics, this time for 2008 onwards
last_decade_agg = min_max_mean_rainfall(last_decade, group='day', column='rain')
last_decade_agg.head()

In [None]:
def record_high(df, df2, test='max'):
    """
    Return the rows of `df` whose `test` statistic is higher than in `df2`.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated statistics to check; must contain the `test` column.
    df2 : pandas.DataFrame
        Aggregated statistics to compare against; must contain the
        `test` column.
    test : str
        Which statistic to compare, either 'max' or 'mean'.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` where `df[test] > df2[test]`, with the other
        statistic columns ('min', 'max', 'mean') removed.

    Raises
    ------
    ValueError
        If `test` is neither 'max' nor 'mean'.
    """
    if test not in ('max', 'mean'):
        raise ValueError('unknown test')

    # Line df2 up with df's index so the comparison also works when the two
    # frames do not cover exactly the same groups. A group missing from df2
    # becomes NaN, compares as False, and so is never reported.
    baseline = df2[test].reindex(df.index)

    # Drop the statistics that were not tested, ignoring any that are not
    # present (e.g. a frame that was aggregated without 'min').
    others = [name for name in ('min', 'max', 'mean') if name != test]
    return df[df[test] > baseline].drop(columns=others, errors='ignore')

In [None]:
# days of the year whose past-decade maximum exceeds the historic maximum
rec_high = record_high(last_decade_agg, historic_agg, test='max')
rec_high.shape

In [None]:
from matplotlib.ticker import FuncFormatter, FixedLocator

# figure size in inches: an 8 x 6 base scaled by `multiplier`
sx, sy = 8.0, 6.0
multiplier = 1.25
sx *= multiplier
sy *= multiplier

text_alpha = 0.75
line_alpha = 0.5
plt.figure(figsize=(sx, sy))
ax = plt.gca()

# Historic per-day maximum as a line. Day-of-year values are shifted by
# one so that day 1 sits at x = 0, matching the scatter below; using the
# index instead of a fixed range(366) keeps x and y the same length.
plt.plot(
    historic_agg.index - 1,
    historic_agg['max'],
    color='#8da0cb',
    label='max rainfall 1984--2007',
    linewidth=1,
)

# days on which the past decade set a new record
plt.scatter(
    rec_high.index - 1,
    rec_high['max'],
    color='#fc8d62',
    s=16,
    label='record high in past decade',
)

# Set up the x-axis ticks and labels: one tick at the first day of each
# month. A leading 0 makes the cumulative sum give the offset of each
# month's first day; February is counted as 29 days because the
# day-of-year index runs to 366.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days_in_month = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
month_pos = np.cumsum(days_in_month)[:-1]

# apply the x-axis ticks and labels
ax.set_xticks(month_pos)
ax.set_xticklabels(months)

# set the y-axis label
ax.set_ylabel('Rainfall (mm)', alpha=text_alpha)

# set the title
ax.set_title(
    'Maximum rainfall recorded per day 1984–2007\n'
    'and record rainfall levels per day 2008–2018',
    alpha=text_alpha,
)

# disable the frame on the legend box
leg = ax.legend(frameon=False)

# set the alpha for the text for the x-axis, y-axis, and legend
for t in leg.get_texts():
    t.set_alpha(text_alpha)
for l in ax.yaxis.get_ticklabels():
    l.set_alpha(text_alpha)
for l in ax.xaxis.get_ticklabels():
    l.set_alpha(text_alpha)

# plt.savefig('emyvale-max-rainfall-day.png')

In [None]:
# display the square of the 'lines.markersize' rc setting
# NOTE(review): presumably evaluated for comparison with the explicit `s=`
# values passed to plt.scatter in the plotting cells — confirm
matplotlib.rcParams['lines.markersize'] ** 2

In [None]:
# days of the year whose past-decade mean exceeds the historic mean
rec_high = record_high(df=last_decade_agg, df2=historic_agg, test='mean')

In [None]:
from matplotlib.ticker import FuncFormatter, FixedLocator

# figure size in inches: an 8 x 6 base scaled by `multiplier`
sx, sy = 8.0, 6.0
multiplier = 1.2
sx *= multiplier
sy *= multiplier

text_alpha = 0.75
line_alpha = 0.5
plt.figure(figsize=(sx, sy))
ax = plt.gca()

# Historic per-day mean as a line. Day-of-year values are shifted by one
# so that day 1 sits at x = 0, matching the scatter below; using the
# index instead of a fixed range(366) keeps x and y the same length.
plt.plot(
    historic_agg.index - 1,
    historic_agg['mean'],
    color='#8da0cb',
    label='mean rainfall 1984--2007',
    linewidth=1,
)

# days on which the past-decade mean is above the historic mean
plt.scatter(
    rec_high.index - 1,
    rec_high['mean'],
    color='#fc8d62',
    s=16,
    label='record high in past decade',
)

# Set up the x-axis ticks and labels: one tick at the first day of each
# month. A leading 0 makes the cumulative sum give the offset of each
# month's first day; February is counted as 29 days because the
# day-of-year index runs to 366.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days_in_month = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
month_pos = np.cumsum(days_in_month)[:-1]

# apply the x-axis ticks and labels
ax.set_xticks(month_pos)
ax.set_xticklabels(months)

# set the y-axis label
ax.set_ylabel('Rainfall (mm)', alpha=text_alpha)

# set the title
ax.set_title(
    'Mean rainfall recorded per day 1984–2007\n'
    'and record rainfall levels per day 2008–2018',
    alpha=text_alpha,
)

# disable the frame on the legend box
leg = ax.legend(frameon=False)

# set the alpha for the text for the x-axis, y-axis, and legend
for t in leg.get_texts():
    t.set_alpha(text_alpha)
for l in ax.yaxis.get_ticklabels():
    l.set_alpha(text_alpha)
for l in ax.xaxis.get_ticklabels():
    l.set_alpha(text_alpha)

# plt.savefig('emyvale-mean-rainfall.png')

## Weekly precipitation

Let's reinitialise the `DataFrame`, this time adding a week-of-year column.

In [None]:
df = pd.read_csv(emyvale_filepath, skiprows=emyvale["header_line_num"]-1, usecols=['date', 'rain'])

# format the date, then extract the year, month, day of year, and
# week of year to separate columns
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.dayofyear
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is its replacement. It returns an unsigned
# nullable dtype, so cast back to a plain int64 column.
# NOTE(review): the cast assumes every row has a parseable date — confirm
df['week'] = df['date'].dt.isocalendar().week.astype('int64')
df.head()

In [None]:
# redo the 2008 split on the reloaded frame, then aggregate by week
historic = df.loc[df['year'] < 2008]
last_decade = df.loc[df['year'] >= 2008]
historic_agg = min_max_mean_rainfall(historic, group='week', column='rain')
last_decade_agg = min_max_mean_rainfall(last_decade, group='week', column='rain')
# weeks whose past-decade mean exceeds the historic mean
rec_high = record_high(df=last_decade_agg, df2=historic_agg, test='mean')

In [None]:
from matplotlib.ticker import FuncFormatter, FixedLocator

# figure size in inches: an 8 x 6 base scaled by `multiplier`
sx, sy = 8.0, 6.0
multiplier = 1.25
sx *= multiplier
sy *= multiplier

text_alpha = 0.75
line_alpha = 0.5
plt.figure(figsize=(sx, sy))
ax = plt.gca()

# Plot against the week numbers themselves: the x-axis is labelled
# 'Week of the year', so week 1 must appear at x = 1 rather than at 0.
# Using the index also keeps x and y the same length however many
# distinct weeks the data contains.
plt.plot(
    historic_agg.index,
    historic_agg['mean'],
    color='#8da0cb',
    label='mean rainfall 1984--2007',
    linewidth=1,
)

# weeks in which the past-decade mean is above the historic mean
plt.scatter(
    rec_high.index,
    rec_high['mean'],
    color='#fc8d62',
    s=16,
    label='record high in past decade',
)

# set the x-axis label
ax.set_xlabel('Week of the year', alpha=text_alpha)

# set the y-axis label
ax.set_ylabel('Rainfall (mm)', alpha=text_alpha)

# set the title
ax.set_title(
    'Mean rainfall recorded per week 1984–2007\n'
    'and record rainfall levels per week 2008–2018',
    alpha=text_alpha,
)

# disable the frame on the legend box
leg = ax.legend(frameon=False)

# set the alpha for the text for the x-axis, y-axis, and legend
for t in leg.get_texts():
    t.set_alpha(text_alpha)
for l in ax.yaxis.get_ticklabels():
    l.set_alpha(text_alpha)
for l in ax.xaxis.get_ticklabels():
    l.set_alpha(text_alpha)

# plt.savefig('emyvale-mean-rainfall-week.png')

## Monthly precipitation 

In [None]:
df = pd.read_csv(emyvale_filepath, skiprows=emyvale["header_line_num"]-1, usecols=['date', 'rain'])

# format the date, then extract the year, month, day of year, and
# week of year to separate columns
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.dayofyear
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is its replacement. It returns an unsigned
# nullable dtype, so cast back to a plain int64 column.
# NOTE(review): the cast assumes every row has a parseable date — confirm
df['week'] = df['date'].dt.isocalendar().week.astype('int64')
# show the first few January rows
df[df.month == 1].head()

In [None]:
# redo the 2008 split on the reloaded frame, then aggregate by month
historic = df.loc[df['year'] < 2008]
last_decade = df.loc[df['year'] >= 2008]
historic_agg = min_max_mean_rainfall(historic, group='month', column='rain')
last_decade_agg = min_max_mean_rainfall(last_decade, group='month', column='rain')
# months whose past-decade mean exceeds the historic mean
rec_high = record_high(df=last_decade_agg, df2=historic_agg, test='mean')
rec_high

In [None]:
from matplotlib.ticker import FuncFormatter, FixedLocator

# 8 x 6 inch base figure, enlarged by `multiplier`
multiplier = 1.25
sx, sy = 8.0 * multiplier, 6.0 * multiplier

text_alpha = 0.75
line_alpha = 0.5
plt.figure(figsize=(sx, sy))
ax = plt.gca()

# one x position per month, reused for the bars, the line and the ticks
months = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_pos = range(12)

# historic mean as bars
ax.bar(
    month_pos,
    historic_agg['mean'],
    color='#8da0cb',
    label='mean rainfall 1984--2007',
    linewidth=1,
)
# past-decade mean as a line drawn over the bars
ax.plot(
    month_pos,
    last_decade_agg['mean'],
    color='#fc8d62',
    label='mean rainfall in past decade',
    linewidth=1,
)
# mark the months where the past decade is higher; month numbers start
# at 1, hence the shift onto the 0-based x positions
ax.scatter(
    rec_high.index - 1,
    rec_high['mean'],
    color='#fc8d62',
    s=36,
    label='record high in past decade',
)

# label each x position with its month name
ax.set_xticks(month_pos)
ax.set_xticklabels(months)

# axis label and title, both slightly faded
ax.set_ylabel('Rainfall (mm)', alpha=text_alpha)
ax.set_title(
    'Mean monthly rainfall 1984–2007\n'
    'and record monthly rainfall in past decade',
    alpha=text_alpha,
)

# legend without its surrounding frame
leg = ax.legend(frameon=False)

# fade the legend entries and the tick labels on both axes
for label in [
    *leg.get_texts(),
    *ax.yaxis.get_ticklabels(),
    *ax.xaxis.get_ticklabels(),
]:
    label.set_alpha(text_alpha)

# uncomment to write the figure to disk
# plt.savefig('emyvale-mean-rainfall-month.png')

In [None]:
# monthly aggregates again, this time picking out record maxima
historic_agg = min_max_mean_rainfall(historic, group='month', column='rain')
last_decade_agg = min_max_mean_rainfall(last_decade, group='month', column='rain')
rec_high = record_high(df=last_decade_agg, df2=historic_agg, test='max')

In [None]:
from matplotlib.ticker import FuncFormatter, FixedLocator

# 8 x 6 inch base figure, enlarged by `multiplier`
multiplier = 1.25
sx, sy = 8.0 * multiplier, 6.0 * multiplier

text_alpha = 0.75
line_alpha = 0.5
plt.figure(figsize=(sx, sy))
ax = plt.gca()

# one x position per month, reused for the bars, the line and the ticks
months = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
month_pos = range(12)

# historic maximum as bars
ax.bar(
    month_pos,
    historic_agg['max'],
    color='#8da0cb',
    label='max rainfall 1984--2007',
    linewidth=1,
)
# past-decade maximum as a line drawn over the bars
ax.plot(
    month_pos,
    last_decade_agg['max'],
    color='#fc8d62',
    label='max rainfall in past decade',
    linewidth=1,
)
# mark the months where the past decade is higher; month numbers start
# at 1, hence the shift onto the 0-based x positions
ax.scatter(
    rec_high.index - 1,
    rec_high['max'],
    color='#fc8d62',
    s=36,
    label='record high in past decade',
)

# label each x position with its month name
ax.set_xticks(month_pos)
ax.set_xticklabels(months)

# axis label and title, both slightly faded
ax.set_ylabel('Rainfall (mm)', alpha=text_alpha)
ax.set_title(
    'Maximum monthly rainfall 1984–2007\n'
    'and record monthly rainfall in past decade',
    alpha=text_alpha,
)

# legend without its surrounding frame
leg = ax.legend(frameon=False)

# fade the legend entries and the tick labels on both axes
for label in [
    *leg.get_texts(),
    *ax.yaxis.get_ticklabels(),
    *ax.xaxis.get_ticklabels(),
]:
    label.set_alpha(text_alpha)

# uncomment to write the figure to disk
# plt.savefig('emyvale-max-rainfall-month.png')