# 2. Regressão linear: exemplo de estimativa de condições climáticas

O conjunto [weatherww2](https://www.kaggle.com/smid80/weatherww2) é utilizado aqui.

## Introdução

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from google.colab import drive

In [None]:
#@title

# Apply seaborn's default theme to the figures drawn in the cells below.
sns.set()

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames),
            shown left to right in the order given.

    The HTML of every frame is concatenated and each opening ``<table`` tag
    receives an inline ``display:inline`` style so the tables flow
    horizontally instead of stacking.
    """
    html_str = ''.join(df.to_html() for df in args)

    # Only the opening tags are rewritten. Replacing the bare word 'table'
    # would also touch the closing tags and any header or cell text that
    # happens to contain that word.
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

# Mount Google Drive under /content/drive (Colab only); the dataset directory
# used by the next cell lives below this mount point.
drive.mount('/content/drive')

In [None]:
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/ml-notes/datasets/weatherww2'
FILES = sorted(os.listdir(DATA_DIR))

print('Files found:', *FILES, sep='\n  - ')

# One DataFrame per file, unpacked in sorted filename order. The unpacking
# requires the directory to contain exactly two files.
# NOTE(review): presumably `w` is the daily weather table and `s` the station
# table -- confirm against the file list printed above.
w, s = map(pd.read_csv, (os.path.join(DATA_DIR, filename) for filename in FILES))

# Lower-case every column name so the later cells can rely on a single
# spelling (e.g. `w.precip`, `s.elev`).
w.columns = [column.lower() for column in w.columns]
s.columns = [column.lower() for column in s.columns]

In [None]:
# Preview the first rows of both frames side by side (values rounded for display).
display_side_by_side(w.head().round(), s.head().round())

## Pré-processamento e limpeza dos dados


In [None]:
def preprocessing(w, s):
    """Clean the weather frame and attach station metadata to every record.

    Args:
        w: weather records; must provide the `precip`, `snowfall`, `date`
            and `sta` columns.
        s: station table; must provide the `wban` and `elev` columns.

    Returns:
        A new frame with `w` merged to `s` on `sta == wban`,
        `state/country id` renamed to `location`, and the columns
        `wban`, `windgustspd`, `tshdsbrsgf`, `lat` and `lon` removed.

    Both inputs are modified in place along the way: `w` gets numeric
    `precip`/`snowfall` and a datetime `date`; `s` gets an integer `elev`.
    The merge uses pandas' default join type, so records without a matching
    station do not appear in the result.
    """
    w = parse_unknown_snowfall(replace_trace_by_epsilon(w))

    w['date'] = pd.to_datetime(w['date'])
    s['elev'] = s['elev'].astype('int')

    unused_columns = ['wban', 'windgustspd', 'tshdsbrsgf', 'lat', 'lon']

    merged = w.merge(s, left_on='sta', right_on='wban')
    merged = merged.rename(columns={'state/country id': 'location'})
    return merged.drop(columns=unused_columns)


def replace_trace_by_epsilon(w):
    """Turn the `precip` column of `w` into floats, mapping 'T' to a tiny value.

    Entries equal to the string 'T' become the float32 machine epsilon; every
    other entry is converted with `astype('float')`.
    NOTE(review): 'T' presumably marks trace precipitation -- confirm against
    the dataset description.

    Args:
        w: frame with a `precip` column.

    Returns:
        The same frame object, with `precip` replaced by the float column.
    """
    trace_value = np.finfo(np.float32).eps

    precip = w['precip']
    is_trace = precip == 'T'

    w['precip'] = precip.mask(is_trace, trace_value).astype('float')
    return w


def parse_unknown_snowfall(w):
    """Turn the `snowfall` column of `w` into floats, with '#VALUE!' as NaN.

    Args:
        w: frame with a `snowfall` column.

    Returns:
        The same frame object, with `snowfall` replaced by the float column.
    """
    snowfall = w['snowfall']
    is_unknown = snowfall == '#VALUE!'

    # `mask` without a replacement value fills the selected entries with NaN.
    w['snowfall'] = snowfall.mask(is_unknown).astype('float')
    return w


# Replace `w` by the cleaned frame merged with the station metadata.
# `s` is also modified in place by this call (its `elev` column becomes int).
w = preprocessing(w, s)
w.head()

## Observação das múltiplas características

In [None]:
#@title Características categóricas

def count_sort_and_plot(x, sampling=None):
    """Bar-plot how often each distinct value of `x` occurs, most frequent first.

    Args:
        x: array-like of observations (e.g. a DataFrame column).
        sampling: optional threshold between 0 and 1. When truthy, each
            distinct value is kept only if a fresh uniform random draw is
            `<= sampling`, so roughly that fraction of the bars is shown.
            The selection is random and differs between calls.
    """
    names, observations = np.unique(x, return_counts=True)

    # Descending order of frequency.
    order = np.argsort(observations)[::-1]
    names, observations = names[order], observations[order]

    if sampling:
        keep = np.random.rand(len(order)) <= sampling
        names, observations = names[keep], observations[keep]

    plt.figure(figsize=(16, 4))
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally, and older releases accept the keyword form as well.
    sns.barplot(x=names, y=observations)
    plt.xticks(rotation=90)

In [None]:
# Number of records per `location` code, most frequent first.
count_sort_and_plot(w.location)

In [None]:
# Number of records per `name` value, keeping only a random ~40% of the names
# (presumably to keep the axis readable).
count_sort_and_plot(w.name, sampling=.4)

In [None]:
#@title Localização geográfica e elevação das medições

plt.figure(figsize=(6, 3))

# One point per distinct `location` code, placed by longitude/latitude and
# coloured by elevation.
ax = sns.scatterplot(x='longitude',
                     y='latitude',
                     hue='elev',
                     data=w.drop_duplicates(subset='location'))

# The hue legend is removed from the figure.
ax.get_legend().remove();

In [None]:
#@title Características contínuas

plt.figure(figsize=(12, 3))

# One distribution panel per continuous measurement, missing values excluded.
for i, c in enumerate('precip maxtemp mintemp meantemp'.split()):
    plt.subplot(1, 4, i + 1, title=c)
    try:
        sns.distplot(w[c].dropna())
    except Exception as error:
        # Best effort: a column that cannot be plotted leaves its panel empty,
        # but the reason is reported instead of being silently discarded.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt through.
        print(f'Could not plot {c}: {error}')

plt.tight_layout();

In [None]:
# Frequency of each snowfall value other than zero.
# NOTE(review): NaN entries also satisfy `!= 0`, so they are passed along too.
count_sort_and_plot(w.snowfall[w.snowfall != 0])  # the great majority is 0.0!

## Estimando medições diárias a partir de características locais e sazonais

In [None]:
locations = ['PM', 'IN', 'HI', 'LI', 'AT']

# Records of the selected location codes only.
x = w[w.location.isin(locations)]

plt.figure(figsize=(12, 4))

# A random 5% of the rows is reshaped to long form (one row per date,
# location and temperature measurement) and drawn as one line per location.
sns.lineplot(
    data=x.sample(frac=.05).melt(id_vars=['date', 'location'],
                                 value_vars=['meantemp', 'mintemp', 'maxtemp']),
    x='date',
    y='value',
    hue='location');

Essas diferentes localizações com certeza não podem ser estimadas com um modelo de efeito fixo se as únicas características forem a informação sazonal.  
Acho que escolhi um exemplo muito difícil. Vamos olhar só para uma localização primeiro:

#### Estimando medições do dia seguinte em *Saint Pierre and Miquelon*

In [None]:
LOCATION = 'PM'
FEATURES = ['date', 'precip', 'maxtemp', 'mintemp', 'meantemp', 'snowfall',
            'poorweather', 'yr', 'mo', 'da', 'prcp', 'elev', 'latitude',
            'longitude']

# Records of the chosen location, restricted to the columns listed above.
x = w.loc[w.location == LOCATION, FEATURES]

plt.figure(figsize=(12, 4))
plt.title(f'Temperatures at {LOCATION}')

# A random 5% of the rows in long form: one line per temperature measurement.
sns.lineplot(
    data=x.sample(frac=.05).melt(id_vars=['date'],
                                 value_vars=['meantemp', 'mintemp', 'maxtemp']),
    x='date',
    y='value',
    hue='variable');

Segundo [este site](https://www.timeanddate.com/weather/st-pierre-miquelon/saint-pierre/climate), a temperatura mais fria foi de -2 °C. Essas temperaturas próximas a -20 devem ser ruído.

In [None]:
# Collapse the records to one row per date (sorted by date), averaging each of
# the listed columns. Columns not listed here are not part of the result.
x = x.groupby('date', as_index=False, sort=True).agg(dict.fromkeys(
    ['precip', 'snowfall',
     'maxtemp', 'mintemp', 'meantemp',
     'yr', 'mo', 'da',
     'elev', 'latitude', 'longitude'],
    'mean'))

# Keep only the days whose mean temperature is at least -2; rows with a
# missing `meantemp` fail the comparison and are dropped as well.
x = x[x.meantemp >= -2]

In [None]:
plt.figure(figsize=(12, 4))
plt.title(f'Temperatures at {LOCATION}')

# Same plot as before the filtering, now on a random half of the daily rows.
sns.lineplot(
    data=x.sample(frac=.5).melt(id_vars=['date'],
                                value_vars=['meantemp', 'mintemp', 'maxtemp']),
    x='date',
    y='value',
    hue='variable');

In [None]:
# First rows of the aggregated and filtered frame, rounded for display.
x.head().round()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Eight-colour "husl" palette; `evaluate` below uses it as a context manager.
_palette = sns.color_palette("husl", 8)

def add_features(x):
    """Add scaled calendar columns to `x` in place and return it.

    New columns: `yrp` = yr / 40, `mop` = mo / 12, `dap` = da / 30.

    Args:
        x: frame with `yr`, `mo` and `da` columns.

    Returns:
        The same frame object, with the three columns added.
    """
    # (new column, source column, divisor)
    scaled_columns = (('yrp', 'yr', 40),
                      ('mop', 'mo', 12),
                      ('dap', 'da', 30))

    for new_name, source, divisor in scaled_columns:
        x[new_name] = x[source] / divisor

    return x

def simulate_past_future(x, days=.5):
    """Split `x` positionally into an observed head and an unknown tail.

    Args:
        x: sliceable collection with a length (e.g. a DataFrame).
            Assumes the records are ordered oldest first -- the split is
            purely positional.
        days: when an integer (Python or NumPy), the number of leading
            records that count as observed; otherwise a fraction of
            `len(x)`, truncated towards zero.

    Returns:
        Tuple `(train, test)`: the first part and the remainder of `x`.
        The size of each part is also printed.
    """
    import numbers  # local import so the cell does not depend on new top-level imports

    # `numbers.Integral` also covers NumPy integer scalars, which are not
    # instances of `int` and would otherwise be interpreted as a fraction.
    if isinstance(days, numbers.Integral):
        split = int(days)
    else:
        split = int(len(x) * days)

    train, test = x[:split], x[split:]

    print('Days passed (observed):', len(train))
    print('Days to pass (unknown):', len(test))

    return train, test

def explain(model, features):
    """Print a fitted linear model as a sum of weighted features.

    Args:
        model: object exposing `coef_` and `intercept_` (both must support
            `.round`), e.g. a fitted `LinearRegression`.
        features: feature names, in the order used for fitting.

    Each printed term is the rounded coefficient followed directly by the
    feature name; the rounded intercept comes last. With several targets a
    term shows one coefficient per target.
    """
    print('Explaining the model')

    # Transposed so that iterating yields the coefficient(s) of one feature.
    per_feature = model.coef_.round(2).T
    terms = [f'{weight}{name}' for name, weight in zip(features, per_feature)]

    print('  f :=', '\n     + '.join(terms))
    print('     +', model.intercept_.round(2))

def evaluate(y, p, t):
    """Print the mean absolute error and draw a three-panel report.

    Args:
        y: observed values.
        p: predicted values, aligned with `y`.
            Assumes both are NumPy arrays of equal length (the code computes
            `y - p`) -- TODO confirm for other callers.
        t: text used as the figure title.

    Panels: distributions of observations and predictions (top left),
    histogram of the absolute error (top right), and both series plotted
    against their position in the arrays (bottom).
    """
    mae = metrics.mean_absolute_error(y, p)
    print('MAE:', mae.round(3))

    with _palette:
        fig = plt.figure(figsize=(16, 6), constrained_layout=True)
        fig.suptitle(t)
        grid = fig.add_gridspec(2, 2)

        # The seaborn calls draw on the most recently added subplot.
        distribution_ax = fig.add_subplot(grid[0, 0])
        distribution_ax.set_title('Observations and Predictions Distribution')
        sns.distplot(y, label='ground truth')
        sns.distplot(p, label='predicted')
        plt.legend();

        error_ax = fig.add_subplot(grid[0, 1])
        error_ax.set_title('Error Distribution')
        absolute_error = np.abs(y - p)
        sns.distplot(absolute_error, label='error', kde=False)

        timeline_ax = fig.add_subplot(grid[1, :])
        timeline_ax.set_title('Observations and Predictions over Time')
        positions = np.arange(len(y))
        sns.lineplot(x=positions, y=y, label='ground truth')
        sns.lineplot(x=positions, y=p, label='predicted')

#### 14 dias após o início das medições, feature: dia do mês

In [None]:
# Model inputs: only `dap` (day of month divided by 30, see `add_features`).
FEATURES = ['dap']
# Columns predicted jointly by the regression.
TARGETS = ['mintemp', 'meantemp', 'maxtemp', 'precip']
# Number of leading records treated as already observed (training part).
DAYS = 14

In [None]:
# Work on a copy: `add_features` writes its columns into the frame it receives.
z = add_features(x.copy())

# The first DAYS records are the training part; evaluation uses a random
# sample of 100 records from the remainder.
train, test = simulate_past_future(z, days=DAYS)
test_sampled = test.sample(n=100)

lr = LinearRegression()
lr.fit(train[FEATURES], train[TARGETS])
predictions = lr.predict(test_sampled[FEATURES])

In [None]:
# Print the fitted coefficients (per feature) and the intercept of `lr`.
explain(lr, FEATURES)

In [None]:
# One report per target: the transposes turn the (samples, targets) arrays
# into one row of observations / predictions per target column.
# The loop index was never used, so `enumerate` is dropped.
for t, y, p in zip(TARGETS, test_sampled[TARGETS].values.T, predictions.T):
    print(t, end=' ')
    evaluate(y, p, t)

#### Um ano após o início das medições, feature: dia do mês, mês do ano

In [None]:
# Model inputs: scaled day of month and scaled month (see `add_features`).
FEATURES = ['dap', 'mop']
# Columns predicted jointly by the regression.
TARGETS = ['mintemp', 'meantemp', 'maxtemp', 'precip']
# Number of leading records treated as already observed (training part).
DAYS = 365

In [None]:
# Work on a copy: `add_features` writes its columns into the frame it receives.
z = add_features(x.copy())

# The first DAYS records are the training part; evaluation uses a random
# sample of 100 records from the remainder.
train, test = simulate_past_future(z, days=DAYS)
test_sampled = test.sample(n=100)

lr = LinearRegression()
lr.fit(train[FEATURES], train[TARGETS])
predictions = lr.predict(test_sampled[FEATURES])

In [None]:
# Print the fitted coefficients (per feature) and the intercept of `lr`.
explain(lr, FEATURES)

In [None]:
# One report per target: the transposes turn the (samples, targets) arrays
# into one row of observations / predictions per target column.
# The loop index was never used, so `enumerate` is dropped.
for t, y, p in zip(TARGETS, test_sampled[TARGETS].values.T, predictions.T):
    print(t, end=' ')
    evaluate(y, p, t)

**Para pensar:** a linha se tornou muito mais comportada ao adicionarmos mais dados. O que aconteceu aqui?

De qualquer forma, nós não estamos indo bem nos pontos com temperaturas fora do usual (que era exatamente o interesse no período de guerra). Podemos melhorar adicionando informação local (as temperaturas dos dias anteriores) para predizer as próximas temperaturas:

### Adição da informação do dia anterior

#### Enriquecimento, Definições e Treinamento

In [None]:
def add_features(x):
    """Add scaled calendar columns and previous-row temperatures to `x`.

    New columns, written into `x` itself:
      * `yrp` = yr / 40, `mop` = mo / 12, `dap` = da / 30
      * `maxtemp_y`, `mintemp_y`, `meantemp_y`: the value of the matching
        temperature column in the preceding row.
        NOTE(review): "preceding row" equals "previous day" only if the rows
        are consecutive days -- confirm after the filtering done earlier.

    Args:
        x: frame with `yr`, `mo`, `da`, `maxtemp`, `mintemp` and `meantemp`.

    Returns:
        A new frame without any row containing a missing value in any
        column (this always removes the first row, which has no
        predecessor).
    """
    for new_name, source, divisor in (('yrp', 'yr', 40),
                                      ('mop', 'mo', 12),
                                      ('dap', 'da', 30)):
        x[new_name] = x[source] / divisor

    for temperature in ('maxtemp', 'mintemp', 'meantemp'):
        x[f'{temperature}_y'] = x[temperature].shift(1)

    return x.dropna()

In [None]:
# Model inputs: scaled calendar position plus the previous row's temperatures.
FEATURES = ['dap', 'mop', 'mintemp_y', 'meantemp_y', 'maxtemp_y']
# The fitting and evaluation cells of this experiment read `TARGETS` (plural).
# Defining it here makes the experiment self-contained instead of silently
# reusing the value left behind by an earlier cell.
TARGETS = ['mintemp', 'meantemp', 'maxtemp', 'precip']
# Alias kept for any code that refers to the singular name.
TARGET = TARGETS
# Number of leading records treated as already observed (training part).
DAYS = 365

In [None]:
# Work on a copy: `add_features` writes its columns into the frame it receives.
z = add_features(x.copy())

# The first DAYS records are the training part; evaluation uses a random
# sample of 100 records from the remainder.
train, test = simulate_past_future(z, days=DAYS)
test_sampled = test.sample(n=100)

lr = LinearRegression()
lr.fit(train[FEATURES], train[TARGETS])
predictions = lr.predict(test_sampled[FEATURES])

In [None]:
# Print the fitted coefficients (per feature) and the intercept of `lr`.
explain(lr, FEATURES)

#### Avaliação

In [None]:
# One report per target: the transposes turn the (samples, targets) arrays
# into one row of observations / predictions per target column.
# The loop index was never used, so `enumerate` is dropped.
for t, y, p in zip(TARGETS, test_sampled[TARGETS].values.T, predictions.T):
    print(t, end=' ')
    evaluate(y, p, t)

### Análise recorrente

Características: as medições locais de 7 dias, a 7 dias de distância do dia atual (previsão com 1 semana de antecedência).

#### Definições, enriquecimento e treinamento

In [None]:
# Per-step inputs of the recurrent model.
FEATURES = ['dap', 'mop', 'mintemp', 'meantemp', 'maxtemp', 'precip']
# The scaling, model and evaluation cells read `TARGETS` (plural). Defining it
# here makes the experiment self-contained instead of silently reusing the
# value left behind by an earlier cell.
TARGETS = ['mintemp', 'meantemp', 'maxtemp', 'precip']
# Alias kept for any code that refers to the singular name.
TARGET = TARGETS
# Index that separates the training/validation rows from the test rows.
DAYS = 2 * 365

# Rows per input window and step between the starts of consecutive windows.
LENGTH = 7
STRIDE = 7
BATCH_SIZE = 128

# Upper bound on the number of training epochs.
EPOCHS = 400

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on a copy: `add_features` writes its columns into the frame it receives.
z = x.copy()
z = add_features(z)

# Fit the scalers on the training span only, so that the statistics of the
# validation and test periods do not leak into the inputs or targets seen
# during training. The span mirrors the `end_index` (7*DAYS//10, inclusive)
# given to the training generator in the next cell -- keep the two in sync.
_train_rows = 7 * DAYS // 10 + 1

zs = StandardScaler().fit(z[FEATURES].iloc[:_train_rows])
ts = StandardScaler().fit(z[TARGETS].iloc[:_train_rows])

# The whole series is transformed with the training statistics.
ze = zs.transform(z[FEATURES])
te = ts.transform(z[TARGETS])

In [None]:
# Three chronological splits over the same scaled arrays:
#   train -> rows up to index 7*DAYS//10
#   valid -> from that index up to index DAYS
#   test  -> from index DAYS to the end
# NOTE(review): per the Keras documentation each sample is a window of LENGTH
# consecutive rows whose target is the row right after the window, and
# `stride` only spaces the window starts -- confirm that this matches the
# intended one-week-ahead forecast.
_window_options = dict(length=LENGTH, stride=STRIDE, batch_size=BATCH_SIZE)
_split_index = 7 * DAYS // 10
_make_generator = tf.keras.preprocessing.sequence.TimeseriesGenerator

train = _make_generator(ze, te, end_index=_split_index, **_window_options)

valid = _make_generator(ze, te,
                        start_index=_split_index,
                        end_index=DAYS,
                        **_window_options)

test = _make_generator(ze, te, start_index=DAYS, **_window_options)

In [None]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import LSTM, Dense

def create_model():
    """Build and compile the recurrent weather estimator.

    Architecture: input windows of shape (LENGTH, len(FEATURES)) -> LSTM with
    128 units -> two ReLU dense layers of 1024 units -> linear dense layer
    with one output per entry of TARGETS.

    Returns:
        The compiled Keras model, trained on mean squared error and
        reporting MSE and MAE. No optimizer is passed, so Keras' default
        is used.
    """
    inputs = Input(shape=[LENGTH, len(FEATURES)])

    hidden = LSTM(128, name='rc1')(inputs)
    for layer_name in ('fc1', 'fc2'):
        hidden = Dense(1024, activation='relu', name=layer_name)(hidden)

    outputs = Dense(len(TARGETS), name='predictions')(hidden)

    model = Model(inputs=inputs, outputs=outputs, name='weather_estimator')
    model.compile(loss='mse', metrics=['mse', 'mae'])
    return model

# Instantiate the compiled, still untrained network.
nn = create_model()

In [None]:
# Draw the layer graph with tensor shapes, laid out left to right.
tf.keras.utils.plot_model(nn, show_shapes=True, rankdir='LR')

In [None]:
# Train silently for at most EPOCHS epochs.
nn.fit(
    train,
    # Validate on the dedicated `valid` split. Early stopping and the
    # best-only checkpoint both select on the validation data, so using `test`
    # here would tune the model on the very data it is later evaluated on.
    validation_data=valid,
    epochs=EPOCHS,
    verbose=0,
    callbacks=[
        # Stop once 100 epochs pass without improvement of the monitored value.
        tf.keras.callbacks.EarlyStopping(patience=100),
        # Keep only the best weights seen so far in 'weather.h5'.
        tf.keras.callbacks.ModelCheckpoint('weather.h5', save_best_only=True)
    ]);

In [None]:
# Training history as a frame: one row per epoch, one column per logged
# metric, plus the epoch number as an explicit column.
h = pd.DataFrame(nn.history.history).assign(epoch=lambda frame: frame.index)

# Training MSE and MAE against the epoch number.
sns.lineplot(
    data=h.melt(id_vars=['epoch'], value_vars=['mse', 'mae']),
    x='epoch',
    y='value',
    hue='variable');

In [None]:
# Restore the weights written to 'weather.h5' by the checkpoint callback.
nn.load_weights('weather.h5')

In [None]:
# Targets of every test batch, stacked in generator order and mapped back
# from the standardised scale to the original units.
test_targets = np.concatenate([batch_targets for _, batch_targets in test])
test_targets = ts.inverse_transform(test_targets)

# Predictions for the same batches, mapped back the same way.
predictions = ts.inverse_transform(nn.predict(test))

# To evaluate on a random ~10% subset instead, index both arrays with the
# mask `np.random.rand(len(test_targets)) < .1`.

#### Avaliação

In [None]:
# One report per target: the transposes turn the (samples, targets) arrays
# into one row of observations / predictions per target column.
# The loop index was never used, so `enumerate` is dropped.
for t, y, p in zip(TARGETS, test_targets.T, predictions.T):
    print(t, end=' ')
    evaluate(y, p, t)