In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Make the project's `src` directory importable from this notebook.
# Project layout: see http://drivendata.github.io/cookiecutter-data-science/
import os
import sys

# The path is built from the current working directory, so the notebook is
# expected to run from a directory that sits next to `src`.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')

# Guard the append so that re-running this cell does not stack duplicate
# entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
import pickle

import numpy as np
import pandas as pd

# Seed NumPy's global RNG for Keras; keep this ahead of the project imports below, which load Keras
np.random.seed(4)

from data.data import get_data, noise_bagging
from features.features import Features

from model.model import split_train_test

Using Theano backend.


In [4]:
# Run configuration read by the cells below.

ts_steps = 4            # number of time steps used for the RNN
nestCount_radius = 200  # radius [km] used to count the nests of neighbouring sites
krill_radius = 100      # radius within which krill data is reported for each nest
padding = 1             # sets the area used to compute the sea ice feature

In [5]:
# Load the data for the configured number of time steps (columns t0..t3 in the
# preview for ts_steps = 4). Per the original note, the time series is
# assembled as the relative difference between successive steps.
# `scaler` is returned alongside the frame and is pickled in the last cell.
df_features, scaler = get_data(ts_steps)

# Preview the first rows (5 is the default for head()).
df_features.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y_true,inferred_y_true,inferred_t,t0,t1,t2,t3,countError
site_id,species,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACUN,adelie penguin,1983,1.0,True,True,,1.0,1.0,1.0,0.707107
ACUN,chinstrap penguin,1983,1.0,True,False,,1.0,1.0,1.0,0.707107
ADAM,adelie penguin,1983,1.0,True,True,,1.0,1.0,1.0,0.707107
ADAR,adelie penguin,1983,1.0331,True,False,,1.0,1.0,1.0,0.196116
AILS,chinstrap penguin,1983,1.0,True,False,,1.0,1.0,1.0,0.707107


In [6]:
# Drop every row that still contains a NaN. In the preview above these are the
# first-year rows (1983), whose t0 has no value.
# Rebinding instead of `inplace=True` leaves the object returned by get_data()
# untouched and keeps the cell free of in-place mutation.
df_features = df_features.dropna()

In [7]:
# Add the features to the DataFrame.
# The settings from the configuration cell are passed positionally.
# NOTE(review): the order (krill_radius, nestCount_radius, padding) cannot be
# checked from this notebook -- confirm against Features.__init__.
# The recorded output of this cell shows distance matrices being computed and
# cached under data/interim/, so its first run can be slow.
features = Features(krill_radius, nestCount_radius, padding)
# df_features is rebound to the result: re-running only this cell feeds the
# already-augmented frame back in -- presumably not intended, so re-run from
# the get_data() cell instead.
df_features = features.add_features(df_features)

# Preview the augmented table.
df_features.head()


Loading sea ice distMat from data/interim/
Computing sea ice distMat and caching result in data/interim/
This can take a while.. (apologies for computing this via brute force)
Done.
Computing krill distMat and caching result in data/interim/
This can take a while.. (apologies for computing this via brute force)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y_true,inferred_y_true,inferred_t,t0,t1,t2,t3,countError,adelie penguin,chinstrap penguin,...,sea_ice_month_3,sea_ice_month_4,sea_ice_month_5,sea_ice_month_6,sea_ice_month_7,sea_ice_month_8,sea_ice_month_9,sea_ice_month_10,sea_ice_month_11,krill
site_id,species,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ACUN,adelie penguin,1984,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.0,0.0,0.007222,0.177222,0.0,0.0,0.064167,0.141944,0.147778,1.051298
ACUN,adelie penguin,1985,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.006667,0.0,0.0,0.060278,0.059444,0.016111,0.0,0.14,0.165556,5.788702
ACUN,adelie penguin,1986,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.141111,0.0,0.000278,0.043889,0.150556,0.065,0.003333,0.171944,0.0,0.0
ACUN,adelie penguin,1987,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.083611,0.0,0.150833,0.168333,0.164722,0.0,0.151389,0.181667,0.181667,0.0
ACUN,adelie penguin,1988,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.158333,0.129444,0.122778,0.0,0.054444,0.158889,0.151389,0.008889,0.0,4.264471


In [8]:
# Partition the feature table into a training and a test set, then display the
# (rows, columns) shape of each as a quick sanity check.
df_train, df_test = split_train_test(df_features)
(df_train.shape, df_test.shape)

((17496, 26), (1296, 26))

In [9]:
# Many values in the original data are missing (inferred), and the truly
# observed values are believed to be more trustworthy. To enlarge the training
# set, noise is added via noise_bagging(), which artificially increases the
# number of observations. The observed subsets are bagged separately, with a
# larger multiplier, to put more emphasis on them.
# The noise_bagging() calls stay in their original order: they presumably draw
# from the NumPy RNG seeded at the top of the notebook -- TODO confirm.

# Rows whose y_true is not flagged as inferred.
df_YObserved = df_train.loc[~df_train['inferred_y_true']]
df_trainNoiseYObserved = noise_bagging(df_YObserved, 5 * len(df_YObserved), strength=0.5)
print('Noise y observed', df_trainNoiseYObserved.shape)

# Rows whose t values are not flagged as inferred.
df_TObserved = df_train.loc[~df_train['inferred_t']]
# NOTE(review): the sample count below is derived from df_YObserved, not from
# df_TObserved. This looks like a copy-paste slip (2 * len(df_TObserved) would
# match the pattern of the other two calls), but it is kept as-is because
# changing it alters the training set -- confirm the intent.
df_trainNoiseTObserved = noise_bagging(df_TObserved, 2 * len(df_YObserved), strength=0.5)
print('Noise t observed', df_trainNoiseTObserved.shape)

# Bag the complete training set once more, with a lower noise strength.
df_trainNoise = noise_bagging(df_train, len(df_train), strength=0.4)
print('Noise all', df_trainNoise.shape)

# Stack the three bagged sets into the final training table.
df_trainNoise = pd.concat([df_trainNoise, df_trainNoiseTObserved, df_trainNoiseYObserved])
print('Final', df_trainNoise.shape)

Noise y observed (8184, 26)
Noise t observed (6261, 26)
Noise all (34992, 26)
Final (49437, 26)


In [10]:
# Persist the results of this notebook in a single pickle, as a list in this
# order: [noisy training set, test set, full feature table, Features instance,
# scaler]. The path is relative to the current working directory.
fname = '../data/interim/features_weighted.p'

# The context manager closes (and thereby flushes) the file even if pickling
# fails part-way; a bare open() inside the call leaves the handle open.
with open(fname, 'wb') as fh:
    pickle.dump([df_trainNoise, df_test, df_features, features, scaler], fh)