In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# See http://drivendata.github.io/cookiecutter-data-science/
import os
import sys

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [3]:
import pickle

import numpy as np
import pandas as pd

# Set the seed for Keras
np.random.seed(4)

from data.data import get_data, noise_bagging
from features.features import Features

from model.model import split_train_test

Using Theano backend.


In [4]:
# Define the properties of this run

# Set the number of time steps that will be used in for the RNN
ts_steps = 4

# Set the radius in km that will be used to compute the nest count of neighbouring nests
nestCount_radius = 50 # was 200 # tiny bit better

# Set the radius in which the krill data will be reported for each nest
krill_radius = 100 # was 300

# Set the area which is used for computing the sea ice feature
padding = 1

In [5]:
# Get the data with the specified time lag. The time series is assembled as relative difference
# betweene the successive steps.
df_features, scaler = get_data(ts_steps)
df_features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y_true,inferred_y_true,inferred_t,t0,t1,t2,t3,countError
site_id,species,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACUN,adelie penguin,1983,1.0,True,True,,1.0,1.0,1.0,0.707107
ACUN,chinstrap penguin,1983,1.0,True,False,,1.0,1.0,1.0,0.707107
ADAM,adelie penguin,1983,1.0,True,True,,1.0,1.0,1.0,0.707107
ADAR,adelie penguin,1983,1.0331,True,False,,1.0,1.0,1.0,0.196116
AILS,chinstrap penguin,1983,1.0,True,False,,1.0,1.0,1.0,0.707107


In [6]:
# Remove the first occurence
df_features.dropna(inplace=True)

In [7]:
# Add the features to the DataFrame
features = Features(krill_radius, nestCount_radius, padding)
df_features = features.add_features(df_features)

df_features.head()

Found nest count pre-computed distance matrix in data/interim


Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"



Loading sea ice distMat from data/interim/
Found krill pre-computed distance matrix in data/interim


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y_true,inferred_y_true,inferred_t,t0,t1,t2,t3,countError,adelie penguin,chinstrap penguin,...,temp_month_2,temp_month_3,temp_month_4,temp_month_5,temp_month_6,temp_month_7,temp_month_8,temp_month_9,temp_month_10,temp_month_11
site_id,species,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ACUN,adelie penguin,1984,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,-0.21,0.38,-0.27,-0.28,2.39,-3.26,-0.48,3.94,4.12,-0.49
ACUN,adelie penguin,1985,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,1.59,0.53,-0.32,1.42,0.49,-1.65,5.43,4.25,4.07,0.76
ACUN,adelie penguin,1986,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.55,0.43,1.13,1.22,0.29,3.89,0.27,0.44,-0.73,-0.69
ACUN,adelie penguin,1987,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.25,0.37,0.68,-0.48,0.69,-2.0,-5.22,1.8,-2.63,-1.79
ACUN,adelie penguin,1988,1.0,True,True,1.0,1.0,1.0,1.0,0.707107,1.0,0.0,...,0.24,0.93,0.93,0.26,1.74,-1.3,0.12,-4.3,-0.22,-1.7


In [8]:
df_features.columns

Index(['y_true', 'inferred_y_true', 'inferred_t', 't0', 't1', 't2', 't3',
       'countError', 'adelie penguin', 'chinstrap penguin', 'gentoo penguin',
       'proximityNestCountChange', 'siteCount', 'sea_ice_month_0',
       'sea_ice_month_1', 'sea_ice_month_2', 'sea_ice_month_3',
       'sea_ice_month_4', 'sea_ice_month_5', 'sea_ice_month_6',
       'sea_ice_month_7', 'sea_ice_month_8', 'sea_ice_month_9',
       'sea_ice_month_10', 'sea_ice_month_11', 'krill', 'temp_month_0',
       'temp_month_1', 'temp_month_2', 'temp_month_3', 'temp_month_4',
       'temp_month_5', 'temp_month_6', 'temp_month_7', 'temp_month_8',
       'temp_month_9', 'temp_month_10', 'temp_month_11'],
      dtype='object')

In [9]:
# There are a lot of missing values in the original data and I believe that the truly observed values
# should be trusted more. In order to increase the training data, noise is added to which artificially
# increase the number of observations. Here, the truly observed data is enriched relatively more to
# put more emphasise on them.
yobs = 10
tobs = 5
other = 2

df_YObserved = df_features[~df_features['inferred_y_true']]
df_trainNoiseYObserved = noise_bagging(df_YObserved, yobs*df_YObserved.shape[0], strength=0.5)
print('Noise y observed', df_trainNoiseYObserved.shape)

df_TObserved = df_features[~df_features['inferred_t']]
df_trainNoiseTObserved = noise_bagging(df_TObserved, tobs*df_YObserved.shape[0], strength=0.5)
print('Noise t observed', df_trainNoiseTObserved.shape)

df_featuresNoise = noise_bagging(df_features, other*df_features.shape[0], strength=0.4)
print('Noise all', df_featuresNoise.shape)

df_featuresNoise = pd.concat([df_featuresNoise, df_trainNoiseTObserved, df_trainNoiseYObserved])
print('Final', df_featuresNoise.shape)

Noise y observed (17083, 38)
Noise t observed (11881, 38)
Noise all (56376, 38)
Final (85340, 38)


In [10]:
# Split into train and test
df_train, df_test = split_train_test(df_features)
df_train.shape, df_test.shape

((17496, 38), (1296, 38))

In [11]:
# Split noise data into train and test
df_trainNoise, df_testNoise = split_train_test(df_featuresNoise)
df_trainNoise.shape, df_testNoise.shape

((77648, 38), (7692, 38))

In [12]:
fname = '../data/interim/Submission_02/features_weighted.p'
pickle.dump([df_trainNoise, df_test, df_features, df_featuresNoise, features, scaler], open(fname, 'wb'))

In [13]:
fname = '../data/interim/Submission_02/features.p'
pickle.dump([df_train, df_test, df_features, None, features, scaler], open(fname, 'wb'))