In [2]:
# -*- coding: utf-8 -*-
import os
import numpy as np
import pandas as pd
from sklearn import cross_validation

In [3]:
# Upper bound on the per-Id mean of "Expected": Ids whose mean is at or above
# this value are treated as outliers and kept out of the training folds.
THRESHOLD = 73 
# Number of folds requested from the stratified K-fold splitter.
N_FOLDS = 21
# Seed for the NumPy RandomState that shuffles the fold assignment.
RND_SEED = 56

In [4]:
##### Optional: set the working directory (disabled by default) #####
### os.chdir('./kaggle-Rain/')

In [5]:
####### 1. Import training data and extract ids #######
# Load the raw training table; later cells read its "Id", "Ref" and
# "Expected" columns.
train_raw = pd.read_csv("./data/train.csv")

In [43]:
# Inspect the largest "Expected" value found for each Id (display only; the
# result is not stored).
train_raw.groupby('Id')['Expected'].max()

Id
1             0.254000
2             1.016000
3            26.162014
4             4.064002
5           774.700440
6             0.254000
7             0.508000
8             3.225002
9            18.288010
10            0.010000
11            2.540001
12            3.302002
13            0.254000
14            1.270001
15            2.286001
16         1308.100700
17            0.060000
18            1.524001
19            3.810002
20            0.254000
21           28.000013
22          166.370090
23            1.270001
24            1.075001
25            1.016000
26            1.016000
27            2.540001
28            1.016000
29            1.524001
30            0.254000
              ...     
1180916    2009.903100
1180917     100.000050
1180918       0.254000
1180919       1.778001
1180920       0.254000
1180921       0.508000
1180922    2627.885300
1180923      15.494008
1180924       2.794001
1180925       1.778001
1180926       0.508000
1180927       1.524001
1180928 

In [7]:
# "Id" column of the raw table (one entry per row) and its distinct values in
# order of first appearance.
raw_ids_all = train_raw["Id"]
raw_ids = pd.unique(raw_ids_all)

In [8]:
####### 2. Remove ids with only NaNs in the "Ref" column #######
# Rows that carry a "Ref" reading, and the Ids owning at least one such row.
train_raw_tmp = train_raw[train_raw["Ref"].notnull()]
raw_ids_tmp = train_raw_tmp["Id"].unique()
# Keep every row (NaN "Ref" or not) of an Id that has at least one reading;
# Ids whose "Ref" values are all NaN disappear entirely.
train_new = train_raw[raw_ids_all.isin(raw_ids_tmp)]

In [9]:
# Show (rows, columns) of the table left after dropping Ids without any "Ref".
train_new.shape

(9125329, 24)

In [10]:
####### 3. Convert all NaN to zero #######
# Zero-fill every remaining NaN, then renumber the rows 0..n-1 because the
# row filtering above left gaps in the index.
train_new = train_new.fillna(0.0).reset_index(drop=True)

In [11]:
####### 4. Define and exclude outliers from training set #######
train_new_group = train_new.groupby('Id')
# One-column frame holding the per-Id mean of "Expected". The original note
# says any per-Id value would do — presumably "Expected" is constant within an
# Id; verify against the data.
df = train_new_group['Expected'].mean().to_frame()
# Ids whose mean "Expected" lies strictly below THRESHOLD are kept for training.
below_threshold = df['Expected'].values < THRESHOLD
meaningful_ids = np.array(df.index[below_threshold])

In [12]:
####### 5. Split off holdout validation subset #######
# Count the no. of observations per hour for each gauge reading
train_new_ids_all = train_new["Id"]
# Rows per Id, ordered from the fewest rows to the most.
obs_freq = train_new_ids_all.value_counts(ascending=True)
# Distinct row counts, in order of first appearance in obs_freq.
obs_bins = obs_freq.unique()
# For each distinct count: how many Ids have it (obs_num) and which Ids
# those are (obs_ids).
obs_num = []
obs_ids = []
for n_obs in obs_bins:
    in_bin = obs_freq.values == n_obs
    obs_num.append(in_bin.sum())
    obs_ids.append(np.array(obs_freq.index[in_bin]))


In [13]:
# Construct stratified c.v. holdout set w.r.t. no. observations per hour
# y[k] is the number of rows recorded for the Id stored in X[k]. Both arrays
# are read from the same value_counts Series, so the pairing holds by
# construction. (Concatenating the per-bin Id arrays gives the same order,
# because obs_freq is sorted by count, but it hides the pairing and raises on
# an empty table.)
y = np.array(obs_freq)
X = np.array(obs_freq.index)

In [14]:
# Seeded generator handed to the splitter so the shuffle uses RND_SEED.
rng = np.random.RandomState(RND_SEED)
# Stratify on y (rows per Id) and shuffle within strata.
# NOTE(review): sklearn.cross_validation is the legacy API in which the
# splitter is built from the labels and iterated directly; newer scikit-learn
# provides StratifiedKFold via sklearn.model_selection with .split(X, y) —
# confirm the pinned version before upgrading.
skf = cross_validation.StratifiedKFold(y, n_folds=N_FOLDS, shuffle=True,
                                       random_state=rng)

In [15]:
# Display the splitter's repr to check its labels and settings.
skf

sklearn.cross_validation.StratifiedKFold(labels=[ 1  1  1 ..., 19 19 19], n_folds=21, shuffle=True, random_state=<mtrand.RandomState object at 0x7f52a8b81168>)

In [16]:
# Per-fold containers: one array of training Ids and one array of validation
# Ids is appended for every fold that gets processed.
X_train_list, X_valid_list = [], []

In [17]:
# Start from empty containers so that re-running this cell does not append the
# same fold a second time (which would then be saved as an extra cv file).
X_train_list = []
X_valid_list = []

cv = 20  # NOTE(review): this counter is never read in the visible notebook
for train_index, valid_index in skf:
    # Map the positional fold indices back to Ids.
    X_train, X_valid = X[train_index], X[valid_index]
    # shape[0] keeps the printed text unchanged for these 1-D Id arrays while
    # avoiding the "%s" % tuple form, which breaks for anything but 1-tuples.
    print("train.shape before: %s" % X_train.shape[0])
    # Drop outlier Ids from the training side only; validation keeps every Id.
    X_train = X_train[np.in1d(X_train, meaningful_ids)]

    X_train_list.append(X_train)
    X_valid_list.append(X_valid)
    print("train.shape after: %s" % X_train.shape[0])
    print("valid.shape: %s" % X_valid.shape[0])

    cv += 1
    break # remove if full n-fold cross-validation is desired

train.shape before: 696708
train.shape after: 680780
valid.shape: 34848


In [18]:
# Persist the cleaned training table as a plain array (column names are not
# stored in the saved file).
np.save("./data/processed_train", train_new.values)

In [19]:
####### 5. Save the partitioned IDs into folders #######
# Create the output folders next to the notebook unless they already exist.
for folder in ("train", "valid", "test"):
    if not os.path.exists(folder):
        os.makedirs(folder)

In [20]:
# Write one file of training Ids per fold, numbered from 0.
for fold_no, fold_ids in enumerate(X_train_list):
    train_path = "./train/obs_ids_train_cv%s" % fold_no
    np.save(train_path, fold_ids)

In [21]:
# Write one file of validation Ids per fold, numbered from 0.
for fold_no, fold_ids in enumerate(X_valid_list):
    valid_path = "./valid/obs_ids_valid_cv%s" % fold_no
    np.save(valid_path, fold_ids)

In [22]:
####### 6. Preprocess the test data #######
# Load the raw test table and collect its distinct Ids in order of first
# appearance.
test_raw = pd.read_csv("./data/test.csv")
test_raw_ids_all = test_raw["Id"]
test_raw_ids = np.array(pd.unique(test_raw_ids_all))

In [23]:
# Convert all NaNs to zero
# No test Ids are dropped; missing values are zero-filled and the index is
# rebuilt as 0..n-1.
test_new = test_raw.fillna(0.0).reset_index(drop=True)

In [24]:
# Persist the zero-filled test table as a plain array, and the distinct test
# Ids alongside it in the ./test folder.
np.save("./data/processed_test", test_new.values)
np.save("./test/obs_ids_test", test_raw_ids)