## 1. Libraries 
* numpy: working with multidimensional arrays
* pandas: file operation
* train_test_split: to split data into train and validation
* pickle: to store the created features so they can be reused by different models

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.model_selection import train_test_split
import pickle

## 2. Import data
Import the json data.

In [2]:
def _clean_inc_angle(df):
    """Coerce the ``inc_angle`` column of *df* to float, in place.

    The placeholder string 'na' and any missing values both become 0.0.
    Returns the same DataFrame for convenient chaining.
    """
    df["inc_angle"] = df["inc_angle"].replace('na', 0).astype(float).fillna(0.0)
    return df


def data_import(train_path="data/train.json", test_path="data/test.json"):
    """Load the train and test JSON files and clean their ``inc_angle`` column.

    Args:
        train_path: Location of the training JSON file.
        test_path: Location of the test JSON file.

    Returns:
        A ``(train, test)`` tuple of DataFrames whose ``inc_angle`` column is
        float, with 'na' / missing entries replaced by 0.0.
    """
    train = _clean_inc_angle(pd.read_json(train_path))
    test = _clean_inc_angle(pd.read_json(test_path))
    return train, test

# Load both DataFrames once; the cells below derive features and labels from them.
train, test = data_import()

## 3. Preprocess
Combine Band 1 and Band 2 into a single band holding their mean. Subtract the mean from the values and divide by the standard deviation. Reshape the images into 75x75 arrays. One-hot encode the class label.

In [5]:
def data_reshape(df):
    """Turn the two radar bands of *df* into one standardized image channel.

    Every entry of ``df["band_1"]`` and ``df["band_2"]`` is reshaped to a
    75x75 float32 image. The two bands are averaged, and the averaged stack is
    standardized with the mean and standard deviation taken over the whole
    stack (all images of *df* together).

    Returns:
        A float32 array of shape ``(len(df), 75, 75, 1)``.
    """
    def _as_images(column):
        # One 75x75 float32 image per row of the requested column.
        return np.array(
            [np.asarray(flat).astype(np.float32).reshape(75, 75) for flat in df[column]]
        )

    averaged = np.array((_as_images("band_1") + _as_images("band_2")) / 2)
    centre = averaged.mean()
    spread = averaged.std()
    standardized = (averaged - centre) / spread
    # Append a trailing axis of length 1 (single channel).
    return standardized[:, :, :, np.newaxis]

def one_hot(df):
    """One-hot encode the ``is_iceberg`` column of *df*.

    Rows are processed in positional order, so the result lines up with the
    rows of *df* regardless of what its index looks like.

    Returns:
        An integer array of shape ``(len(df), 2)``: ``[0, 1]`` where
        ``is_iceberg`` equals 0 and ``[1, 0]`` for every other value.
    """
    # Work on the raw values: indexing the Series with 0..n-1 would be a
    # label lookup and breaks (or reorders rows) on a non-default index.
    labels = df["is_iceberg"].to_numpy()
    is_zero = np.asarray(labels == 0, dtype=bool)
    return np.column_stack((~is_zero, is_zero)).astype(int)

# Image features for both DataFrames (train first, then test),
# followed by the one-hot labels, which only the training data has.
X_train, X_test = (data_reshape(frame) for frame in (train, test))
y_train = one_hot(train)

## 4. Train and Validation Sets

In [6]:
# Hold out 25% of the labelled data for validation; the seed is fixed so the
# split is reproducible.
# NOTE: a second train_test_split call used to follow this one. It overwrote
# X_test (the features built from test.json) with a slice of the labelled
# training data, dropped the matching labels, and shrank the training set
# again, so it has been removed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, random_state=666, train_size=0.75)



## 5. Save the features

In [7]:
# Bundle each split into a dict; the keys are what downstream code reads back.
trainDict = {"xtrain": X_train,
             "ytrain": y_train}
validDict = {"xvalid": X_valid,
             "yvalid": y_valid}
testDict = {"xtest": X_test}

# Write one pickle per split. The with-block closes (and therefore flushes)
# each file as soon as it is written instead of leaving the handle open.
for pickle_path, features in (("train.pickle", trainDict),
                              ("valid.pickle", validDict),
                              ("test.pickle", testDict)):
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(features, pickle_file)