# FIT5149 - Applied Data Analysis
## Assignment 2 - Group 1
## Student name - ID
### Duy Tho Le - 30902819
### Minh Thai Nguyen
### Duy Ngoc Anh Nguyen

# 1. Feature engineering
## 1.1 Import libraries

In [1]:
%config Completer.use_jedi = False
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from multiprocessing import cpu_count
from pathlib import Path
import tsfresh
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters, ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import f1_score
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 500)
import collections

In [2]:
TRAIN_DATA_DIR = '../FIT5149_A2_data/train_data_withlabels.csv'
TEST_DATA_DIR = "../FIT5149_A2_data/test_data_nolabels.csv"
MODEL_DIR = "./bestf1.pth"

## 1.2 Helper functions

In [3]:
# https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
# Compute the one-dimensional discrete Fourier Transform for real input.
# This function computes the one-dimensional n-point discrete Fourier Transform (DFT)
# of a real-valued array by means of an efficient algorithm called the Fast Fourier Transform (FFT).

def absfft(x):
    return np.abs(np.fft.rfft(x))

# Helper function, used in building training and validating datasets
def build_datasets(data, target, train_size, valid_pct = 0.2, seed = None):
    x, x_fft = data # split the data in to raw and FFT data
    idx = np.arange(train_size) # Create a list of indexes
#     train_idx, val_idx = train_test_split(idx, test_size = valid_pct, random_state = seed)
    # Start spliting the data into training and validating parts using the validate percentage (valid_pct) value
    train_idx, val_idx = idx[round(train_size * valid_pct):], idx[:round(train_size * valid_pct)]
    # Build the train data, which include raw x, FFT x, and target y 
    train_ds = TensorDataset(torch.tensor(x[:train_size][train_idx]).float(),
                            torch.tensor(x_fft[:train_size][train_idx]).float(),
                            torch.tensor(target[:train_size][train_idx]).long())
    print("There are",len(set(target[:train_size][train_idx])),"class in training data")
    # Build the validating data, which include raw x, FFT x, and target y 
    val_ds = TensorDataset(torch.tensor(x[:train_size][val_idx]).float(),
                            torch.tensor(x_fft[:train_size][val_idx]).float(),
                            torch.tensor(target[:train_size][val_idx]).long())
    return train_ds, val_ds 

# Helper function, 
def build_loaders(data, batch_size = 128, jobs = 8):
    train_ds, valid_ds = data
    # Build a train dataloader
    train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle = False, num_workers = jobs)
    # Build a test dataloader
    valid_dl = DataLoader(valid_ds, batch_size = batch_size, shuffle = False, num_workers = jobs)
    return train_dl, valid_dl 

# https://www.kaggle.com/purplejester/pytorch-deep-time-series-classification
class _SepConv1d(nn.Module):
    """A simple separable convolution implementation.
    
    The separable convlution is a method to reduce number of the parameters 
    in the deep learning network for slight decrease in predictions quality.
    """
    def __init__(self, ni, no, kernel, stride, pad):
        super().__init__()
        self.depthwise = nn.Conv1d(ni, ni, kernel, stride, padding=pad, groups=ni)
        self.pointwise = nn.Conv1d(ni, no, kernel_size=1)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
    
class SepConv1d(nn.Module):
    """Implementes a 1-d convolution with 'batteries included'.
    
    The module adds (optionally) activation function and dropout 
    layers right after a separable convolution layer.
    """
    def __init__(self, ni, no, kernel, stride, pad, 
                 drop=None, bn=False,
                 activ=lambda: nn.ReLU()):
    
        super().__init__()
        # Check the drop out rate
        assert drop is None or (0.0 < drop < 1.0)
        # Build a separable convolution layer, using channel_in, channel_out, kernel size, stride, and padding size 
        layers = [_SepConv1d(ni, no, kernel, stride, pad)]
        # Add an activation function
        if activ:
            layers.append(activ())
        # Apply batch normalization if required
        if bn:
            layers.append(nn.BatchNorm1d(no))
        # Apply drop out rate to prevent the model from overfitting
        if drop is not None:
            layers.append(nn.Dropout(drop))
        # chain all of the layers into one object
        self.layers = nn.Sequential(*layers)
    
    # Define forward function
    def forward(self, x): 
        return self.layers(x)


# Helper function, used to build flatten layers
class Flatten(nn.Module):
    """Converts N-dimensional tensor into 'flat' one."""

    def __init__(self, keep_batch_dim=True):
        super().__init__()
        self.keep_batch_dim = keep_batch_dim
    # Define forward function, which flatten the input into 1 dimension, or 2 dimension which is [batchsize, -1]
    def forward(self, x):
        if self.keep_batch_dim:
            return x.view(x.size(0), -1)
        return x.view(-1)
    
# Helper function to print the shape of a layer's output, this function is useful when building and debugging 
# models
class PrintSize(nn.Module):
    def __init__(self):
        super(PrintSize, self).__init__()
        
    def forward(self, x):
        print(x.shape)
        return x
    
# model building, which includes 2 branches, one brach for raw input with 30 time steps, one branch for 
# FFT input with 30/2 + 1 = 16 timesteps.
# model building, which includes 2 branches, one brach for raw input with 30 time steps, one branch for 
# FFT input with 30/2 + 1 = 16 timesteps.
class Classifier(nn.Module):
    def __init__(self, raw_ni, fft_ni, no, drop=.5):
        super().__init__()
        #PKS input = 64 [[3,8,2],[3,8,2],[3,8,2],[3,8,2],[3,8,2]]
        self.raw = nn.Sequential( 
            #         (in ,out ,kernel, stride, pad)
            SepConv1d(raw_ni,  32, 8, 2, 3, drop=drop),
            SepConv1d(    32,  32, 8, 2, 3, drop=drop),
            SepConv1d(    32,  64, 8, 2, 3, drop=drop),
            SepConv1d(    64, 128, 8, 2, 3, drop=drop),
            SepConv1d(   128, 256, 8, 2, 3),
            Flatten(),
#             PrintSize(),
            nn.Dropout(drop), nn.Linear(512, 256), nn.ReLU(inplace = True),
            nn.Dropout(drop), nn.Linear(256, 64), nn.ReLU(inplace = True))
        #PKS input = 33 [[4,8,1],[2,8,2],[3,8,2],[3,8,2],[3,8,2]]
        self.fft = nn.Sequential(
            SepConv1d(fft_ni,  32, 8, 1, 4, drop=drop),
            SepConv1d(    32,  64, 8, 2, 2, drop=drop),
            SepConv1d(    64,  64, 8, 2, 3, drop=drop),
            SepConv1d(    64, 128, 8, 2, 3, drop=drop),
            SepConv1d(   128, 256, 8, 2, 3),
            Flatten(),
#             PrintSize(),
            nn.Dropout(drop), nn.Linear(512, 256), nn.ReLU(inplace = True),
            nn.Dropout(drop), nn.Linear(256, 64), nn.ReLU(inplace = True))
        
        self.out = nn.Sequential(
            nn.Linear(128, 64), nn.ReLU(inplace=True), nn.Linear(64, no))
        self.init_weights(nn.init.kaiming_normal_)
        

    def init_weights(self, init_fn):
        def init(m): 
            for child in m.children():
                if isinstance(child, nn.Conv1d):
                    # Fills the input Tensor with values according to the method described in Delving deep into 
                    # rectifiers: Surpassing human-level performance on 
                    # ImageNet classification - He, K. et al. (2015), using a normal distribution
                    init_fn(child.weights)
        init(self)
        
    # Define a forward function for the model
    def forward(self, t_raw, t_fft):
        # raw x branch
        raw_out = self.raw(t_raw)
        # fft x branch
        fft_out = self.fft(t_fft)
        # concat 2 branches into one
        t_in = torch.cat([raw_out, fft_out], dim=1)
        # push it through fully connected layers to get the output
        out = self.out(t_in)
        return out
    
# https://github.com/gokulprasadthekkel/pytorch-multi-class-focal-loss/blob/master/focal_loss.py
# Helper function, implementation of focal loss in order to solve class imbalance problem
class FocalLoss(nn.modules.loss._WeightedLoss):
    def __init__(self, weight=None, gamma=2,reduction='mean'):
        super(FocalLoss, self).__init__(weight,reduction=reduction)
        self.gamma = gamma
        self.weight = weight #weight parameter will act as the alpha parameter to balance class weights

    def forward(self, input, target):

        ce_loss = F.cross_entropy(input, target,reduction=self.reduction,weight=self.weight)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
        return focal_loss

## 1.3 Data preprocessing

In [4]:
# Load the testing data
x_test = pd.read_csv(TEST_DATA_DIR)
x_test.rename(columns={'Unnamed: 0': 'id'}, inplace= 1) # Rename 
x_test['id'] = x_test['id'] + 62 # Increase the ID column by 28 for later rolling
x_test.head(100)

Unnamed: 0,id,load,hourofday,dayofweek,dif,absdif,max,var,entropy,nonlinear,hurst
0,63,1.869,0,Mon,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,64,1.673,0,Mon,-0.196,0.196,0.0,0.0,0.0,0.0,0.0
2,65,1.66,0,Mon,-0.013,0.013,0.0,0.0,0.0,0.0,0.0
3,66,1.772,0,Mon,0.112,0.112,0.0,0.0,0.0,0.0,0.0
4,67,1.679,0,Mon,-0.093,0.093,0.0,0.0,0.0,0.0,0.0
5,68,1.775,0,Mon,0.096,0.096,0.0,0.0,0.0,0.0,0.0
6,69,1.759,0,Mon,-0.016,0.016,0.0,0.0,0.0,0.0,0.0
7,70,1.705,0,Mon,-0.054,0.054,0.0,0.0,0.0,0.0,0.0
8,71,1.829,0,Mon,0.124,0.124,0.0,0.0,0.0,0.0,0.0
9,72,1.72,0,Mon,-0.109,0.109,0.0,0.0,0.0,0.0,0.0


In [5]:
# Build the Label encoder and Standard scaler from training data
data = pd.read_csv(TRAIN_DATA_DIR)
data.rename(columns={"Unnamed: 0":"id"}, inplace= True)
x_train = data.loc[:,data.columns.difference(['ac', 'ev', 'oven', 'wash', 'dryer'])]
y_train = data[['ac', 'ev', 'oven', 'wash', 'dryer']]

### 1.3.1 Rebuild the same label encoder and scaler as the training file

In [6]:
le = preprocessing.LabelEncoder() # Create a label encoder for variable "dayofweeek"
le.fit(x_train['dayofweek']) # Fit the values
x_train['dayofweek'] = le.transform(x_train['dayofweek']) # Perform transformation
x_test['dayofweek'] = le.transform(x_test['dayofweek'])

le.classes_ # have a look at the classes

array(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype=object)

In [7]:
# Standardize features by removing the mean and scaling to unit variance
scale_list = ['absdif', 'dayofweek', 'dif', 'entropy', 'hourofday', 'hurst', 'load', 'max', 'nonlinear', 'var']
scaler = preprocessing.StandardScaler().fit(x_train[scale_list]) # Fit the data values
x_train[scale_list] = scaler.transform(x_train[scale_list]) # Perform scaling
x_test[scale_list] = scaler.transform(x_test[scale_list]) # Perform scaling


### 1.3.2 Get the y_train label encoder for later part of predictions decoding

In [8]:
# concatenate the values of the 5 appliances into a string, for example 0,1,0,0,1 will be 01001
y_train['transformed'] = y_train.apply(lambda x: ''.join(x.astype(str)),axis = 1)
# create a label encoder for y_train
le_y = preprocessing.LabelEncoder()
# fit and transform the values of the concatenated values 
y_train['encoded'] = le_y.fit_transform(y_train['transformed']) 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['transformed'] = y_train.apply(lambda x: ''.join(x.astype(str)),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train['encoded'] = le_y.fit_transform(y_train['transformed'])


In [9]:
# Since the design choice is to use 30 minutes at each time-step, therefore each of the first 29 observations 
# will not have enough 29 observations before it in order to create a series of 30 observations.
# Thus I will add 29 copies of the first observation to the top of the testing dataframe.
extra_len = pd.DataFrame(x_test.iloc[0:1,:].values.repeat(63, axis = 0), columns=x_test.columns)
extra_len['id'] = list(range(0,63))

# concatenate the testing and the additional data
x_test = pd.concat([extra_len, x_test])
x_test['dummy_id'] = 1 # Create a dummy ID column for rolling
x_test.reset_index(drop= True, inplace=True) # Reset index
# Rearrange the data
x_test = x_test[['absdif','dayofweek','dif','entropy','hourofday','hurst','id','load','max','nonlinear',
                 'var','dummy_id']]

In [10]:
x_test.head(100)

Unnamed: 0,absdif,dayofweek,dif,entropy,hourofday,hurst,id,load,max,nonlinear,var,dummy_id
0,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,0,-0.166969,-1.86622,-0.562601,-1.046775,1
1,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,1,-0.166969,-1.86622,-0.562601,-1.046775,1
2,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,2,-0.166969,-1.86622,-0.562601,-1.046775,1
3,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,3,-0.166969,-1.86622,-0.562601,-1.046775,1
4,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,4,-0.166969,-1.86622,-0.562601,-1.046775,1
5,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,5,-0.166969,-1.86622,-0.562601,-1.046775,1
6,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,6,-0.166969,-1.86622,-0.562601,-1.046775,1
7,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,7,-0.166969,-1.86622,-0.562601,-1.046775,1
8,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,8,-0.166969,-1.86622,-0.562601,-1.046775,1
9,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,9,-0.166969,-1.86622,-0.562601,-1.046775,1


### 1.3.3 Perform rolling on test data

### *Note: Might take 1 hour to finish running. Instead, you can download the preprocessed file to the preprocessed_data folder, comment the 2 lines, uncomment the last line to load (only take ~30 seconds)*

In [11]:
# roll_time_series method creates sub windows of the time series. It rolls the (sorted) data frames for each 
# kind and each id separately in the “time” domain (which is represented by the sort order of the sort column given
# by column_sort).
# For example when applying the roll_time_series to a data [a,b,c,d,e,f,g] with time_shift = 3, the result will be 
# [a,b,c, b,c,d, c,d,e, d,e,f, e,f,g]. The input of the CNN model will have a shape of [30,10] (30 timestep, 10
# variables) and its prediction will be the appliances status at time_step 30.
# Notice: run these 3 lines of code will take hours

# If downloaded preprocessed data, comment these lines
x_test_rolled = roll_time_series(x_test, column_id="dummy_id", column_sort="id",
                            max_timeshift = 63, min_timeshift = 63, n_jobs = 16)
# Save the preprocessed data for later re-use
x_test_rolled.to_hdf('./preprocessed_data/x_test_rolled_64.h5','x_test_rolled_64')
x_test_rolled.head(100)


# Load the preprocessed data
# If download preprocessed data, uncomment this
# x_test_rolled = pd.read_hdf('./preprocessed_data/x_test_rolled_64.h5','x_test_rolled_64') 


In [13]:
# rearrange the columns
x_test_rolled = x_test_rolled[['absdif','dayofweek','dif','entropy','hourofday','hurst','id','load','max',
                               'nonlinear','var','dummy_id']]
x_test_rolled.head(10)

Unnamed: 0,absdif,dayofweek,dif,entropy,hourofday,hurst,id,load,max,nonlinear,var,dummy_id
507264,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507265,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507266,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507267,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507268,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507269,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507270,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507271,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507272,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1
507273,-0.315136,-1.00607,1e-06,-7.50016,-1.659524,-14.864864,"(1, 63)",-0.166969,-1.86622,-0.562601,-1.046775,1


In [14]:
# check if there're any shape differences between rolled data and original data
print(len(x_test_rolled)/64)
print(len(x_test)-63)

105540.0
105540


In [15]:
# Drop the ID and dummy_id columns
x_test_rolled.drop(['id','dummy_id'], axis=1, inplace=True)
# reshape the rolled x_train to a new shape of [105540,30,10]
x_test_rolled = np.reshape(x_test_rolled.to_numpy(dtype=np.float64), [-1 , 64, int(x_test_rolled.shape[1])])
x_test_rolled.shape

(105540, 64, 10)

### 1.3.4  Extract the Fast Fourier Transform features

In [16]:
# https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
# Compute the one-dimensional discrete Fourier Transform for real input.
# This function computes the one-dimensional n-point discrete Fourier Transform (DFT)
# of a real-valued array by means of an efficient algorithm called the Fast Fourier Transform (FFT).
x_test_fft = np.copy(x_test_rolled)
x_test_fft = np.apply_along_axis(absfft, 1, x_test_fft)
x_test_fft

array([[[2.01687050e+01, 6.43884618e+01, 8.48411542e-05, ...,
         1.19438110e+02, 3.60064840e+01, 6.69936052e+01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],

       [[1.97816427e+01, 6.43884618e+01, 3.69080231e-01, ...,
         1.19438110e+02, 3.60064840e+01, 6.69936052e+01],
        [3.87062265e-01, 0.00000000e+00, 3.69165072e-01, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [3.87062265e-01, 

In [17]:
print(x_test_rolled.shape)
print(x_test_fft.shape)

(105540, 64, 10)
(105540, 33, 10)


In [18]:
# Transpose the shape x_test and x_test_fft to [observations, features, time_steps]
x_test = x_test_rolled.transpose(0,2,1)
x_test_fft = x_test_fft.transpose(0,2,1)

In [19]:
print(x_test.shape)
print(x_test_fft.shape)

(105540, 10, 64)
(105540, 10, 33)


# 2. Inference

## 2.1 Build datasets and select testing device

In [20]:
device_test = torch.device('cpu')

In [21]:
# Get the testing dataset
test_ds = TensorDataset(torch.tensor(x_test).float(), torch.tensor(x_test_fft).float())
# Get the testing dataloader
test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)
test_preds = [] # to store prediction

## 2.2 Load the trained model and start inferencing

In [22]:
# Build a new classifier model
model = Classifier(10, 10, 20)
# load the state dict of the previously trained models
model.load_state_dict(torch.load("./trained_model/CNN_best_f1.pth"))

<All keys matched successfully>

In [23]:
# Change the model mode to evaluate and send to the device
model.eval()
model.to(device_test)

Classifier(
  (raw): Sequential(
    (0): SepConv1d(
      (layers): Sequential(
        (0): _SepConv1d(
          (depthwise): Conv1d(10, 10, kernel_size=(8,), stride=(2,), padding=(3,), groups=10)
          (pointwise): Conv1d(10, 32, kernel_size=(1,), stride=(1,))
        )
        (1): ReLU()
        (2): Dropout(p=0.5, inplace=False)
      )
    )
    (1): SepConv1d(
      (layers): Sequential(
        (0): _SepConv1d(
          (depthwise): Conv1d(32, 32, kernel_size=(8,), stride=(2,), padding=(3,), groups=32)
          (pointwise): Conv1d(32, 32, kernel_size=(1,), stride=(1,))
        )
        (1): ReLU()
        (2): Dropout(p=0.5, inplace=False)
      )
    )
    (2): SepConv1d(
      (layers): Sequential(
        (0): _SepConv1d(
          (depthwise): Conv1d(32, 32, kernel_size=(8,), stride=(2,), padding=(3,), groups=32)
          (pointwise): Conv1d(32, 64, kernel_size=(1,), stride=(1,))
        )
        (1): ReLU()
        (2): Dropout(p=0.5, inplace=False)
      )
    

In [24]:
# Perform predicting on testing date
for test_batch in tqdm(test_dl): # Iterate through every batch of testing dataloader
    x_raw, x_fft = [t.to(device_test) for t in test_batch] # Send the data to device
    out = model(x_raw, x_fft) # Get the output
    preds = F.log_softmax(out, dim=1).argmax(dim=1) # Get the final prediction
    # decode its value for submisson compatability
    decoded_preds = le_y.inverse_transform([preds.detach().cpu().clone().numpy()]) 
    # Save the prediction
    test_preds.append([i for i in decoded_preds[0]])

  return f(**kwargs)
100%|██████████| 105540/105540 [03:36<00:00, 488.28it/s]


In [25]:
# Convert the prediction list to dataframe
predictions = pd.DataFrame(np.array(test_preds)).reset_index()
predictions.columns = ['id', 'ac', 'ev', 'oven', 'wash', 'dryer'] # Add column names
predictions['id'] = predictions['id'] + 1 # For submission compatability
predictions.set_index('id',inplace=True) # set ID column as index
predictions.head() #have a look to check its format

Unnamed: 0_level_0,ac,ev,oven,wash,dryer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0


In [26]:
# Check how many classes are predicted
predictions[['ac', 'ev', 'oven', 'wash', 'dryer']].drop_duplicates()

Unnamed: 0_level_0,ac,ev,oven,wash,dryer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,0,0
137,0,0,0,0,1
138,1,0,0,0,0
139,1,0,0,0,1
1150,0,0,0,1,0
1396,0,0,1,0,0
2784,0,1,0,0,0
5674,0,0,1,1,0
25238,1,0,1,0,0
25472,1,0,0,1,0


In [27]:
print(f"There are {len(predictions[['ac', 'ev', 'oven', 'wash', 'dryer']].drop_duplicates())} classes are predicted by model, compared to 20 classes on which the model was trained.")

There are 16 classes are predicted by model, compared to 20 classes on which the model was trained.


In [29]:
# Export to .csv file for submission
predictions.to_csv('./output/CNN_predictions.csv')