In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import glob

In [2]:
def prep_nosubject(df):
    scs = {}
    for col in df.columns[3:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc

    for sensor in df.columns[3:]:
        df[sensor+'_square'] = np.square(df[sensor])
        df[sensor+'_diff'] = df[sensor].diff()
        df.loc[df['step']==0, sensor+'_diff'] = 0.0
        df[sensor+'_mean5'] = df[sensor].rolling(5).mean().fillna(0)
        df[sensor+'_mean10'] = df[sensor].rolling(10).mean().fillna(0)
        df[sensor+'_mean20'] = df[sensor].rolling(20).mean().fillna(0)

    for col in df.columns[16:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc

    return scs

In [3]:
class MyDataset(Dataset):
    def __init__(self, series, labels=None, roll=False):
        self.X = series.drop(columns=['sequence','subject','step']).values
        self.X = self.X.reshape(-1,60,series.shape[1]-3).copy()
        if labels is None:
            self.y = None
        else:
            self.y = labels['state'].values
        self.roll = roll

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        ''' input tensor shape is N*L*C '''
        X = self.X[idx]
        toroll = np.random.choice(np.arange(-60,60))
        X = np.roll(X, toroll, axis=0)
        if self.y is None:
            y = -1
        else:
            y = self.y[idx]
        return (torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32))

In [4]:
class MyModel(nn.Module):
    def __init__(self, input_feature, hidden_size, num_layers=1):
        super(MyModel, self).__init__()

        self.fc_pre = nn.Linear(input_feature, hidden_size)
        self.bn_pre = nn.BatchNorm1d(60)

        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=num_layers, dropout=0.5)
        ''' gru input is (N,L,H_in=H_hidden), output is (N,L,H_hidden), hidden is (num_layers, h_hidden)'''

        self.bn_post = nn.BatchNorm1d(hidden_size)
        self.fc_post = nn.Linear(hidden_size, 1)

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.25)

        self.max = nn.MaxPool1d(60, padding=0)
        self.avg = nn.AvgPool1d(60, padding=0)

    def forward(self, input, hidden):
        ''' X is in the shape of (N,L,input_feature) '''
        output = self.dropout(F.relu(self.bn_pre(self.fc_pre(input))))
        output, hidden = self.gru(output, hidden)
        #output = self.dropout(F.relu(self.bn_post(output[:,-1,:])))

        #output = self.max(output.transpose(1,2)).squeeze(dim=2) 
        output = self.avg(output.transpose(1,2)).squeeze(dim=2)
        output = self.dropout(F.relu(self.bn_post(output)))

        output = self.sigmoid(self.fc_post(output).squeeze(dim=1))
        return output

    def initHidden(self, batch_size):
        return torch.zeros((self.num_layers, batch_size, self.hidden_size))

In [5]:
train_series = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_labels.csv')
test_series = pd.read_csv('test.csv')
all_series = pd.concat([train_series, test_series], axis=0)
scs = prep_nosubject(all_series)
train_series = all_series.loc[all_series['sequence']<=25967]
test_series = all_series.loc[all_series['sequence']>25967]

In [6]:
model_names = glob.glob('gru*1e-05*isplit*')
model_names

['gru_model_1e-05_isplit_1.pickle',
 'gru_model_1e-05_isplit_3.pickle',
 'gru_model_1e-05_isplit_0.pickle',
 'gru_model_1e-05_isplit_2.pickle',
 'gru_model_1e-05_isplit_4.pickle']

In [11]:
mydataset = MyDataset(train_series, train_labels)
train_dataloader = DataLoader(mydataset, shuffle=False, batch_size=len(mydataset))
X, y = next(iter(train_dataloader))

model = MyModel(78, 32, num_layers=1)
train_output = pd.DataFrame()
train_output['label'] = train_labels['state']
for model_name in model_names:
    tmp = torch.load(model_name)
    print(model_name, tmp['score'])
    if tmp['score'] > 0.90:        
        model.load_state_dict(tmp['model'])
        model.eval()
        with torch.no_grad():
            hidden = model.initHidden(len(X))
            pred = model(X, hidden).detach().numpy()
            train_output[model_name] = pred



gru_model_1e-05_isplit_1.pickle 0.9475514103080793
gru_model_1e-05_isplit_3.pickle 0.9538450312812955
gru_model_1e-05_isplit_0.pickle 0.9632108851710569
gru_model_1e-05_isplit_2.pickle 0.9499399036490701
gru_model_1e-05_isplit_4.pickle 0.9558650076172948


In [12]:
train_output

Unnamed: 0,label,gru_model_1e-05_isplit_1.pickle,gru_model_1e-05_isplit_3.pickle,gru_model_1e-05_isplit_0.pickle,gru_model_1e-05_isplit_2.pickle,gru_model_1e-05_isplit_4.pickle
0,0,0.000185,0.000079,0.000003,0.000054,0.000115
1,1,0.876932,0.615339,0.743665,0.652782,0.823054
2,1,0.998456,0.997478,0.997330,0.988788,0.972890
3,1,0.913660,0.773436,0.820726,0.341078,0.404788
4,1,0.964888,0.998668,0.999307,0.929100,0.963485
...,...,...,...,...,...,...
25963,1,0.999389,0.999596,0.992314,0.998707,0.998608
25964,0,0.061340,0.013662,0.021130,0.007287,0.012153
25965,1,0.551010,0.840445,0.944923,0.959467,0.734983
25966,1,0.995877,0.997890,0.998813,0.997893,0.993944


In [13]:
train_output.to_csv('gru_train_output.csv',index=False)

In [14]:
for col in train_output.columns[1:]:
    print(f"{col} {roc_auc_score(train_output['label'], train_output[col])}")

gru_model_1e-05_isplit_1.pickle 0.9768982502638043
gru_model_1e-05_isplit_3.pickle 0.9785030201913885
gru_model_1e-05_isplit_0.pickle 0.9802978444681099
gru_model_1e-05_isplit_2.pickle 0.9773209758619351
gru_model_1e-05_isplit_4.pickle 0.9788664576116282


In [15]:
train_output.corr(method='spearman')

Unnamed: 0,label,gru_model_1e-05_isplit_1.pickle,gru_model_1e-05_isplit_3.pickle,gru_model_1e-05_isplit_0.pickle,gru_model_1e-05_isplit_2.pickle,gru_model_1e-05_isplit_4.pickle
label,1.0,0.82601,0.828789,0.831898,0.826742,0.829419
gru_model_1e-05_isplit_1.pickle,0.82601,1.0,0.942043,0.938889,0.938681,0.940491
gru_model_1e-05_isplit_3.pickle,0.828789,0.942043,1.0,0.941757,0.936311,0.940404
gru_model_1e-05_isplit_0.pickle,0.831898,0.938889,0.941757,1.0,0.940838,0.942774
gru_model_1e-05_isplit_2.pickle,0.826742,0.938681,0.936311,0.940838,1.0,0.94168
gru_model_1e-05_isplit_4.pickle,0.829419,0.940491,0.940404,0.942774,0.94168,1.0


### looks like isplit_0 is the best so far

In [17]:
mydataset = MyDataset(test_series)
test_dataloader = DataLoader(mydataset, shuffle=False, batch_size=len(mydataset))
X, y = next(iter(test_dataloader))

model = MyModel(78, 32, num_layers=1)
test_output = pd.DataFrame()
test_output['sequence'] = test_series['sequence'].unique()

for model_name in model_names:
    tmp = torch.load(model_name)
    print(model_name, tmp['score'])
    if tmp['score'] > 0.90:        
        model.load_state_dict(tmp['model'])
        model.eval()
        with torch.no_grad():
            hidden = model.initHidden(len(X))
            pred = model(X, hidden).detach().numpy()
            test_output[model_name] = pred



gru_model_1e-05_isplit_1.pickle 0.9475514103080793
gru_model_1e-05_isplit_3.pickle 0.9538450312812955
gru_model_1e-05_isplit_0.pickle 0.9632108851710569
gru_model_1e-05_isplit_2.pickle 0.9499399036490701
gru_model_1e-05_isplit_4.pickle 0.9558650076172948


In [18]:
test_output

Unnamed: 0,sequence,gru_model_1e-05_isplit_1.pickle,gru_model_1e-05_isplit_3.pickle,gru_model_1e-05_isplit_0.pickle,gru_model_1e-05_isplit_2.pickle,gru_model_1e-05_isplit_4.pickle
0,25968,0.975159,0.992379,0.991872,0.955924,0.926214
1,25969,0.999122,0.999387,0.999008,0.995885,0.998611
2,25970,0.000002,0.000010,0.000016,0.000005,0.000048
3,25971,0.992676,0.886703,0.972923,0.870804,0.953395
4,25972,0.667639,0.001045,0.731680,0.070348,0.003636
...,...,...,...,...,...,...
12213,38181,0.247908,0.466485,0.974230,0.924690,0.475540
12214,38182,0.163819,0.437104,0.819549,0.094952,0.396854
12215,38183,0.822619,0.137272,0.266712,0.483808,0.250369
12216,38184,0.001910,0.007833,0.000770,0.010178,0.003099


In [19]:
test_output[['sequence','gru_model_1e-05_isplit_0.pickle']].to_csv('gru_baseline_new.csv', 
                                  index=False, header=['sequence','state'])

### this has score of 0.942, a little off

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [27]:
params = {'C': [1e-4,1e-3,1e-2,1e-1,1.0,1e+1]}
gcv = GridSearchCV(LogisticRegression(), params)

In [28]:
X, y = train_output.drop(columns=['label']), train_output['label']
gcv.fit(X, y)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]})

In [29]:
gcv.best_params_, gcv.best_estimator_.coef_, gcv.best_score_

({'C': 0.001},
 array([[0.9258931 , 0.95346649, 0.96720356, 0.91990954, 0.93060852]]),
 0.9423135361640597)

In [31]:
test_output['blend'] = gcv.predict_proba(test_output.drop(columns=['sequence','blend']))[:,1]

test_output

In [33]:
test_output[['sequence','blend']].to_csv('gru_baseline_blend.csv', 
                                  index=False, header=['sequence','state'])

### this has score of 0.955

In [35]:
test_output.to_csv('gru_test_output.csv', index=False)