In [11]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import glob

In [3]:
def prep_nosubject(df):
    scs = {}
    for col in df.columns[3:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc

    for sensor in df.columns[3:]:
        df[sensor+'_square'] = np.square(df[sensor])
        df[sensor+'_diff'] = df[sensor].diff()
        df.loc[df['step']==0, sensor+'_diff'] = 0.0
        df[sensor+'_mean5'] = df[sensor].rolling(5).mean().fillna(0)
        df[sensor+'_mean10'] = df[sensor].rolling(10).mean().fillna(0)
        df[sensor+'_mean20'] = df[sensor].rolling(20).mean().fillna(0)

    for col in df.columns[16:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc

    return scs

In [4]:
class MyDataset(Dataset):
    def __init__(self, series, labels=None, roll=False):
        self.X = series.drop(columns=['sequence','subject','step']).values
        self.X = self.X.reshape(-1,60,series.shape[1]-3).transpose([0,2,1]).copy()
        if labels is None:
            self.y = None
        else:
            self.y = labels['state'].values
        self.roll = roll

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        ''' input tensor shape is N*C*L '''
        X = self.X[idx]
        if self.roll:
            toroll = np.random.choice(np.arange(-60,60))
            X = np.roll(X, toroll, axis=1)
        if self.y is None:
            y = -1
        else:
            y = self.y[idx]
        return (torch.tensor(X, dtype=torch.float32), torch.tensor(y))

In [5]:
class MyFCNModel(nn.Module):
    def __init__(self, input_channel):
        super(MyFCNModel, self).__init__()
        torch.manual_seed(123)

        pmode = 'circular'

        self.conv1d_1 = nn.Conv1d(input_channel, input_channel, 5, groups=input_channel, padding=2, padding_mode=pmode)
        self.bn_1 = nn.BatchNorm1d(input_channel)
        self.conv1d_2 = nn.Conv1d(input_channel, input_channel, 9, groups=input_channel, padding=4, padding_mode=pmode)
        self.bn_2 = nn.BatchNorm1d(input_channel)
        self.conv1d_3 = nn.Conv1d(input_channel, input_channel, 19, groups=input_channel, padding=9, padding_mode=pmode)
        self.bn_3 = nn.BatchNorm1d(input_channel)

        self.conv1d_4 = nn.Conv1d(3*input_channel, 32, 3, padding=1, padding_mode=pmode)
        self.bn_4 = nn.BatchNorm1d(32)

        self.conv1d_5 = nn.Conv1d(32, 2, 3, padding=1, padding_mode=pmode)
        self.bn_5 = nn.BatchNorm1d(2)

        self.avg = nn.AvgPool1d(60, padding=0)
        self.dropout = nn.Dropout(p=0.25)

    def forward(self, X):
        ''' input shape (N,C,L) '''

        X1 = F.relu(self.bn_1(self.conv1d_1(X)))
        X2 = F.relu(self.bn_2(self.conv1d_2(X)))
        X3 = F.relu(self.bn_3(self.conv1d_3(X)))

        X = torch.cat([X1,X2,X3], dim=1)
        X = self.dropout(X)

        X = F.relu(self.bn_4(self.conv1d_4(X)))
        X = self.dropout(X)

        X = F.relu(self.bn_5(self.conv1d_5(X)))
#        X = self.conv1d_5(X)  #this is about 0.1 worse

        X = self.avg(X).squeeze(dim=2)
        output = F.log_softmax(X, dim=1)

        return output

In [8]:
train_series = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_labels.csv')
test_series = pd.read_csv('test.csv')
all_series = pd.concat([train_series, test_series], axis=0)
scs = prep_nosubject(all_series)
train_series = all_series.loc[all_series['sequence']<=25967]
test_series = all_series.loc[all_series['sequence']>25967]

In [14]:
model_names = glob.glob('nn*1e-05*isplit*')

In [18]:
mydataset = MyDataset(train_series, train_labels)
train_dataloader = DataLoader(mydataset, shuffle=False, batch_size=len(mydataset))
X, y = next(iter(train_dataloader))

model = MyFCNModel(78)
train_output = pd.DataFrame()
train_output['label'] = train_labels['state']
for model_name in model_names:
    tmp = torch.load(model_name)
    print(model_name, tmp['score'])
    if tmp['score'] > 0.95:        
        model.load_state_dict(tmp['model'])
        model.eval()
        with torch.no_grad():
            pred = model(X).detach().numpy()
            train_output[model_name] = np.exp(pred[:,1])

nn_model_1e-05_isplit_2.pickle 0.9648601140600864
nn_model_1e-05_isplit_0.pickle 0.9766217373021355
nn_model_1e-05_isplit_4.pickle 0.9705471305612051
nn_model_1e-05_isplit_3.pickle 0.9715526448112124
nn_model_1e-05_isplit_1.pickle 0.962505420337617


In [21]:
train_output

Unnamed: 0,label,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle
0,0,0.000428,0.000670,0.001373,0.004204,0.000526
1,1,0.822656,0.948457,0.830469,0.989444,0.875767
2,1,0.993019,0.998480,0.991656,0.982240,0.985368
3,1,0.783070,0.870413,0.520563,0.864861,0.651066
4,1,0.963354,0.990503,0.659270,0.984968,0.913403
...,...,...,...,...,...,...
25963,1,0.992234,0.988758,0.989263,0.988569,0.967122
25964,0,0.007409,0.000589,0.003515,0.022195,0.002654
25965,1,0.837370,0.553210,0.819429,0.954351,0.712913
25966,1,0.986278,0.972274,0.977593,0.975883,0.985457


In [64]:
train_output.to_csv('nn_train_output.csv',index=False)

In [23]:
for col in train_output.columns[1:]:
    print(f"{col} {roc_auc_score(train_output['label'], train_output[col])}")

nn_model_1e-05_isplit_2.pickle 0.987833813795948
nn_model_1e-05_isplit_0.pickle 0.9916117075045059
nn_model_1e-05_isplit_4.pickle 0.9885196673863818
nn_model_1e-05_isplit_3.pickle 0.9889905323749753
nn_model_1e-05_isplit_1.pickle 0.9884527242416505


In [25]:
train_output.corr(method='spearman')

Unnamed: 0,label,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle
label,1.0,0.844951,0.851494,0.846139,0.846954,0.846023
nn_model_1e-05_isplit_2.pickle,0.844951,1.0,0.965376,0.966923,0.967187,0.960671
nn_model_1e-05_isplit_0.pickle,0.851494,0.965376,1.0,0.966098,0.96803,0.962858
nn_model_1e-05_isplit_4.pickle,0.846139,0.966923,0.966098,1.0,0.968106,0.961435
nn_model_1e-05_isplit_3.pickle,0.846954,0.967187,0.96803,0.968106,1.0,0.963952
nn_model_1e-05_isplit_1.pickle,0.846023,0.960671,0.962858,0.961435,0.963952,1.0


### looks like isplit_0 is the best so far

In [26]:
mydataset = MyDataset(test_series)
test_dataloader = DataLoader(mydataset, shuffle=False, batch_size=len(mydataset))
X, y = next(iter(test_dataloader))

model = MyFCNModel(78)
test_output = pd.DataFrame()
test_output['sequence'] = test_series['sequence'].unique()

for model_name in model_names:
    tmp = torch.load(model_name)
    print(model_name, tmp['score'])
    if tmp['score'] > 0.95:        
        model.load_state_dict(tmp['model'])
        model.eval()
        with torch.no_grad():
            pred = model(X).detach().numpy()
            test_output[model_name] = np.exp(pred[:,1])

nn_model_1e-05_isplit_2.pickle 0.9648601140600864
nn_model_1e-05_isplit_0.pickle 0.9766217373021355
nn_model_1e-05_isplit_4.pickle 0.9705471305612051
nn_model_1e-05_isplit_3.pickle 0.9715526448112124
nn_model_1e-05_isplit_1.pickle 0.962505420337617


In [27]:
test_output

Unnamed: 0,sequence,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle
0,25968,0.988292,0.982424,0.968129,0.991629,0.982700
1,25969,0.983622,0.987861,0.957940,0.978525,0.982838
2,25970,0.000108,0.000620,0.000238,0.000859,0.000418
3,25971,0.608966,0.925262,0.710716,0.878715,0.826165
4,25972,0.028058,0.030483,0.031900,0.138249,0.422702
...,...,...,...,...,...,...
12213,38181,0.996996,0.865621,0.891795,0.965554,0.497939
12214,38182,0.838029,0.922800,0.459102,0.916497,0.667446
12215,38183,0.170850,0.155390,0.107199,0.212193,0.102878
12216,38184,0.000924,0.002528,0.001428,0.003295,0.002429


In [28]:
test_output[['sequence','nn_model_1e-05_isplit_0.pickle']].to_csv('nn_baseline_new.csv', 
                                  index=False, header=['sequence','state'])

### this has score of 0.972, which is very close to test_score

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [51]:
params = {'C': [1e-3,1e-2,1e-1,1.0,1e+1]}
gcv = GridSearchCV(LogisticRegression(), params)

In [52]:
X, y = train_output.drop(columns=['label']), train_output['label']
gcv.fit(X, y)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10.0]})

In [53]:
gcv.best_params_

{'C': 0.1}

In [56]:
gcv.best_estimator_.coef_

array([[0.95755271, 3.31088253, 1.47102994, 1.52269924, 1.93851549]])

In [57]:
test_output

Unnamed: 0,sequence,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle,blend
0,25968,0.988292,0.982424,0.968129,0.991629,0.982700,0.992312
1,25969,0.983622,0.987861,0.957940,0.978525,0.982838,0.992210
2,25970,0.000108,0.000620,0.000238,0.000859,0.000418,0.011996
3,25971,0.608966,0.925262,0.710716,0.878715,0.826165,0.972800
4,25972,0.028058,0.030483,0.031900,0.138249,0.422702,0.040569
...,...,...,...,...,...,...,...
12213,38181,0.996996,0.865621,0.891795,0.965554,0.497939,0.962758
12214,38182,0.838029,0.922800,0.459102,0.916497,0.667446,0.955751
12215,38183,0.170850,0.155390,0.107199,0.212193,0.102878,0.045905
12216,38184,0.000924,0.002528,0.001428,0.003295,0.002429,0.012204


In [60]:
test_output['blend'] = gcv.predict_proba(test_output.drop(columns=['sequence','blend']))[:,1]

In [61]:
test_output

Unnamed: 0,sequence,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle,blend
0,25968,0.988292,0.982424,0.968129,0.991629,0.982700,0.991262
1,25969,0.983622,0.987861,0.957940,0.978525,0.982838,0.991077
2,25970,0.000108,0.000620,0.000238,0.000859,0.000418,0.013358
3,25971,0.608966,0.925262,0.710716,0.878715,0.826165,0.965270
4,25972,0.028058,0.030483,0.031900,0.138249,0.422702,0.043019
...,...,...,...,...,...,...,...
12213,38181,0.996996,0.865621,0.891795,0.965554,0.497939,0.963075
12214,38182,0.838029,0.922800,0.459102,0.916497,0.667446,0.948618
12215,38183,0.170850,0.155390,0.107199,0.212193,0.102878,0.049809
12216,38184,0.000924,0.002528,0.001428,0.003295,0.002429,0.013577


In [62]:
test_output[['sequence','blend']].to_csv('nn_baseline_blend.csv', 
                                  index=False, header=['sequence','state'])

In [65]:
test_output

Unnamed: 0,sequence,nn_model_1e-05_isplit_2.pickle,nn_model_1e-05_isplit_0.pickle,nn_model_1e-05_isplit_4.pickle,nn_model_1e-05_isplit_3.pickle,nn_model_1e-05_isplit_1.pickle,blend
0,25968,0.988292,0.982424,0.968129,0.991629,0.982700,0.991262
1,25969,0.983622,0.987861,0.957940,0.978525,0.982838,0.991077
2,25970,0.000108,0.000620,0.000238,0.000859,0.000418,0.013358
3,25971,0.608966,0.925262,0.710716,0.878715,0.826165,0.965270
4,25972,0.028058,0.030483,0.031900,0.138249,0.422702,0.043019
...,...,...,...,...,...,...,...
12213,38181,0.996996,0.865621,0.891795,0.965554,0.497939,0.963075
12214,38182,0.838029,0.922800,0.459102,0.916497,0.667446,0.948618
12215,38183,0.170850,0.155390,0.107199,0.212193,0.102878,0.049809
12216,38184,0.000924,0.002528,0.001428,0.003295,0.002429,0.013577


In [66]:
test_output.to_csv("gru_test_output.csv", index=False)