In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [9]:
def prep_withsubject(df):
    scs = {}
    for col in df.columns[3:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc
    
    for sensor in df.columns[3:]:
        df[sensor+'_square'] = np.square(df[sensor])
        df[sensor+'_diff'] = df[sensor].diff()
        df.loc[df['step']==0, sensor+'_diff'] = 0.0
        
        tmp = df.groupby(['subject','step']).agg({sensor:'mean'}).to_dict()[sensor]
        df[sensor+'_subject_mean'] = pd.Series(zip(df['subject'], df['step'])).map(tmp)
        
        tmp = df.groupby(['subject','step']).agg({sensor+'_square':'mean'}).to_dict()[sensor+'_square']
        df[sensor+'_square_subject_mean'] = pd.Series(zip(df['subject'], df['step'])).map(tmp)
        
        tmp = df.groupby(['subject','step']).agg({sensor+'_diff':'mean'}).to_dict()[sensor+'_diff']
        df[sensor+'_diff_subject_mean'] = pd.Series(zip(df['subject'], df['step'])).map(tmp)
    
    for col in df.columns[16:]:
        sc = StandardScaler()
        df[col] = sc.fit_transform(df[col].values.reshape(-1,1))
        scs[col] = sc
    
    return scs

In [10]:
    train_series = pd.read_csv('train.csv')
    train_labels = pd.read_csv('train_labels.csv')
    test_series = pd.rea`d_csv('test.csv')
    all_series = pd.concat([train_series, test_series], axis=0)
    scs = prep_withsubject(all_series)
    train_series = all_series.loc[all_series['sequence']<=25967]

In [11]:
train_series

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,...,sensor_11_square,sensor_11_diff,sensor_11_subject_mean,sensor_11_square_subject_mean,sensor_11_diff_subject_mean,sensor_12_square,sensor_12_diff,sensor_12_subject_mean,sensor_12_square_subject_mean,sensor_12_diff_subject_mean
0,0,47,0,-0.071577,0.025398,0.527133,0.077786,-0.573614,-0.081093,-0.037586,...,-0.025933,-0.000022,0.168770,-0.020132,0.000008,-0.188896,0.000141,-0.328722,-0.196842,0.000148
1,0,47,1,-0.162994,0.030337,0.527133,-0.153988,0.093636,0.213528,-0.061907,...,-0.026239,-0.109142,-1.893829,1.189209,-1.330854,-0.191629,-0.089298,-0.548938,-0.249138,-0.179915
2,0,47,2,0.118849,-0.155230,0.527133,0.077994,0.271536,0.800437,-0.027858,...,-0.025863,-0.050758,-0.569109,-0.065291,0.854765,-0.188933,-0.107894,0.132294,-0.228386,0.557168
3,0,47,3,0.190294,0.168396,0.527133,0.229810,-0.321286,-0.448553,0.235614,...,-0.023936,0.276014,0.689967,-0.046081,0.812409,-0.191632,0.107651,0.338776,-0.236459,0.168981
4,0,47,4,0.098878,0.240835,0.527133,-0.031455,0.228603,0.029098,0.166435,...,-0.026286,-0.174585,1.845034,0.530400,0.745299,-0.191650,-0.013442,-0.110551,-0.239970,-0.367251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558075,25967,327,55,-0.103081,-0.272356,-0.626783,0.138222,-0.531331,-0.280465,-0.192967,...,-0.026146,0.053509,-0.428508,-0.133116,-0.477073,-0.191555,-0.032216,-0.054662,-0.290757,-0.056195
1558076,25967,327,56,0.047404,0.078552,-0.626783,-0.075692,0.443984,-0.206693,0.029700,...,-0.025822,-0.037082,-0.266833,-0.133045,0.104327,-0.191624,0.008445,0.000358,-0.290749,0.045136
1558077,25967,327,57,-0.211092,0.096427,-0.626783,0.075502,0.177312,0.177576,-0.151893,...,-0.022832,0.320427,0.165521,-0.133474,0.278978,-0.191549,0.030199,-0.038437,-0.290757,-0.031573
1558078,25967,327,58,0.465387,0.383362,-0.564748,0.188897,-0.262547,-0.033468,0.713917,...,-0.026130,-0.180762,0.260551,-0.134936,0.061325,-0.191650,-0.014782,0.079244,-0.290526,0.096372


In [13]:
pd.unique?

In [12]:
train_series['sequence'].unique()

array([    0,     1,     2, ..., 25965, 25966, 25967])

In [14]:
from sklearn.model_selection import KFold

In [16]:
a = [1,2,3,4,5,6,7,8,9,10]

In [19]:
for train_index, test_index in KFold(shuffle=True, random_state=123).split(a):
    print(train_index)

[1 2 3 5 6 7 8 9]
[0 1 2 3 4 6 8 9]
[0 1 2 4 5 6 7 9]
[0 2 3 4 5 7 8 9]
[0 1 3 4 5 6 7 8]


In [20]:
for train_index, test_index in KFold(shuffle=True, random_state=23).split(a):
    print(train_index)

[0 1 2 3 4 6 7 9]
[0 1 3 4 5 6 7 8]
[0 1 2 3 5 6 8 9]
[2 3 4 5 6 7 8 9]
[0 1 2 4 5 7 8 9]


In [21]:
train_labels

Unnamed: 0,sequence,state
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
25963,25963,1
25964,25964,0
25965,25965,1
25966,25966,1


In [27]:
train_series['subject'].value_counts().sort_values()

472      120
73       300
265      360
519      420
486      420
       ...  
87      7740
647     8640
635     9480
1      10500
437    11940
Name: subject, Length: 672, dtype: int64

In [28]:
test_series['subject'].value_counts().sort_values()

823     480
943     600
973     660
975     660
830     660
       ... 
874    5880
865    5940
781    6180
682    6420
748    8880
Name: subject, Length: 319, dtype: int64