In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
df_label = pd.read_csv('../data/label.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')
df_sensor_log = pd.read_csv('../data/sensor_log.csv')

In [3]:
num_features = []

In [4]:
df_sensor_log.head(2)

Unnamed: 0,timestamp,block_id,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09
0,2018-04-01 00:00:00,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353
1,2018-04-01 00:01:00,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353


In [5]:
feat_last = df_sensor_log.groupby('block_id').last().add_prefix('last_').reset_index()
# num_features += list(feat_last.columns)[2:]
feat_last.drop(columns=['block_id'], inplace=True)
feat_last.head(2)

Unnamed: 0,last_timestamp,last_sensor_00,last_sensor_01,last_sensor_02,last_sensor_03,last_sensor_04,last_sensor_05,last_sensor_06,last_sensor_07,last_sensor_08,last_sensor_09
0,2018-04-01 00:09:00,2.445718,47.17882,53.1684,46.397568,641.7823,74.57428,13.38252,16.24711,15.61777,15.11863
1,2018-04-01 00:19:00,2.46441,48.61111,53.1684,46.31076,644.3287,78.49116,13.34635,16.13136,15.70457,15.08247


In [6]:
df_diff = df_sensor_log[df_sensor_log.columns[2:]].diff()
df_diff = df_diff.drop(0)
df_diff['block_id'] = df_sensor_log['block_id']

feat_diff = df_diff.groupby('block_id').mean().add_prefix('diff_').reset_index()
feat_diff.drop(columns=['block_id'], inplace=True)
num_features += list(feat_diff.columns)
feat_diff.head(2)

Unnamed: 0,diff_sensor_00,diff_sensor_01,diff_sensor_02,diff_sensor_03,diff_sensor_04,diff_sensor_05,diff_sensor_06,diff_sensor_07,diff_sensor_08,diff_sensor_09
0,-0.002186,0.009646,-0.004822,0.009645,0.823033,-0.209497,-0.003216,0.012861,0.005627,0.007233
1,0.001869,0.143229,0.0,-0.008681,0.25464,0.391688,-0.003617,-0.011575,0.00868,-0.003616


In [7]:
feat_mean = df_sensor_log.groupby('block_id').mean().add_prefix('mean_').reset_index()
num_features += list(feat_mean.columns)[1:]
feat_mean.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,mean_sensor_09
0,0,2.454966,47.131073,53.18576,46.380206,634.75692,75.8982,13.358651,16.155959,15.698062,15.084637
1,1,2.455064,48.315972,53.203124,46.362845,636.80556,77.24151,13.357204,16.168978,15.709636,15.086083


In [8]:
feat_all = pd.concat([feat_mean, feat_diff], axis=1)

In [9]:
feat_all.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,diff_sensor_00,diff_sensor_01,diff_sensor_02,diff_sensor_03,diff_sensor_04,diff_sensor_05,diff_sensor_06,diff_sensor_07,diff_sensor_08,diff_sensor_09
0,0,2.454966,47.131073,53.18576,46.380206,634.75692,75.8982,13.358651,16.155959,15.698062,...,-0.002186,0.009646,-0.004822,0.009645,0.823033,-0.209497,-0.003216,0.012861,0.005627,0.007233
1,1,2.455064,48.315972,53.203124,46.362845,636.80556,77.24151,13.357204,16.168978,15.709636,...,0.001869,0.143229,0.0,-0.008681,0.25464,0.391688,-0.003617,-0.011575,0.00868,-0.003616


In [10]:
## Split train test
blockid_train = df_label['block_id'].values
_train = feat_all['block_id'].isin(blockid_train)
df_train = feat_all[_train].reset_index(drop=True)
df_test = feat_all[~_train].reset_index(drop=True)

df_train.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,diff_sensor_00,diff_sensor_01,diff_sensor_02,diff_sensor_03,diff_sensor_04,diff_sensor_05,diff_sensor_06,diff_sensor_07,diff_sensor_08,diff_sensor_09
0,144,2.454769,45.694442,52.32205,44.826386,634.57176,79.325703,13.093894,16.158127,15.633679,...,0.000689,-0.008681,0.00434,0.008681,0.39353,0.409471,0.014468,-0.007958,0.007957,-0.002894
1,145,2.454572,45.815969,52.33073,44.900174,632.03701,81.686038,13.027343,16.159576,15.632958,...,0.000492,0.043403,0.0,0.00434,-0.28935,-0.30077,-0.014468,0.007958,0.0,0.0


In [11]:
block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values

x_train, x_test, y_train, y_test = train_test_split(block_id, anomalous, test_size=0.2, random_state=42)

len(x_train), len(x_test)

(4624, 1157)

In [12]:
data_train = df_train[df_train['block_id'].isin(x_train)]
data_val = df_train[df_train['block_id'].isin(x_test)]

data_train_label = df_label[df_label['block_id'].isin(x_train)]
data_val_label = df_label[df_label['block_id'].isin(x_test)]

len(data_train) + len(data_val), len(data_train_label) + len(data_val_label)

(5781, 5781)

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

In [14]:
from sklearn.svm import SVC

In [15]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

KeyError: "['last_sensor_00', 'last_sensor_01', 'last_sensor_02', 'last_sensor_03', 'last_sensor_04', 'last_sensor_05', 'last_sensor_06', 'last_sensor_07', 'last_sensor_08', 'last_sensor_09'] not in index"

In [16]:
clf = make_pipeline(RobustScaler(), SVC(gamma='auto', probability=True))
clf.fit(train, y_train)

NameError: name 'train' is not defined

In [35]:
result = clf.predict_proba(val)

In [36]:
y_pred = 1 - result[:, 0]

In [37]:
val_metric = roc_auc_score(y_val, y_pred)

In [38]:
val_metric

0.5676426183543621