In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [127]:
df_label = pd.read_csv('../data/label.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')
df_sensor_log = pd.read_csv('../data/sensor_log.csv')

In [128]:
num_features = []

In [134]:
df_sensor_log['timestamp'] = df_sensor_log['timestamp'].apply(lambda x: int(str(x).split()[-1].split(':')[0]))

In [135]:
df_sensor_log.head(10)

Unnamed: 0,timestamp,block_id,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09
0,0,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353
1,0,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353
2,0,0,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013
3,0,0,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247
4,0,0,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247
5,0,0,2.453588,47.09201,53.1684,46.397568,637.6157,78.18568,13.41146,16.16753,15.89265,15.16204
6,0,0,2.455556,47.04861,53.1684,46.397568,633.3333,75.81614,13.43316,16.13136,15.65393,15.08247
7,0,0,2.449653,47.13541,53.1684,46.397568,630.6713,75.77331,13.25231,16.12413,16.19647,15.08247
8,0,0,2.463426,47.09201,53.1684,46.397568,631.9444,74.58916,13.28848,16.13136,15.47309,15.11863
9,0,0,2.445718,47.17882,53.1684,46.397568,641.7823,74.57428,13.38252,16.24711,15.61777,15.11863


In [136]:
feat_last = df_sensor_log.groupby('block_id').last().add_prefix('last_').reset_index()
# num_features += list(feat_last.columns)[2:]
feat_last.drop(columns=['block_id'], inplace=True)
feat_last.head(2)

Unnamed: 0,last_timestamp,last_sensor_00,last_sensor_01,last_sensor_02,last_sensor_03,last_sensor_04,last_sensor_05,last_sensor_06,last_sensor_07,last_sensor_08,last_sensor_09
0,0,2.445718,47.17882,53.1684,46.397568,641.7823,74.57428,13.38252,16.24711,15.61777,15.11863
1,0,2.46441,48.61111,53.1684,46.31076,644.3287,78.49116,13.34635,16.13136,15.70457,15.08247


In [137]:
df_diff = df_sensor_log[df_sensor_log.columns[2:]].diff()
df_diff = df_diff.drop(0)
df_diff['block_id'] = df_sensor_log['block_id']

feat_diff = df_diff.groupby('block_id').mean().add_prefix('diff_').reset_index()
feat_diff.drop(columns=['block_id'], inplace=True)
# num_features += list(feat_diff.columns)
feat_diff.head(2)

Unnamed: 0,diff_sensor_00,diff_sensor_01,diff_sensor_02,diff_sensor_03,diff_sensor_04,diff_sensor_05,diff_sensor_06,diff_sensor_07,diff_sensor_08,diff_sensor_09
0,-0.002186,0.009646,-0.004822,0.009645,0.823033,-0.209497,-0.003216,0.012861,0.005627,0.007233
1,0.001869,0.143229,0.0,-0.008681,0.25464,0.391688,-0.003617,-0.011575,0.00868,-0.003616


In [138]:
feat_mean = df_sensor_log.groupby('block_id').mean().add_prefix('mean_').reset_index()
num_features += list(feat_mean.columns)[1:]
feat_mean.head(2)

Unnamed: 0,block_id,mean_timestamp,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,mean_sensor_09
0,0,0.0,2.454966,47.131073,53.18576,46.380206,634.75692,75.8982,13.358651,16.155959,15.698062,15.084637
1,1,0.0,2.455064,48.315972,53.203124,46.362845,636.80556,77.24151,13.357204,16.168978,15.709636,15.086083


In [139]:
feat_all = pd.concat([feat_mean], axis=1)

In [140]:
feat_all.head(2)

Unnamed: 0,block_id,mean_timestamp,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,mean_sensor_09
0,0,0.0,2.454966,47.131073,53.18576,46.380206,634.75692,75.8982,13.358651,16.155959,15.698062,15.084637
1,1,0.0,2.455064,48.315972,53.203124,46.362845,636.80556,77.24151,13.357204,16.168978,15.709636,15.086083


In [142]:
feat_all = pd.get_dummies(feat_all, columns=['mean_timestamp'])

In [202]:
import numpy as np
def denoise(df):
    for col in df.columns:
        if col not in ['timestamp','block_id']:
            df[col] = np.floor(df[col]*100) / 100
    return df

feat_all = denoise(feat_all)

In [203]:
## Split train test
blockid_train = df_label['block_id'].values
_train = feat_all['block_id'].isin(blockid_train)
df_train = feat_all[_train].reset_index(drop=True)
df_test = feat_all[~_train].reset_index(drop=True)

df_train.head(2)

num_features = list(df_train.columns)[1:]

In [204]:
block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values

x_train, x_test, y_train, y_test = train_test_split(block_id, anomalous, test_size=0.2, random_state=42)

len(x_train), len(x_test)

(4624, 1157)

In [205]:
data_train = df_train[df_train['block_id'].isin(x_train)]
data_val = df_train[df_train['block_id'].isin(x_test)]

data_train_label = df_label[df_label['block_id'].isin(x_train)]
data_val_label = df_label[df_label['block_id'].isin(x_test)]

len(data_train) + len(data_val), len(data_train_label) + len(data_val_label)

(5781, 5781)

In [206]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

In [207]:
from sklearn.svm import SVC

In [208]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

In [209]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
clf.fit(train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', probability=True))])

In [210]:
result = clf.predict_proba(val)

In [211]:
y_pred = result[:, 1]

In [212]:
val_metric = roc_auc_score(y_val, y_pred)

In [213]:
val_metric

0.5915291707106654

In [160]:
df_sensor_log['timestamp'] = df_sensor_log['timestamp'].apply(lambda x: str(x).split()[-1].split(':')[0])

In [161]:
df_sensor_log = pd.get_dummies(df_sensor_log, columns=['timestamp'])

In [162]:
df_sensor_log.columns

Index(['block_id', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03',
       'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08',
       'sensor_09', 'timestamp_0', 'timestamp_1', 'timestamp_10',
       'timestamp_11', 'timestamp_12', 'timestamp_13', 'timestamp_14',
       'timestamp_15', 'timestamp_16', 'timestamp_17', 'timestamp_18',
       'timestamp_19', 'timestamp_2', 'timestamp_20', 'timestamp_21',
       'timestamp_22', 'timestamp_23', 'timestamp_3', 'timestamp_4',
       'timestamp_5', 'timestamp_6', 'timestamp_7', 'timestamp_8',
       'timestamp_9'],
      dtype='object')

In [163]:
data = pd.read_csv('../analysis/train.csv')

In [116]:
len(data)

46240

In [118]:
print("Loading data")
# train = pd.read_csv('analysis/pct_rank/train_pct_rank.csv')[['block_id', 'timestamp', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_07', 'sensor_09']]
# val = pd.read_csv('analysis/pct_rank/val_pct_rank.csv')[['block_id', 'timestamp', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_07', 'sensor_09']]
train = pd.read_csv('../analysis/train.csv')
val = pd.read_csv('../analysis/val.csv')
target_train = pd.read_csv('../analysis/train_label.csv')['anomalous'].to_numpy()
target_val = pd.read_csv('../analysis/val_label.csv')['anomalous'].to_numpy()

# Add feature
# train_suffle = pd.read_csv('analysis/suffle/train_suffle_100.csv')
# target_train_shuffle = pd.read_csv('analysis/suffle/train_label_suffle_100.csv')

# train_new_data = pd.read_csv('analysis/new_data/new_data.csv')
# target_new_data = pd.read_csv('analysis/new_data/df_new_data_label.csv')

# Combine
# train = pd.concat([train, train_new_data], axis=0)
# target_train = pd.concat([target_train,target_new_data], axis=0)

# target_train = target_train['anomalous'].to_numpy()

# target_train = target_train[:, None]
# target_train = np.repeat(target_train, 10, axis=1)

# target_val = target_val[:, None]
# target_val = np.repeat(target_val, 10, axis=1)

# target_test = pd.read_csv('val_labels.csv')['anomalous'].to_numpy()
#exit()

# train = pd.get_dummies(train, columns=['timestamp'])
# val = pd.get_dummies(val, columns=['timestamp'])

Loading data


In [119]:
train

Unnamed: 0,timestamp,block_id,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09
0,2018-04-02 00:00:00,144,2.459491,45.833330,52.34375,44.791664,639.583300,78.57581,13.09317,16.16753,15.49479,15.11863
1,2018-04-02 00:01:00,144,2.459491,45.833330,52.34375,44.791664,639.583300,78.57581,13.09317,16.16753,15.49479,15.11863
2,2018-04-02 00:02:00,144,2.449653,45.746520,52.30035,44.791664,634.838000,79.06493,13.12211,16.13136,15.56713,15.11863
3,2018-04-02 00:03:00,144,2.464410,45.833330,52.30035,44.791664,632.175900,80.07732,13.05700,16.13136,15.49479,15.08247
4,2018-04-02 00:04:00,144,2.444734,45.616320,52.34375,44.791664,638.310200,77.70200,13.14381,16.13136,15.81308,15.11863
...,...,...,...,...,...,...,...,...,...,...,...,...
46235,2018-08-27 23:55:00,18709,2.475232,48.350693,49.04514,42.968750,624.421265,72.64686,14.67737,16.65220,15.53096,15.05353
46236,2018-08-27 23:56:00,18709,2.468345,48.350693,49.00174,42.968750,627.314800,73.31248,14.64844,16.65220,15.56713,15.08247
46237,2018-08-27 23:57:00,18709,2.471296,48.350693,49.04514,42.968750,631.365700,74.63629,14.60503,16.60880,15.56713,15.05353
46238,2018-08-27 23:58:00,18709,2.471296,48.350693,49.04514,42.968750,631.365700,76.36526,14.60503,16.60880,15.81308,15.08247
