In [219]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

### Load data

In [220]:
df_label = pd.read_csv('../data/label.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')
df_sensor_log = pd.read_csv('../data/sensor_log.csv')

# df_new_data = pd.read_csv('../analysis/new_data/new_data.csv')
# df_new_label = pd.read_csv('../analysis/new_data/df_new_data_label.csv')

In [221]:
# df_sensor_log = pd.concat([df_sensor_log, df_new_data], axis=0)
# df_label = pd.concat([df_label, df_new_label], axis=0)
# df_sensor_log.head()

### Process noise

In [222]:
import numpy as np
def denoise(df):
    for col in df.columns:
        if col not in ['timestamp','block_id']:
            df[col] = np.floor(df[col]*100) / 100
    return df

df_sensor_log = denoise(df_sensor_log)

In [223]:
## feature for trainning
num_features = []

### Function process data

In [224]:
num_feat = list(df_sensor_log.columns)[2:]
print(num_feat)

['sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09']


In [225]:
def get_last(df, num_feat):
    df_last = df.groupby('block_id')[num_feat].last().add_prefix('last_')

    return df_last

def get_diff(df, num_feat):
    diff = df_sensor_log.groupby('block_id')[num_feat].diff().add_prefix('tmp_')
    diff['block_id'] = df['block_id'].values
    diff.dropna(inplace=True)

    num_new_feat = list(diff.columns)[:-1]

    df_diff = diff.groupby('block_id')[num_new_feat].mean().add_prefix('diff_')

    return df_diff

def get_mean(df, num_feat):
    df_mean = df.groupby('block_id')[num_feat].mean().add_prefix('mean_')

    return df_mean

def get_sum(df, num_feat):
    df_sum = df.groupby('block_id')[num_feat].sum().add_prefix('sum_')

    return df_sum

def get_std(df, num_feat):
    df_std = df.groupby('block_id')[num_feat].std().add_prefix('std_')

    return df_std

def get_max(df, num_feat):
    df_max = df.groupby('block_id')[num_feat].max().add_prefix('max_')

    return df_max

def get_min(df, num_feat):
    df_min = df.groupby('block_id')[num_feat].min().add_prefix('min_')

    return df_min
    

In [226]:
df_mean = get_mean(df_sensor_log, num_feat)
df_mean.head(2)

Unnamed: 0_level_0,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,mean_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.45,47.127,53.18,46.374,634.752,75.891,13.355,16.152,15.692,15.081
1,2.45,48.312,53.196,46.358,636.8,77.237,13.352,16.164,15.705,15.081


In [227]:
df_std = get_std(df_sensor_log, num_feat)
df_std.head(2)

Unnamed: 0_level_0,std_sensor_00,std_sensor_01,std_sensor_02,std_sensor_03,std_sensor_04,std_sensor_05,std_sensor_06,std_sensor_07,std_sensor_08,std_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.009428,0.085641,0.02582,0.033731,4.059994,1.355536,0.062583,0.06391,0.207246,0.040675
1,0.009428,0.357454,0.171218,0.041312,5.398378,1.692658,0.037357,0.027162,0.145316,0.047714


In [228]:
df_max = get_max(df_sensor_log, num_feat)
df_max.head(2)

Unnamed: 0_level_0,max_sensor_00,max_sensor_01,max_sensor_02,max_sensor_03,max_sensor_04,max_sensor_05,max_sensor_06,max_sensor_07,max_sensor_08,max_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.46,47.35,53.21,46.39,641.78,78.18,13.43,16.24,16.19,15.16
1,2.46,48.61,53.68,46.39,644.32,80.65,13.41,16.21,15.89,15.16


In [229]:
df_min = get_min(df_sensor_log, num_feat)
df_min.head(2)

Unnamed: 0_level_0,min_sensor_00,min_sensor_01,min_sensor_02,min_sensor_03,min_sensor_04,min_sensor_05,min_sensor_06,min_sensor_07,min_sensor_08,min_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.44,47.04,53.16,46.31,628.12,73.54,13.25,16.03,15.47,15.01
1,2.44,47.48,53.12,46.31,630.09,74.58,13.3,16.13,15.45,15.01


In [230]:
df_sum = get_sum(df_sensor_log, num_feat)
df_sum.head(2)

Unnamed: 0_level_0,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,24.5,471.27,531.8,463.74,6347.52,758.91,133.55,161.52,156.92,150.81
1,24.5,483.12,531.96,463.58,6368.0,772.37,133.52,161.64,157.05,150.81


In [231]:
df_all = pd.concat([df_mean, df_std, df_max, df_min, df_sum], axis=1).reset_index()
df_all.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
0,0,2.45,47.127,53.18,46.374,634.752,75.891,13.355,16.152,15.692,...,24.5,471.27,531.8,463.74,6347.52,758.91,133.55,161.52,156.92,150.81
1,1,2.45,48.312,53.196,46.358,636.8,77.237,13.352,16.164,15.705,...,24.5,483.12,531.96,463.58,6368.0,772.37,133.52,161.64,157.05,150.81


In [232]:
# df_all = denoise(df_all)

### Split data train, test

In [233]:
## Split train test
blockid_train = df_label['block_id'].values
_train = df_all['block_id'].isin(blockid_train)
df_train = df_all[_train].reset_index(drop=True)
df_test = df_all[~_train].reset_index(drop=True)

df_train.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
0,144,2.449,45.689,52.32,44.822,634.568,79.32,13.089,16.155,15.629,...,24.49,456.89,523.2,448.22,6345.68,793.2,130.89,161.55,156.29,150.98
1,145,2.449,45.809,52.328,44.895,632.032,81.681,13.02,16.157,15.627,...,24.49,458.09,523.28,448.95,6320.32,816.81,130.2,161.57,156.27,150.74


In [234]:
block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values

x_train, x_test, y_train, y_test = train_test_split(block_id, anomalous, test_size=0.2, random_state=42)

len(x_train), len(x_test)

(4624, 1157)

In [235]:
data_train = df_train[df_train['block_id'].isin(x_train)]
data_val = df_train[df_train['block_id'].isin(x_test)]

data_train_label = df_label[df_label['block_id'].isin(x_train)]
data_val_label = df_label[df_label['block_id'].isin(x_test)]

len(data_train) + len(data_val), len(data_train_label) + len(data_val_label)

(5781, 5781)

### Trainning

In [240]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC

In [241]:
num_features = df_all.columns.to_list()[1:]

In [242]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

In [243]:
# clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=15, random_state=42, verbose=10, n_estimators=100))
clf.fit(train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.9s finished


In [244]:
result = clf.predict_proba(val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [245]:
y_pred = result[:, 1]
y_pred

array([0.00896137, 0.05365669, 0.00984985, ..., 0.0194127 , 0.00527502,
       0.01310197])

In [246]:
val_metric = roc_auc_score(y_val, y_pred)

In [247]:
val_metric

0.9328157014989755

In [239]:
## Generate result

test = df_test[num_features].values
result = clf.predict_proba(test)
anomalous = result[:, 1]
df_sub['anomaly_score'] = anomalous
df_sub.to_csv('submission_gb.csv', index=False)

### Kfold

In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
kf.get_n_splits(block_id)
clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=15, random_state=42, verbose=10, n_estimators=100))

num_features = df_all.columns.to_list()[1:]
result_anomalous = []

block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values
f = open("log.txt", 'w')
for i, (train_index, test_index) in enumerate(kf.split(block_id)):
    print("Fflod: ", i)

    x_train, x_test, y_train, y_test = block_id[train_index], block_id[test_index], anomalous[train_index], anomalous[test_index]
    
    data_train = df_train[df_train['block_id'].isin(x_train)]
    data_val = df_train[df_train['block_id'].isin(x_test)]

    data_train_label = df_label[df_label['block_id'].isin(x_train)]
    data_val_label = df_label[df_label['block_id'].isin(x_test)]

    train = data_train[num_features].values
    val = data_val[num_features].values

    y_train = data_train_label['anomalous'].values
    y_val = data_val_label['anomalous'].values
    
    clf.fit(train, y_train)

    result_val = clf.predict_proba(val)
    
    y_pred = result_val[:, 1]
    try:
        val_metric = roc_auc_score(y_val, y_pred)
    except:
        continue

    if val_metric > 0.8:
        test = df_test[num_features].values
        result_test = clf.predict_proba(test)
        anomalous_test = result_test[:, 1]
        result_anomalous.append(anomalous_test)

    print("Val metric: ", val_metric)
    f.write(f"Fold {i}: val metric: {val_metric}\n")
f.close()


In [67]:
len(result_anomalous), result_anomalous[0].shape

(6, (13502,))

In [68]:
anomalous_score = np.vstack(result_anomalous)
anomalous_score_mean = np.mean(anomalous_score, axis=0)
anomalous_score_mean.shape

(13502,)

In [69]:
df_sub['anomaly_score'] = anomalous_score_mean
df_sub.to_csv('submission_06.csv', index=False)

#### Gird search

In [248]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

In [251]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[50, 100, 200, 300, 400, 500],
              'max_depth': [5, 10, 20, 30, 35, 40, 45, 50],
              'min_samples_split': [2, 5, 10, 25, 30, 35, 40],


              
              'min_samples_leaf': [1, 3,  5,  7, 9],
              'max_features': [5, 10, 15, 20, 25, 30, 45, 50],
              'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3],
              'bootstrap': [True, False],
              'ccp_alpha': [0.0, 0.1, 0.2, 0.3],
              'max_samples': [0.1, 0.2, 0.4, 0.6, 0.8]
              }
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, parameters, verbose=3)

clf.fit(train, y_train)

Fitting 5 folds for each of 2150400 candidates, totalling 10752000 fits
[CV 1/5] END bootstrap=True, ccp_alpha=0.0, max_depth=5, max_features=5, max_samples=0.1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50;, score=0.972 total time=   0.0s
[CV 2/5] END bootstrap=True, ccp_alpha=0.0, max_depth=5, max_features=5, max_samples=0.1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50;, score=0.972 total time=   0.0s
[CV 3/5] END bootstrap=True, ccp_alpha=0.0, max_depth=5, max_features=5, max_samples=0.1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50;, score=0.972 total time=   0.0s
[CV 4/5] END bootstrap=True, ccp_alpha=0.0, max_depth=5, max_features=5, max_samples=0.1, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=50;, score=0.971 total time=   0.0s
[CV 5/5] END bootstrap=True, ccp_alpha=0.0, max_depth=5, max_features=5, max_samples=0.1

KeyboardInterrupt: 