In [731]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

### Load data

In [758]:
df_label = pd.read_csv('../data/label.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')
df_sensor_log = pd.read_csv('../data/sensor_log.csv')

df_new_data = pd.read_csv('../analysis/new_data/new_data.csv')
df_new_label = pd.read_csv('../analysis/new_data/df_new_data_label.csv')

In [759]:
df_sensor_log = pd.concat([df_sensor_log, df_new_data], axis=0)
df_label = pd.concat([df_label, df_new_label], axis=0)
df_sensor_log.head()

Unnamed: 0,timestamp,block_id,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09
0,2018-04-01 00:00:00,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353
1,2018-04-01 00:01:00,0,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353
2,2018-04-01 00:02:00,0,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013
3,2018-04-01 00:03:00,0,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247
4,2018-04-01 00:04:00,0,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247


### Process noise

In [760]:
import numpy as np
def denoise(df):
    for col in df.columns:
        if col not in ['timestamp','block_id']:
            df[col] = np.floor(df[col]*100) / 100
    return df

df_sensor_log = denoise(df_sensor_log)

In [761]:
## feature for trainning
num_features = []

### Function process data

In [762]:
num_feat = list(df_sensor_log.columns)[2:]
print(num_feat)

['sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09']


In [763]:
def get_last(df, num_feat):
    df_last = df.groupby('block_id')[num_feat].last().add_prefix('last_')

    return df_last

def get_diff(df, num_feat):
    diff = df_sensor_log.groupby('block_id')[num_feat].diff().add_prefix('tmp_')
    diff['block_id'] = df['block_id'].values
    diff.dropna(inplace=True)

    num_new_feat = list(diff.columns)[:-1]

    df_diff = diff.groupby('block_id')[num_new_feat].mean().add_prefix('diff_')

    return df_diff

def get_mean(df, num_feat):
    df_mean = df.groupby('block_id')[num_feat].mean().add_prefix('mean_')

    return df_mean

def get_sum(df, num_feat):
    df_sum = df.groupby('block_id')[num_feat].sum().add_prefix('sum_')

    return df_sum

def get_std(df, num_feat):
    df_std = df.groupby('block_id')[num_feat].std().add_prefix('std_')

    return df_std

def get_max(df, num_feat):
    df_max = df.groupby('block_id')[num_feat].max().add_prefix('max_')

    return df_max

def get_min(df, num_feat):
    df_min = df.groupby('block_id')[num_feat].min().add_prefix('min_')

    return df_min
    

In [764]:
df_mean = get_mean(df_sensor_log, num_feat)
df_mean.head(2)

Unnamed: 0_level_0,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,mean_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.45,47.127,53.18,46.374,634.752,75.891,13.355,16.152,15.692,15.081
1,2.45,48.312,53.196,46.358,636.8,77.237,13.352,16.164,15.705,15.081


In [765]:
df_std = get_std(df_sensor_log, num_feat)
df_std.head(2)

Unnamed: 0_level_0,std_sensor_00,std_sensor_01,std_sensor_02,std_sensor_03,std_sensor_04,std_sensor_05,std_sensor_06,std_sensor_07,std_sensor_08,std_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.009428,0.085641,0.02582,0.033731,4.059994,1.355536,0.062583,0.06391,0.207246,0.040675
1,0.009428,0.357454,0.171218,0.041312,5.398378,1.692658,0.037357,0.027162,0.145316,0.047714


In [766]:
df_max = get_max(df_sensor_log, num_feat)
df_max.head(2)

Unnamed: 0_level_0,max_sensor_00,max_sensor_01,max_sensor_02,max_sensor_03,max_sensor_04,max_sensor_05,max_sensor_06,max_sensor_07,max_sensor_08,max_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.46,47.35,53.21,46.39,641.78,78.18,13.43,16.24,16.19,15.16
1,2.46,48.61,53.68,46.39,644.32,80.65,13.41,16.21,15.89,15.16


In [767]:
df_min = get_min(df_sensor_log, num_feat)
df_min.head(2)

Unnamed: 0_level_0,min_sensor_00,min_sensor_01,min_sensor_02,min_sensor_03,min_sensor_04,min_sensor_05,min_sensor_06,min_sensor_07,min_sensor_08,min_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.44,47.04,53.16,46.31,628.12,73.54,13.25,16.03,15.47,15.01
1,2.44,47.48,53.12,46.31,630.09,74.58,13.3,16.13,15.45,15.01


In [768]:
df_sum = get_sum(df_sensor_log, num_feat)
df_sum.head(2)

Unnamed: 0_level_0,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
block_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,24.5,471.27,531.8,463.74,6347.52,758.91,133.55,161.52,156.92,150.81
1,24.5,483.12,531.96,463.58,6368.0,772.37,133.52,161.64,157.05,150.81


In [769]:
df_all = pd.concat([df_mean, df_std, df_max, df_min, df_sum], axis=1).reset_index()
df_all.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
0,0,2.45,47.127,53.18,46.374,634.752,75.891,13.355,16.152,15.692,...,24.5,471.27,531.8,463.74,6347.52,758.91,133.55,161.52,156.92,150.81
1,1,2.45,48.312,53.196,46.358,636.8,77.237,13.352,16.164,15.705,...,24.5,483.12,531.96,463.58,6368.0,772.37,133.52,161.64,157.05,150.81


In [770]:
# df_all = denoise(df_all)

### Split data train, test

In [771]:
## Split train test
blockid_train = df_label['block_id'].values
_train = df_all['block_id'].isin(blockid_train)
df_train = df_all[_train].reset_index(drop=True)
df_test = df_all[~_train].reset_index(drop=True)

df_train.head(2)

Unnamed: 0,block_id,mean_sensor_00,mean_sensor_01,mean_sensor_02,mean_sensor_03,mean_sensor_04,mean_sensor_05,mean_sensor_06,mean_sensor_07,mean_sensor_08,...,sum_sensor_00,sum_sensor_01,sum_sensor_02,sum_sensor_03,sum_sensor_04,sum_sensor_05,sum_sensor_06,sum_sensor_07,sum_sensor_08,sum_sensor_09
0,144,2.449,45.689,52.32,44.822,634.568,79.32,13.089,16.155,15.629,...,24.49,456.89,523.2,448.22,6345.68,793.2,130.89,161.55,156.29,150.98
1,145,2.449,45.809,52.328,44.895,632.032,81.681,13.02,16.157,15.627,...,24.49,458.09,523.28,448.95,6320.32,816.81,130.2,161.57,156.27,150.74


In [772]:
block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values

x_train, x_test, y_train, y_test = train_test_split(block_id, anomalous, test_size=0.2, random_state=42)

len(x_train), len(x_test)

(5424, 1357)

In [774]:
data_train = df_train[df_train['block_id'].isin(x_train)]
data_val = df_train[df_train['block_id'].isin(x_test)]

data_train_label = df_label[df_label['block_id'].isin(x_train)]
data_val_label = df_label[df_label['block_id'].isin(x_test)]

len(data_train) + len(data_val), len(data_train_label) + len(data_val_label)

(6781, 6781)

### Trainning

In [775]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC

In [776]:
num_features = df_all.columns.to_list()[1:]

In [777]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

In [786]:
train_all = np.concatenate([train, val], axis=0)
y_all = np.concatenate([y_train, y_val], axis=0)

In [787]:
# clf = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
# clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=15, random_state=42, n_estimators=100))
clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=15, random_state=42, n_estimators=200, ccp_alpha=0.0002, criterion="entropy", bootstrap=True))
# clf.fit(train, y_train)
clf.fit(train_all, y_all)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(ccp_alpha=0.0002, criterion='entropy',
                                        max_depth=15, n_estimators=200,
                                        random_state=42))])

In [788]:
# clf = make_pipeline(StandardScaler(), SVC(gamma='scale', probability=True, C=1, kernel='sigmoid'))
# clf.fit(train, y_train)

In [789]:
result = clf.predict_proba(val)

In [790]:
y_pred = result[:, 1]
# y_pred

In [791]:
val_metric = roc_auc_score(y_val, y_pred)

In [792]:
val_metric

1.0

In [793]:
## Generate result

test = df_test[num_features].values
result = clf.predict_proba(test)
anomalous = result[:, 1]
df_sub['anomaly_score'] = anomalous
df_sub.to_csv('submission_11.csv', index=False)

### Kfold

In [794]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
kf.get_n_splits(block_id)
clf = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=15, random_state=42, n_estimators=200, ccp_alpha=0.0002, criterion="entropy", bootstrap=True))

num_features = df_all.columns.to_list()[1:]
result_anomalous = []

block_id = df_label['block_id'].values
anomalous = df_label['anomalous'].values
f = open("log.txt", 'w')
for i, (train_index, test_index) in enumerate(kf.split(block_id)):
    print("Fflod: ", i)

    x_train, x_test, y_train, y_test = block_id[train_index], block_id[test_index], anomalous[train_index], anomalous[test_index]
    
    data_train = df_train[df_train['block_id'].isin(x_train)]
    data_val = df_train[df_train['block_id'].isin(x_test)]

    data_train_label = df_label[df_label['block_id'].isin(x_train)]
    data_val_label = df_label[df_label['block_id'].isin(x_test)]

    train = data_train[num_features].values
    val = data_val[num_features].values

    y_train = data_train_label['anomalous'].values
    y_val = data_val_label['anomalous'].values
    
    clf.fit(train, y_train)

    result_val = clf.predict_proba(val)
    
    y_pred = result_val[:, 1]
    try:
        val_metric = roc_auc_score(y_val, y_pred)
    except:
        continue

    if val_metric > 0.8:
        test = df_test[num_features].values
        result_test = clf.predict_proba(test)
        anomalous_test = result_test[:, 1]
        result_anomalous.append(anomalous_test)

    print("Val metric: ", val_metric)
    f.write(f"Fold {i}: val metric: {val_metric}\n")
f.close()


Fflod:  0
Val metric:  0.7134860050890585
Fflod:  1
Val metric:  0.842929292929293
Fflod:  2
Val metric:  0.8472746578574799
Fflod:  3
Val metric:  0.9124811178247734
Fflod:  4
Val metric:  0.8258431965827179
Fflod:  5
Val metric:  0.9236216012084593
Fflod:  6
Val metric:  0.8502269288956128
Fflod:  7
Val metric:  0.886535552193646
Fflod:  8
Val metric:  0.9964753492015144
Fflod:  9


In [795]:
len(result_anomalous), result_anomalous[0].shape

(8, (13502,))

In [796]:
anomalous_score = np.vstack(result_anomalous)
anomalous_score_mean = np.mean(anomalous_score, axis=0)
anomalous_score_mean.shape

(13502,)

In [797]:
df_sub['anomaly_score'] = anomalous_score_mean
df_sub.to_csv('submission_12.csv', index=False)

#### Gird search

In [262]:
train = data_train[num_features].values
val = data_val[num_features].values

y_train = data_train_label['anomalous'].values
y_val = data_val_label['anomalous'].values

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':[50, 100, 200],
              'max_depth': [5, 15, 30, 40],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 9],
              'max_features': [5, 15, 25],
#               'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3],
#               'bootstrap': [True, False],
#               'ccp_alpha': [0.0, 0.1, 0.2],
#               'max_samples': [0.2, 0]
              }
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, parameters, verbose=3, n_jobs=6)

clf.fit(train, y_train)