In [1]:
import numpy as np
import pandas as pd
import gc

# Read the train and test CSVs, then put each frame in ascending `time` order
# with a fresh 0..n-1 index so that later position-based slicing is well defined.
df = pd.read_csv("../input/remove-trends-giba/train_clean_giba.csv")
df = df.sort_values("time").reset_index(drop=True)

test_df = pd.read_csv("../input/remove-trends-giba/test_clean_giba.csv")
test_df = test_df.sort_values("time").reset_index(drop=True)

In [2]:
# Segment ids derived purely from row position:
# `group` changes every 500k rows, `batch` every 100k rows.
row_number = np.arange(len(df))
df["group"] = row_number // 500_000
df["batch"] = row_number // 100_000

# Row count per group.
df["group"].value_counts()

9    500000
8    500000
7    500000
6    500000
5    500000
4    500000
3    500000
2    500000
1    500000
0    500000
Name: group, dtype: int64

In [3]:
# `category` selects which of the two models a row is routed to:
# 0 by default, 1 for the row ranges listed below.
df["category"] = 0
test_df["category"] = 0

# Train row ranges with more than 9 open-channel classes.
# `.loc` slicing is label based and inclusive on both ends, hence the explicit last row.
for first_row, last_row in [(2_000_000, 2_499_999), (4_500_000, 4_999_999)]:
    df.loc[first_row:last_row, "category"] = 1

# Test row ranges that (potentially) have more than 9 open-channel classes.
for first_row, last_row in [(500_000, 599_999), (700_000, 799_999)]:
    test_df.loc[first_row:last_row, "category"] = 1

In [4]:
TARGET = "open_channels"

# Build a synthetic segment (group 10, category 1) by adding, row by row, the
# signal and the open-channel count of group 8 onto a copy of group 5.
aug_df = df[df["group"] == 5].copy()
aug_df["category"] = 1
aug_df["group"] = 10

# Select the donor rows once; `.values` drops the index so the addition is
# positional rather than label-aligned.
donor_df = df[df["group"] == 8]
for col in ["signal", TARGET]:
    aug_df[col] += donor_df[col].values

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the equivalent that works on both old and new versions.
df = pd.concat([df, aug_df], sort=False).reset_index(drop=True)

del aug_df, donor_df
gc.collect()

0

In [5]:
# Per-group mean and standard deviation of the signal.
# The functions are passed as a list (not a set) so the column order of the
# result is fixed: a set has no defined iteration order.
df.groupby("group")["signal"].agg(["mean", "std"])

Unnamed: 0_level_0,std,mean
group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.2622,0.031618
1,0.27516,0.03845
2,0.471509,0.74792
3,0.801176,2.147772
4,1.452549,7.129406
5,1.021415,3.576812
6,0.468368,0.753886
7,0.811309,2.13494
8,1.026026,3.570526
9,1.453593,7.094024


In [6]:
# Recompute the 100k-row batch id from the current row positions of each frame.
df['batch'] = np.arange(len(df)) // 100_000
test_df['batch'] = np.arange(len(test_df)) // 100_000

# Lag/lead features: for every offset 1..20 add the signal value that many rows
# earlier (`pos`) and later (`neg`) within the same batch. Positions that fall
# outside the batch have no neighbour and are filled with the constant -3.
shift_sizes = np.arange(1, 21)
for frame in (df, test_df):
    for shift_size in shift_sizes:
        signal_by_batch = frame.groupby('batch')['signal']
        frame[f'signal_shift_pos_{shift_size}'] = signal_by_batch.shift(shift_size).fillna(-3)
        frame[f'signal_shift_neg_{shift_size}'] = signal_by_batch.shift(-shift_size).fillna(-3)

In [7]:
# Columns that must not be fed to the model (time, segment ids, label, routing flag).
# Names that do not exist in `df` are simply ignored.
remove_fea = [
    'time', 'batch', 'batch_index', 'batch_slices', 'batch_slices2',
    'group', "open_channels", "type", "category",
]

# Every remaining column, in the frame's own column order, is a model input.
features = df.columns[~df.columns.isin(remove_fea)].tolist()
df[features].head()

Unnamed: 0,signal,signal_shift_pos_1,signal_shift_neg_1,signal_shift_pos_2,signal_shift_neg_2,signal_shift_pos_3,signal_shift_neg_3,signal_shift_pos_4,signal_shift_neg_4,signal_shift_pos_5,...,signal_shift_pos_16,signal_shift_neg_16,signal_shift_pos_17,signal_shift_neg_17,signal_shift_pos_18,signal_shift_neg_18,signal_shift_pos_19,signal_shift_neg_19,signal_shift_pos_20,signal_shift_neg_20
0,-0.03651,-3.0,-0.113152,-3.0,0.24539,-3.0,-0.341122,-3.0,-0.350929,-3.0,...,-3.0,-0.170745,-3.0,0.006632,-3.0,0.314478,-3.0,-0.225048,-3.0,-0.385705
1,-0.113152,-0.03651,0.24539,-3.0,-0.341122,-3.0,-0.350929,-3.0,0.057489,-3.0,...,-3.0,0.006632,-3.0,0.314478,-3.0,-0.225048,-3.0,-0.385705,-3.0,-0.270212
2,0.24539,-0.113152,-0.341122,-0.03651,-0.350929,-3.0,0.057489,-3.0,0.011333,-3.0,...,-3.0,0.314478,-3.0,-0.225048,-3.0,-0.385705,-3.0,-0.270212,-3.0,-0.11953
3,-0.341122,0.24539,-0.350929,-0.113152,0.057489,-0.03651,0.011333,-3.0,0.095803,-3.0,...,-3.0,-0.225048,-3.0,-0.385705,-3.0,-0.270212,-3.0,-0.11953,-3.0,-0.396299
4,-0.350929,-0.341122,0.057489,0.24539,0.011333,-0.113152,0.095803,-0.03651,0.035858,-3.0,...,-3.0,-0.385705,-3.0,-0.270212,-3.0,-0.11953,-3.0,-0.396299,-3.0,0.119804


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GroupKFold

# Grouped K-fold training of one random forest per `category`.
#
# Results:
#   oof_preds[i, j] - out-of-fold probability that training row i has j open channels
#   y_test[i, j]    - the same probability for test row i, averaged over the folds
# Both arrays have 11 columns, one per class label 0..10; classes a given model
# never saw during training keep probability 0.
NUM_FOLDS = 5
oof_preds = np.zeros((len(df), 11))
y_test = np.zeros((len(test_df), 11))

target = "open_channels"
# Cross-validation groups are consecutive 4000-row chunks; GroupKFold keeps all
# rows of a chunk in the same fold. This overwrites the earlier 500k-row `group`.
# NOTE(review): the augmented rows appended earlier are sums of existing training
# rows and are not tied to their source rows by this grouping - confirm that the
# resulting train/validation overlap is acceptable.
df['group'] = np.arange(df.shape[0])//4000
group = df['group']
kf = GroupKFold(n_splits=NUM_FOLDS)
splits = list(kf.split(df, df["open_channels"], group))

# The test frame does not change between folds, so compute its routing once.
test_category = test_df["category"].values

for train_ind, val_ind in splits:
    train_df, val_df = df.iloc[train_ind], df.iloc[val_ind]
    print(len(train_df), len(val_df))
    val_category = val_df["category"].values

    for cat in range(2):
        model = RandomForestClassifier(
                n_estimators=150,
                max_samples=0.5,
                max_depth=17,
                max_features=10,
                min_samples_leaf=10,
                random_state=42,
                n_jobs=-1,
                verbose=1
            )

        fit_df = train_df[train_df["category"] == cat]
        model.fit(fit_df[features], fit_df[TARGET].to_numpy())

        # `predict_proba` returns one column per entry of `model.classes_`, in
        # that order. Writing through the class labels themselves places every
        # probability in the column of its label even when some label (e.g. 0)
        # is absent from this fold, so no training label has to be overwritten
        # to force the classes to start at 0.
        class_cols = model.classes_.astype(int)

        val_mask = val_category == cat
        pred = model.predict_proba(val_df.loc[val_mask, features])
        oof_preds[np.ix_(val_ind[val_mask], class_cols)] = pred

        test_mask = test_category == cat
        test_pred = model.predict_proba(test_df.loc[test_mask, features])
        y_test[np.ix_(np.flatnonzero(test_mask), class_cols)] += test_pred / NUM_FOLDS

4400000 1100000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 40.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    7.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel

4400000 1100000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 39.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    6.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel

4400000 1100000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 39.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    7.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.4s
[Parallel

4400000 1100000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 39.5min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    7.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.4s
[Parallel

4400000 1100000


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 39.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    7.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   13.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel

In [9]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the most probable out-of-fold class over every row of
# the training frame (the appended augmented rows included).
oof_labels = oof_preds.argmax(axis=1)
f1_score(df["open_channels"], oof_labels, average="macro")

0.9395655370595182

In [10]:
# Same metric restricted to the first 5,000,000 rows, i.e. the rows that were
# in the training frame before the augmented segment was appended.
n_original_rows = 5_000_000
oof_f1 = f1_score(
    df["open_channels"].iloc[:n_original_rows],
    oof_preds[:n_original_rows].argmax(axis=1),
    average="macro",
)
oof_f1

0.9405664583930242

In [11]:
# Hard test prediction = class with the highest fold-averaged probability.
test_df[TARGET] = y_test.argmax(axis=1)

# Share of each predicted class among the first 600k test rows.
# NOTE(review): presumably the publicly scored part of the test set - confirm.
head_counts = test_df[TARGET].iloc[:600_000].value_counts()
head_counts / 600_000

0     0.349160
1     0.187025
3     0.113237
2     0.091458
4     0.067930
5     0.047083
7     0.043292
8     0.040782
6     0.030890
9     0.022967
10    0.006177
Name: open_channels, dtype: float64

In [12]:
# Persist the raw probability matrices (out-of-fold under `valid`, test under `test`).
np.savez_compressed('rfc_clf.npz', valid=oof_preds, test=y_test)

# Submission file: only `time` and the predicted label, floats written with 4 decimals.
test_df[["time", TARGET]].to_csv('submission.csv', index=False, float_format='%.4f')

# Quick sanity check of the predicted label distribution.
print(test_df["open_channels"].mean())
test_df["open_channels"].value_counts()

1.3824545


0     1220578
1      216601
3      135062
2      117320
4       80598
5       55825
7       52690
8       49562
6       37085
9       27661
10       7018
Name: open_channels, dtype: int64