## 0 prefix work

In [47]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, Ridge, SGDRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, mean_absolute_error, make_scorer
import lightgbm as lgb
import xgboost as xgb

from functools import partial
import scipy as sp

import time
import datetime

import gc

## 1 data process

### 1.1 mem ruduce function

In [48]:
# dataframe reduction function
def reduce_mem_usage(df, verbose=True):    
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col != 'time':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
      print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


### 1.2 params define

In [49]:
window_sizes = [10, 50]
train_df = pd.read_csv('/kaggle/input/liverpool-ion-switching/train.csv')
y = train_df['open_channels']
test_df = pd.read_csv('/kaggle/input/liverpool-ion-switching/test.csv')
col = [c for c in train_df.columns if c not in ['time', 'open_channels', 'group']]
seed_random = 42
lr_xgb = 0.1
max_depth_xgb = 10
num_boost_round_xgb = 2000

params_xgb = {'colsample_bytree': 0.375,
              'learning_rate': lr_xgb,
              'max_depth': max_depth_xgb, 
              'subsample': 1, 
              'objective':'reg:squarederror',
              'eval_metric':'rmse'}

## 2 feature engineering

### 2.1 feature function

In [50]:
# feature, stat feature
def features(df, window_sizes):

    # op1 for stat feature
    df = df.sort_values(by=['time']).reset_index(drop=True)
    df.index = ((df.time * 10_000) - 1).values
    df['batch'] = df.index // 25_000
    df['batch_index'] = df.index  - (df.batch * 25_000)
    df['batch_slices'] = df['batch_index']  // 2500
    df['batch_slices2'] = df.apply(lambda r: '_'.join([str(r['batch']).zfill(3), str(r['batch_slices']).zfill(3)]), axis=1)
    
    for c in ['batch','batch_slices2']:
      d = {}
      d['mean'+c] = df.groupby([c])['signal'].mean()
      d['median'+c] = df.groupby([c])['signal'].median()
      d['max'+c] = df.groupby([c])['signal'].max()
      d['min'+c] = df.groupby([c])['signal'].min()
      d['std'+c] = df.groupby([c])['signal'].std()
      d['mean_abs_chg'+c] = df.groupby([c])['signal'].apply(lambda x: np.mean(np.abs(np.diff(x))))
      d['abs_max'+c] = df.groupby([c])['signal'].apply(lambda x: np.max(np.abs(x)))
      d['abs_min'+c] = df.groupby([c])['signal'].apply(lambda x: np.min(np.abs(x)))
      d['range'+c] = d['max'+c] - d['min'+c]
      d['maxtomin'+c] = d['max'+c] / d['min'+c]
      d['abs_avg'+c] = (d['abs_min'+c] + d['abs_max'+c]) / 2
      for v in d:
        df[v] = df[c].map(d[v].to_dict())
            # add shifts_1
    df['signal_shift_+1'] = [0,] + list(df['signal'].values[:-1])
    df['signal_shift_-1'] = list(df['signal'].values[1:]) + [0]
    for i in df[df['batch_index']==0].index:
      df['signal_shift_+1'][i] = np.nan
    for i in df[df['batch_index']==49999].index:
      df['signal_shift_-1'][i] = np.nan
    
    # add shifts_2 - my upgrade
    df['signal_shift_+2'] = [0,] + [1,] + list(df['signal'].values[:-2])
    df['signal_shift_-2'] = list(df['signal'].values[2:]) + [0] + [1]
    for i in df[df['batch_index']==0].index:
      df['signal_shift_+2'][i] = np.nan
    for i in df[df['batch_index']==1].index:
        df['signal_shift_+2'][i] = np.nan
    for i in df[df['batch_index']==49999].index:
      df['signal_shift_-2'][i] = np.nan
    for i in df[df['batch_index']==49998].index:
      df['signal_shift_-2'][i] = np.nan
    
    df = df.drop(columns=['batch', 'batch_index', 'batch_slices', 'batch_slices2'])

    for c in [c1 for c1 in df.columns if c1 not in ['time', 'signal', 'open_channels']]:
      df[c+'_msignal'] = df[c] - df['signal']

    # op2 for window_size feature
    for window in window_sizes:
      df["rolling_mean_" + str(window)] = df['signal'].rolling(window=window).mean()
      df["rolling_std_" + str(window)] = df['signal'].rolling(window=window).std()
      df["rolling_var_" + str(window)] = df['signal'].rolling(window=window).var()
      df["rolling_min_" + str(window)] = df['signal'].rolling(window=window).min()
      df["rolling_max_" + str(window)] = df['signal'].rolling(window=window).max()
        
    df = df.replace([np.inf, -np.inf], np.nan)    
    df.fillna(0, inplace=True)
    gc.collect()
    return df


### 2.2 transform df to ready to use data

In [51]:
train_df = features(train_df, window_sizes)
test_df = features(test_df, window_sizes)
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

X_train, X_valid, y_train, y_valid = train_test_split(train_df[col], y, test_size=0.3, random_state=seed_random)

Mem. usage decreased to 681.88 Mb (72.9% reduction)
Mem. usage decreased to 270.84 Mb (72.7% reduction)


## 3 model

### 3.1 start from a baseline

In [52]:
# Thanks to https://www.kaggle.com/teejmahal20/3-simple-ideas-ensemble
train_set = xgb.DMatrix(X_train, y_train)
val_set = xgb.DMatrix(X_valid, y_valid)
del X_train, X_valid, y_train, y_valid
gc.collect()

modelx = xgb.train(params_xgb, 
                   train_set, 
                   num_boost_round=num_boost_round_xgb, 
                   evals=[(train_set, 'train'), (val_set, 'val')], 
                   verbose_eval=50, 
                   early_stopping_rounds=250)
del train_set, val_set
gc.collect()

[0]	train-rmse:3.19457	val-rmse:3.20178
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 250 rounds.
[50]	train-rmse:1.57968	val-rmse:1.58069
[100]	train-rmse:1.57939	val-rmse:1.58074
[150]	train-rmse:1.57924	val-rmse:1.58074
[200]	train-rmse:1.57909	val-rmse:1.58075
[250]	train-rmse:1.57901	val-rmse:1.58076
[300]	train-rmse:1.57894	val-rmse:1.58078
Stopping. Best iteration:
[53]	train-rmse:1.57961	val-rmse:1.58066



21

## 4 predict and submmit

In [53]:
# predict
y_xgb_pred = modelx.predict(xgb.DMatrix(test_df[col]))
y_pred_train_xgb = modelx.predict(xgb.DMatrix(train_df[col]))
gc.collect()
print('XGB score {0:.4f}'.format(np.mean(f1_score(y, np.round(np.clip(y_pred_train_xgb,0,10)).astype(int), average="macro"))))

# submmit
test_df['open_channels'] = y_xgb_pred
test_df[['time','open_channels']].to_csv('submission.csv', index=False, float_format='%.4f')

XGB score 0.3540


NameError: name 'test' is not defined