In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools

import seaborn as sns
sns.set_theme(style="whitegrid")

%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the competition's train.csv into a DataFrame.
train = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2022/train.csv")

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Load the competition's test.csv into a DataFrame.
test = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2022/test.csv")

In [None]:
# Preview the first 25 rows of the training frame.
train.head(n=25)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Load train_labels.csv; later cells read its `state` and `sequence` columns.
train_labels = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv")

In [None]:
# Load the sample submission; its `state` column is overwritten with predictions at the end.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv")

## EDA

In [None]:
# Report (rows, columns) of the train and test frames.
print("Size of train data: ", train.shape)
print("Size of test data: ", test.shape)

In [None]:
# Check the range of the `step` column (smallest and largest step value in train)
print("Min step: ", train.step.min())
print("Max step: ", train.step.max())

In [None]:
# Check how many distinct subjects the training data has, and the range of subject ids
print("Number of unique subjects: ", train.subject.nunique())
print("Smallest subject number: ", train.subject.min())
print("Largest subject number: ", train.subject.max())

In [None]:
# Understand how sequence, subject and state relate to each other

# sequence -> subject -> step

In [None]:
# Preview the first rows of the labels frame.
train_labels.head()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Check missing data: number of null entries per column
train.isnull().sum()

In [None]:
# Check for duplicated data: counts of rows flagged (True) / not flagged (False)
# as exact duplicates of an earlier row
train.duplicated().value_counts()

In [None]:
# Class balance: bar chart of how many labels fall into each `state` value.
ax = sns.countplot(x="state", data=train_labels)
ax.set_title('Number of states', fontsize=18)
ax.set_xlabel('State', fontsize=16)
ax.set_ylabel('Count', fontsize=16);

In [None]:
# Correlation heatmap for the first 12 columns of the training frame.
# Adapted from https://www.kaggle.com/code/abdulravoofshaik/early-eda-and-insights

# Compute the correlation matrix once and reuse it for the plot (previously a
# full-frame correlation was computed, discarded, and then recomputed on a slice).
corr = train.iloc[:, :12].corr()
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

hm = sns.heatmap(corr,
                 ax=ax,
                 cmap='coolwarm',
                 annot=True,
                 fmt='.2f',
                 linewidths=0.05)
fig.subplots_adjust(top=0.93)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
fig.suptitle('Correlation Heatmap for Train dataset',
             fontsize=14,
             fontweight='bold');

## Summary of findings 

* The data is composed of sequence -> subject -> step
* 13 different sensors
* No missing data
* There are some duplicate values
* Some sensors are correlated with each other
* Classes are more or less balanced

## Baseline model

### Feature engineering

In [None]:
import xgboost as xgb
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GroupKFold
from xgboost import XGBClassifier


In [None]:
# Every training column except the two grouping keys; these are the per-step
# columns that get dropped once aggregated features have been built.
cols_remove = list(train.columns)
cols_remove.remove('subject')
cols_remove.remove('sequence')
cols_remove

In [None]:
# Aggregations applied to every sensor column within each group.
aggregations_lst = ['mean', 'std', 'skew', 'max', 'min']

# Columns whose name contains 'sensor'.
sensor_cols = list(filter(lambda name: 'sensor' in name, cols_remove))

In [None]:
def feature_engineering(train, sensor_cols, aggregations_lst, cols_remove):
    """Aggregate sensor columns per (sequence, subject) group.

    For every sensor in ``sensor_cols`` and every aggregation name in
    ``aggregations_lst`` a column ``<sensor>_<aggregation>`` is produced,
    ordered sensor-major (all aggregations of the first sensor, then the
    second, ...). The columns in ``cols_remove`` are dropped afterwards and
    duplicated rows are collapsed.

    All aggregations are computed in a single ``groupby().agg()`` pass and
    merged back once, instead of merging one Series per (sensor, aggregation)
    pair into the full per-step frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing 'sequence', 'subject', the sensor columns and every
        column listed in ``cols_remove``. It is not modified.
    sensor_cols : list of str
        Columns to aggregate.
    aggregations_lst : list of str
        Aggregation names understood by pandas ``GroupBy.agg``.
    cols_remove : list of str
        Columns dropped from the result.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by the aggregated feature columns,
        with duplicate rows removed; the index labels come from ``train``.
    """
    group_keys = ['sequence', 'subject']
    sensor_cols = list(sensor_cols)
    aggregations_lst = list(aggregations_lst)

    df_feat_eng = train

    # With nothing to aggregate, only the drop / de-duplication below applies.
    if sensor_cols and aggregations_lst:
        features = train.groupby(group_keys)[sensor_cols].agg(aggregations_lst)
        # agg() with a list yields (sensor, aggregation) MultiIndex columns;
        # flatten them to the '<sensor>_<aggregation>' naming scheme.
        features.columns = [sensor + '_' + aggregation
                            for sensor, aggregation in features.columns]

        df_feat_eng = train.merge(features,
                                  left_on=group_keys,
                                  right_index=True)

    # drop() returns a new frame, so the caller's frame is never mutated.
    df_feat_eng = df_feat_eng.drop(cols_remove, axis=1)
    df_feat_eng = df_feat_eng.drop_duplicates()

    return df_feat_eng

In [None]:
# Build the aggregated feature table for the training data...
train_feat_eng = feature_engineering(train=train,
                                     sensor_cols=sensor_cols,
                                     aggregations_lst=aggregations_lst,
                                     cols_remove=cols_remove)

# ...and, with the same settings, for the test data (passed through the
# function's `train` parameter).
test_feat_eng = feature_engineering(train=test,
                                    sensor_cols=sensor_cols,
                                    aggregations_lst=aggregations_lst,
                                    cols_remove=cols_remove)

In [None]:
# Display the engineered training features.
train_feat_eng

In [None]:
# Display the engineered test features.
test_feat_eng

## Cross-validation and baseline model

In [None]:
# Model inputs: engineered features without the key columns, re-indexed 0..n-1.
df_train = train_feat_eng.reset_index(drop=True).drop(columns=["sequence", "subject"])
df_test = test_feat_eng.reset_index(drop=True).drop(columns=["sequence", "subject"])

# Targets: the labels frame without its `sequence` column.
labels_train = train_labels.drop(columns=["sequence"])

In [None]:
# Inspect the last rows of the model input frame.
df_train.tail()

In [None]:
# 5-fold cross-validation of an XGBoost classifier; collects one fitted model
# and one validation ROC-AUC per fold.
cv_scores = []
models = []

# NOTE(review): 'gpu_hist' needs a GPU runtime and is deprecated in newer
# xgboost releases — confirm against the installed version.
params = {'n_estimators': 4096,
          'max_depth': 7,
          'learning_rate': 0.15,
          'subsample': 0.95,
          'colsample_bytree': 0.60,
          'reg_lambda': 1.50,
          'reg_alpha': 6.10,
          'gamma': 1.40,
          'random_state': 69,
          'objective': 'binary:logistic',
          'tree_method': 'gpu_hist',
         }

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold ignores any `groups` argument, so none is passed.
# NOTE(review): if folds should be split by group (e.g. by subject), use
# GroupKFold with the matching per-row group labels instead.
for fold, (train_idx, valid_idx) in enumerate(kf.split(df_train, labels_train)):

    print("-"*15, "Fold", fold, "-"*15)

    X_train, X_valid = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    # Flatten the label column to 1-D (assumes labels_train holds a single label column).
    y_train = labels_train.iloc[train_idx].values.ravel()
    y_valid = labels_train.iloc[valid_idx].values.ravel()

    # Named `model` so the `xgb` module alias imported earlier is not shadowed.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
    # deprecated in newer xgboost releases (they move to the constructor);
    # left as-is to match the notebook's environment.
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=['auc'], early_stopping_rounds=128, verbose=50)

    # ROC-AUC is a ranking metric: score it on the positive-class probability,
    # not on thresholded 0/1 labels.
    valid_proba = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, valid_proba)
    cv_scores.append(score)
    models.append(model)

In [None]:
# Mean of the per-fold validation scores collected in cv_scores.
np.mean(cv_scores)

In [None]:
# Ensemble the fold models by averaging their positive-class probabilities.
# The models are validated with ROC-AUC, so continuous scores are kept rather
# than averaging thresholded 0/1 labels.
prediction = np.zeros(df_test.shape[0])
for model in models:
    prediction += model.predict_proba(df_test)[:, 1]
prediction = prediction / len(models)

In [None]:
# Overwrite the sample submission's `state` column with the averaged predictions
# and write it to submission.csv without the index column.
submission['state'] = prediction
submission.to_csv('submission.csv', index=False)

In [None]:
nan