# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! :)

The EDA was done in this [notebook](https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-eda-fe-baseline).

In [1]:
import pandas as pd
import numpy as np
import warnings 

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API breakage only shows up as a hard error after an upgrade.
warnings.simplefilter("ignore")
# Competition inputs: per-step sensor readings for train and test, the
# per-sequence training labels, and the sample submission template.
train_ = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
train_labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
sub = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")

# Preview the first rows of each frame to check the columns that were read.
display(train_.head())
display(test.head())
display(train_labels.head())
display(sub.head())

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
3,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241
4,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359


Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,25968,684,0,2.427357,19.639706,1.0,-1.466372,-1.289973,-4.207928,2.486339,-2.493893,8.0,-1.123555,-1.673048,10.980453,0.419011
1,25968,684,1,-4.950541,-21.747899,1.0,0.983186,-0.569053,1.845924,-3.887978,1.727481,-2.9,0.395231,-0.882233,-1.871399,-0.008525
2,25968,684,2,1.136012,-10.756303,1.0,1.016814,0.964157,2.454749,0.312386,1.154198,-5.6,1.114162,1.525273,-11.584362,0.139812
3,25968,684,3,0.806028,6.504202,1.0,-0.179646,0.969221,-1.035153,-0.457195,0.254962,-2.7,-0.588873,0.608761,-4.24177,-0.462916
4,25968,684,4,1.288253,5.552521,1.0,-0.493805,-1.036124,-1.126402,2.008197,-0.730534,0.0,0.899566,-1.259615,-0.472222,-0.121483


Unnamed: 0,sequence,state
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1


Unnamed: 0,sequence,state
0,25968,0
1,25969,0
2,25970,0
3,25971,0
4,25972,0


In [2]:
# Attach the per-sequence label to every step row of that sequence; the left
# join keeps all sensor readings even if a sequence had no label.
train = pd.merge(train_, train_labels, how='left', on='sequence')
train.shape

(1558080, 17)

# Feature Engineering

In [3]:
def create_new_features(df, aggregation_cols=['sequence'], prefix=''):
    """Aggregate per-step sensor readings into one feature row per group.

    Five row-level helper signals are derived first (``sensor_02_num`` and
    ``sensor_sum1`` .. ``sensor_sum4``). Then each of the 13 raw sensors and
    the five helpers is summarised inside every group with mean, max, min,
    var, mad (mean absolute deviation around the group mean), sum and median.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sensor_00`` .. ``sensor_12`` and every column listed in
        ``aggregation_cols``. The frame is not modified.
    aggregation_cols : list of str
        Columns that define a group.
    prefix : str
        Text prepended to every generated column name (not to the keys).

    Returns
    -------
    pandas.DataFrame
        One row per group with the key columns, ``<prefix>size`` (row count of
        the group) and ``<prefix><column>_<stat>`` for every column/statistic
        pair, ordered column by column.
    """
    keys = list(aggregation_cols)

    # ``assign`` returns a new frame, so the caller's ``df`` keeps its
    # original columns instead of silently gaining the helper signals.
    work = df.assign(
        sensor_02_num=(df['sensor_02'] > -15).astype(int),
        sensor_sum1=df['sensor_00'] + df['sensor_09'] + df['sensor_06'] + df['sensor_01'],
        sensor_sum2=(df['sensor_01'] + df['sensor_11'] + df['sensor_09']
                     + df['sensor_06'] + df['sensor_00']),
        sensor_sum3=df['sensor_03'] + df['sensor_11'] + df['sensor_07'],
        sensor_sum4=df['sensor_04'] + df['sensor_10'],
    )

    value_cols = [f'sensor_{i:02d}' for i in range(13)]
    value_cols += ['sensor_02_num', 'sensor_sum1', 'sensor_sum2', 'sensor_sum3', 'sensor_sum4']
    stats = ['mean', 'max', 'min', 'var', 'mad', 'sum', 'median']
    # 'mad' is no longer a built-in pandas aggregation (removed in pandas 2.0),
    # so it is computed separately below and the rest go through ``agg``.
    builtin_stats = [stat for stat in stats if stat != 'mad']

    grouped = work.groupby(keys)
    group = grouped[value_cols].agg(builtin_stats)
    group.columns = ['_'.join(col) for col in group.columns]

    # Mean absolute deviation, vectorised: |x - group mean| averaged per group.
    deviations = (work[value_cols] - grouped[value_cols].transform('mean')).abs()
    mad = pd.concat([work[keys], deviations], axis=1).groupby(keys).mean()
    mad.columns = [f'{col}_mad' for col in mad.columns]
    group = group.join(mad)

    # Restore the historical layout: all statistics of one column together,
    # in the order given by ``stats``.
    group = group[[f'{col}_{stat}' for col in value_cols for stat in stats]]
    group.columns = [str(prefix) + str(col) for col in group.columns]
    group = group.reset_index()

    sizes = grouped.size().reset_index(name=str(prefix) + 'size')
    return pd.merge(sizes, group, how='left', on=keys)

In [4]:
# Sequence-level features: one row per (sequence, subject) pair. 'subject' is
# part of the key so it stays available as a column for the later merge.
train_fe = create_new_features(train, aggregation_cols=['sequence', 'subject'])
test_fe = create_new_features(test, aggregation_cols=['sequence', 'subject'])

In [5]:
# Subject-level features: the same statistics computed over all rows of a
# subject, with a 'subject_' prefix so they do not clash with the
# sequence-level columns. Train and test are aggregated independently.
train_fe_subjects = create_new_features(train, aggregation_cols = ['subject'], prefix = 'subject_')
test_fe_subjects = create_new_features(test, aggregation_cols = ['subject'], prefix = 'subject_')

In [6]:
# Broadcast the subject-level statistics onto every sequence of that subject,
# then attach the target to the training rows.
# Note: this cell rebinds train_fe / test_fe, so it is not idempotent; rebuild
# them with create_new_features before running it a second time.
train_fe = pd.merge(train_fe, train_fe_subjects, how='left', on='subject')
train_fe = pd.merge(train_fe, train_labels, how='left', on='sequence')
test_fe = pd.merge(test_fe, test_fe_subjects, how='left', on='subject')

In [7]:
# Sanity check: train should have exactly one more column than test (the 'state' label).
print(train_fe.shape, test_fe.shape)

(25968, 257) (12218, 256)


# Adding Pseudo Labels

In [8]:
def pseudo_labeling(df_train, df_test, target, features, fold=10):
    """Score the test set with a cross-validated LightGBM ensemble.

    One ``LGBMClassifier`` is fitted per stratified fold. Its predictions on
    the held-out part build the out-of-fold vector used for the printed CV
    AUC, and its predictions on the test set are averaged across folds.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain ``features`` and ``target``.
    df_test : pandas.DataFrame
        Rows to score; must contain ``features``.
    target : str
        Name of the binary label column in ``df_train``.
    features : list of str
        Columns fed to the model.
    fold : int
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_test`` with an extra ``<target>_proba`` column holding
        the fold-averaged probability of the positive class.
    """
    X_train = df_train[features]
    X_test = df_test[features]
    y_train = df_train[target]

    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))

    # NOTE(review): the split is stratified on the label only. If the feature
    # set contains subject-level aggregates, rows of one subject can fall on
    # both sides of a split, which may make the CV score optimistic.
    skf = StratifiedKFold(n_splits=fold, random_state=42, shuffle=True)
    for fit_pos, val_pos in skf.split(X_train, y_train):
        clf = LGBMClassifier(verbose=0, force_col_wise=True)
        # skf.split yields positions, so use iloc / plain array indexing; this
        # stays correct when the frames do not carry a 0..n-1 index.
        # No eval_set: without early stopping it never influenced the fit, and
        # the fit-time `verbose` argument is not accepted by newer LightGBM.
        clf.fit(X_train.iloc[fit_pos], y_train.iloc[fit_pos])
        oof[val_pos] = clf.predict_proba(X_train.iloc[val_pos])[:, 1]
        preds += clf.predict_proba(X_test)[:, 1] / skf.n_splits

    pseudo_labeled_test = df_test.copy()
    pseudo_labeled_test[target + "_proba"] = preds

    auc = roc_auc_score(y_train, oof)
    print('LGBM scores CV =', round(auc, 5))

    return pseudo_labeled_test

In [9]:
# Model inputs: every engineered column of the test frame except the two
# identifier columns (the test frame has no 'state' column to exclude).
features = list(test_fe.columns)
features.remove("sequence")
features.remove("subject")

In [10]:
# Score the test set with the default 10-fold ensemble; the result is test_fe
# plus a 'state_proba' column. The CV AUC is printed by the function.
pseudo_labeled_test = pseudo_labeling(train_fe, test_fe, "state", features)
pseudo_labeled_test.head()

LGBM scores CV = 0.9297


Unnamed: 0,sequence,subject,size,sensor_00_mean,sensor_00_max,sensor_00_min,sensor_00_var,sensor_00_mad,sensor_00_sum,sensor_00_median,...,subject_sensor_sum3_sum,subject_sensor_sum3_median,subject_sensor_sum4_mean,subject_sensor_sum4_max,subject_sensor_sum4_min,subject_sensor_sum4_var,subject_sensor_sum4_mad,subject_sensor_sum4_sum,subject_sensor_sum4_median,state_proba
0,25968,684,60,-0.002602,3.146832,-4.950541,1.191898,0.732741,-0.156105,-0.002318,...,-58.861388,-0.049399,-0.001315,11.277613,-4.747551,1.949199,1.110229,-5.52134,0.059418,0.930236
1,25969,935,60,0.028516,5.816074,-3.675425,2.102617,0.848916,1.710974,-0.015842,...,-24.434919,-0.016942,0.000114,12.963398,-6.006014,1.365958,0.937648,0.405085,-0.045479,0.936835
2,25970,924,60,0.057664,12.306028,-15.00541,25.843168,4.023308,3.459815,1.079212,...,-78.450531,0.041885,0.004729,24.74391,-20.067045,12.770127,2.35723,10.781376,0.0244,0.062141
3,25971,769,60,-0.004791,2.876352,-2.465997,0.837073,0.687991,-0.287481,-0.033617,...,-0.930441,-0.006338,-0.005626,8.421322,-4.650984,0.484002,0.500457,-23.628549,-0.012683,0.713835
4,25972,764,60,-0.001443,17.295209,-9.974498,7.7489,1.083572,-0.086553,-0.171947,...,-276.436438,-0.01705,0.019176,14.010415,-21.911869,5.720601,1.75517,56.376032,-0.005695,0.47825


In [11]:
def print_pseudo_label_th(df, th_list=[]):
    """Print how many rows count as confident predictions at each threshold.

    A row is confident at threshold ``th`` when its ``state_proba`` is at
    least ``th`` or at most ``1 - th``. One line of the form
    ``<th> - <count>`` is printed per threshold, in the order given.
    """
    for cutoff in th_list:
        proba = df['state_proba']
        confident_high = proba >= cutoff
        confident_low = proba <= (1 - cutoff)
        n_confident = int((confident_high | confident_low).sum())
        print(cutoff, '-', n_confident)

In [12]:
# How many test rows would be kept as pseudo labels at each confidence threshold.
print_pseudo_label_th(pseudo_labeled_test, th_list=[0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91, 0.9])

0.99 - 286
0.98 - 935
0.97 - 1512
0.96 - 2032
0.95 - 2532
0.94 - 2998
0.93 - 3429
0.92 - 3845
0.91 - 4228
0.9 - 4592


In [13]:
# Persist the scored test features (including 'state_proba') to the working directory.
pseudo_labeled_test.to_csv('pseudo_labeled_test.csv', index=False)

# Modeling

In [14]:
def select_pseudo_labeled_test(df_train, df, th=0.99):
    """Append confidently predicted test rows to the training data.

    Rows of ``df`` whose ``state_proba`` is at least ``th`` or at most
    ``1 - th`` are kept, their probability is rounded to a hard 0/1 label,
    and the column is renamed to ``state`` before stacking under ``df_train``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled training rows (with a ``state`` column).
    df : pandas.DataFrame
        Scored rows with a ``state_proba`` column. The frame is not modified.
    th : float
        Confidence threshold.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` followed by the selected pseudo-labelled rows, with a
        fresh 0..n-1 index.
    """
    proba = df['state_proba']
    confident = (proba >= th) | (proba <= (1 - th))

    # Explicit copy: writing into a .loc slice is a chained assignment that
    # pandas warns about (the warning is only hidden by the global filter).
    pseudo = df.loc[confident].copy()
    # Integer labels keep the target dtype consistent with the real labels
    # instead of turning the stacked column into floats.
    pseudo['state_proba'] = pseudo['state_proba'].round().astype(int)
    pseudo = pseudo.rename(columns={'state_proba': 'state'})

    # ignore_index avoids duplicate index labels in the stacked frame.
    return pd.concat([df_train, pseudo], ignore_index=True)

In [15]:
def submission_with_pseudo_labels(df_train, df_test, df_pseudo, th_list=[]):
    """Train on real plus pseudo labels and write one submission per threshold.

    For every threshold in ``th_list`` the training set is extended with the
    confidently predicted rows of ``df_pseudo`` (see
    ``select_pseudo_labeled_test``), a default ``LGBMClassifier`` is fitted,
    and the positive-class probabilities for ``df_test`` are written to
    ``submission_<th>.csv`` with the columns ``sequence`` and ``state``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Labelled feature rows with a ``state`` column.
    df_test : pandas.DataFrame
        Feature rows to predict; must contain ``sequence`` and ``subject``.
    df_pseudo : pandas.DataFrame
        ``df_test`` features plus a ``state_proba`` column.
    th_list : list of float
        Confidence thresholds; nothing is written when it is empty.

    Returns
    -------
    None
    """
    # Loop-invariant: the test matrix and feature list do not depend on th.
    # Keyword form of drop; the positional axis argument is not accepted by
    # current pandas.
    X_test = df_test.drop(columns=['sequence', 'subject'])
    feature_cols = X_test.columns

    for th in th_list:
        new_df = select_pseudo_labeled_test(df_train, df_pseudo, th=th)

        model = LGBMClassifier()
        # 1-D target avoids passing a column-vector y to the estimator.
        model.fit(new_df[feature_cols], new_df['state'])

        # Build the submission from df_test itself so every prediction is
        # paired with its own sequence id, rather than writing by position
        # into a global template frame.
        submission = pd.DataFrame({
            'sequence': df_test['sequence'].to_numpy(),
            'state': model.predict_proba(X_test)[:, 1],
        })
        submission.to_csv(f'submission_{th}.csv', index=False)

In [16]:
# Write one submission file per pseudo-label confidence threshold so the
# thresholds can be compared on the leaderboard.
submission_with_pseudo_labels(train_fe, test_fe, pseudo_labeled_test, 
                              th_list=[0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91, 0.9])