In [27]:
#import libraries:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [3]:
#import data:
data = pd.read_csv("../../kaggle-30days/data/train_folds/train_folds.csv")
#shuffle the data:
data_full= data.sample(frac=1)

In [4]:
data_full.shape

(300000, 27)

In [5]:
#training data: 
#Note: since test data is not available:
df = data_full.iloc[:240000,:]
df_test = data_full.iloc[240000:,:]

In [6]:
print(f"TRAIN DATA SHAPE: {df.shape}, TEST DATA SHAPE: {df_test.shape}")

TRAIN DATA SHAPE: (240000, 27), TEST DATA SHAPE: (60000, 27)


In [7]:
df_test.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'target', 'kfold'],
      dtype='object')

In [23]:
useful_features = [c for c in df.columns if c not in ["id","target","kfold"]]
object_cols = [c for c in useful_features if c.startswith("cat")]
df_test = df_test[useful_features]

In [24]:
#target encoding:
for col in object_cols:
    temp_df = []
    temp_test_feat =None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        print(feat)
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        print(xvalid.head())
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat) 
    
    #calculate the value for each fold and then divide by 5:
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)
    print("df",df)
    
        
        

{'A': 8.239023584560185, 'B': 8.244725185511742}
       id cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8  ... tar_enc_cat0  \
0  467763    A    B    A    C    B    B    A    E    C  ...     8.239024   
1  100993    A    B    A    C    B    B    A    E    E  ...     8.239024   
2  181044    B    A    A    C    B    D    A    E    C  ...     8.244725   
3   79991    B    B    A    D    B    D    A    E    A  ...     8.244725   
4  428636    A    A    A    C    D    D    A    E    A  ...     8.239024   

   tar_enc_cat1  tar_enc_cat2  tar_enc_cat3  tar_enc_cat4  tar_enc_cat5  \
0      8.202686      8.243963      8.236247      8.240336      8.228570   
1      8.202686      8.243963      8.236247      8.240336      8.228570   
2      8.276974      8.243963      8.236247      8.240336      8.250606   
3      8.202686      8.243963      8.266822      8.240336      8.250606   
4      8.276974      8.243963      8.236247      8.243999      8.250606   

   tar_enc_cat6  tar_enc_cat7  tar_enc_cat8

In [21]:
df_test

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,tar_enc_cat0
212081,A,B,A,C,B,D,A,E,A,F,...,0.879522,0.600548,0.746708,0.943503,0.690314,0.675843,0.609173,0.556060,0.592002,8.239316
70192,B,A,A,A,A,D,A,E,C,B,...,0.686556,0.333664,0.513614,0.405350,0.294153,0.837318,0.054440,0.393240,0.298847,8.247458
86905,A,A,A,D,B,B,A,E,C,L,...,0.445882,0.334889,0.380832,0.313383,0.432668,0.063771,0.349303,0.323804,0.275775,8.239316
227037,A,A,B,A,B,D,A,E,E,K,...,0.777487,0.658876,0.382230,0.494991,0.441847,0.765813,0.301800,0.689232,0.615614,8.239316
179798,A,B,A,C,B,D,A,E,C,N,...,0.698191,0.456264,0.313504,0.323706,0.435913,0.416553,0.636686,0.640100,0.765523,8.239316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172348,B,B,A,C,B,D,B,E,E,J,...,0.422568,0.782198,0.365472,0.353123,0.952243,0.442101,0.209095,-0.049877,0.763697,8.247458
30321,A,A,A,C,B,B,A,E,E,F,...,0.723801,0.339583,0.599476,0.496390,0.349599,0.287082,0.711034,0.804094,0.209807,8.239316
33788,A,A,A,C,B,D,A,E,G,G,...,0.931987,0.341038,0.788810,1.029754,0.442390,0.540145,0.633317,0.637869,0.760616,8.239316
108105,A,A,A,C,B,D,A,E,A,F,...,0.610904,0.572231,0.866331,0.687871,0.523997,0.857372,0.735768,0.756088,0.776286,8.239316


In [31]:
final_predictions =[]
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()
    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    ordinalencoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinalencoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinalencoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinalencoder.transform(xtest[object_cols])
    
    model = XGBRegressor(random_state=42)
    model.fit(xtrain,ytrain)
    pred_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_true=yvalid, y_pred=pred_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)
    
print(np.mean(scores), np.std(scores))
    
    

0 0.726114021611243
1 0.7264163468643796
2 0.7243024426361712
3 0.7267204926137776
4 0.7268405684201508
0.7260787744291444 0.0009233292440756288
