In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/30days-folds/train_folds.csv
/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [2]:
# Baseline: cross-validated XGBoost on ordinal-encoded categorical columns and
# standardized continuous columns, using the precomputed `kfold` column.
N_FOLDS = 5  # the loop below expects fold ids 0..N_FOLDS-1 in the `kfold` column

df = pd.read_csv('/kaggle/input/30days-folds/train_folds.csv')
df_test = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv')
sample_submission = pd.read_csv('/kaggle/input/30-days-of-ml/sample_submission.csv')

# Every column except the bookkeeping ones is a model input. Column roles are
# inferred from the column name: 'cat' -> categorical, 'cont' -> continuous.
useful_features = [c for c in df.columns if c not in ("id", 'target', 'kfold')]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_preds = []  # one array of test-set predictions per fold
scores = []       # validation RMSE per fold
for fold in range(N_FOLDS):
    # Rows assigned to `fold` are held out for validation; the rest train the model.
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_part.target
    yvalid = valid_part.target

    # .copy() gives independent frames, so the column assignments below do not
    # write into a slice of another frame (avoids SettingWithCopyWarning).
    xtrain = train_part[useful_features].copy()
    xvalid = valid_part[useful_features].copy()
    xtest = df_test.copy()

    # Preprocessing is fitted on the training rows of this fold only and then
    # applied unchanged to the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    # NOTE(review): tree_method='gpu_hist', gpu_id and predictor are deprecated
    # in XGBoost >= 2.0 (replacement: tree_method='hist', device='cuda'). Left
    # unchanged to stay compatible with the installed version -- confirm.
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_preds.append(preds_test)

    # RMSE as sqrt(MSE): the `squared=False` argument is deprecated in
    # scikit-learn 1.4 and removed in 1.6; this form works on all versions.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE.
print(np.mean(scores), np.std(scores))

0 0.7241755479182882
1 0.7241138968948254
2 0.7267386816038165
3 0.7268357864120136
4 0.725667388462628
0.7255062602583143 0.001185068397378747


In [3]:
# Polynomial features: same cross-validated XGBoost setup, with interaction
# terms of the continuous columns appended as extra `poly_<i>` features.
N_FOLDS = 5  # the loop below expects fold ids 0..N_FOLDS-1 in the `kfold` column

df = pd.read_csv('/kaggle/input/30days-folds/train_folds.csv')
df_test = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv')
sample_submission = pd.read_csv('/kaggle/input/30-days-of-ml/sample_submission.csv')

useful_features = [c for c in df.columns if c not in ("id", 'target', 'kfold')]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

# Products of up to three distinct continuous columns, no constant column.
# NOTE: with include_bias=False the output still contains the degree-1 terms,
# so the original continuous columns reappear among the poly_* columns.
poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Apply the transformer fitted on the training data; the test set must never
# be used to (re)fit a preprocessing step.
test_poly = poly.transform(df_test[numerical_cols])

# Explicit indexes keep the axis=1 concat row-aligned with the source frames.
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names, index=df.index)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names, index=df_test.index)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature list so it now includes the poly_* columns.
useful_features = [c for c in df.columns if c not in ("id", 'target', 'kfold')]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_preds = []  # one array of test-set predictions per fold
scores = []       # validation RMSE per fold
for fold in range(N_FOLDS):
    # Rows assigned to `fold` are held out for validation; the rest train the model.
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_part.target
    yvalid = valid_part.target

    # .copy() gives independent frames, so the column assignments below do not
    # write into a slice of another frame (avoids SettingWithCopyWarning).
    xtrain = train_part[useful_features].copy()
    xvalid = valid_part[useful_features].copy()
    xtest = df_test.copy()

    # The encoder is fitted on the training rows of this fold only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): tree_method='gpu_hist', gpu_id and predictor are deprecated
    # in XGBoost >= 2.0 (replacement: tree_method='hist', device='cuda'). Left
    # unchanged to stay compatible with the installed version -- confirm.
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )

    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_preds.append(preds_test)

    # RMSE as sqrt(MSE): the `squared=False` argument is deprecated in
    # scikit-learn 1.4 and removed in 1.6; this form works on all versions.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE.
print(np.mean(scores), np.std(scores))


0 0.729073179900137
1 0.7286941123028183
2 0.7302315824391516
3 0.7304305210608322
4 0.7297044930462646
0.7296267777498406 0.0006624450323974544
