In [None]:
#Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#Importing the models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold
# For training random forest model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from category_encoders.m_estimate import MEstimateEncoder
from lightgbm import LGBMRegressor
#from tabgan.sampler import OriginalGenerator, GANGenerator
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
import os
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Data preparation

In [None]:
# Show every file that is available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Competition data: the first CSV column is used as the row index.
train = pd.read_csv("../input/30-days-of-ml/train.csv", index_col=0)
test = pd.read_csv("../input/30-days-of-ml/test.csv", index_col=0)
# A separately stored submission file; its 'target' column is averaged with
# the model predictions in the ensembling cell. Loaded with the default index.
sub = pd.read_csv('../input/submissionnn/submission.csv')

# Drop training rows whose target lies more than three standard deviations
# away from the mean target.
target_column = train['target']
mean, std = target_column.mean(), target_column.std()
cut_off = 3 * std
lower = mean - cut_off
upper = mean + cut_off

# A row is an outlier when it falls strictly outside [lower, upper].
is_outlier = (target_column < lower) | (target_column > upper)
outliers = train[is_outlier]

print(f"Orginal Dataset size: {train.shape}")
# NOTE: mutates `train` in place, so re-running this cell recomputes the
# bounds on the already-trimmed data.
train.drop(index=outliers.index.to_list(), inplace=True)
print(f"Number of outliers: {len(outliers)}")

# Split the training frame into the prediction target and the feature matrix.
target = train['target']
features = train.drop(columns=['target'])

# Columns are classified by name: anything containing 'cat' is treated as
# categorical, anything containing 'cont' as numerical.
object_cols = [name for name in features if 'cat' in name]
num_cols = [name for name in features if 'cont' in name]

# Work on copies so that `features` and `test` keep their raw values.
label_features, label_test = features.copy(), test.copy()

# Label encoding for categorical features.
#
# The encoder is fitted on the TRAINING column only and the resulting
# category -> integer mapping is then applied to the test column. Fitting a
# separate encoder on the test data (as was done before) gives the same
# category different codes in train and test whenever the two sets of
# observed categories differ.
#
# Values are read from the untouched `features` / `test` frames, so re-running
# this cell yields the same result.
label = LabelEncoder()
for col in object_cols:
    label_features[col] = label.fit_transform(features[col])
    # `label.classes_` holds the sorted training categories; position in that
    # array is the code LabelEncoder assigned. Test values that never occurred
    # in training are encoded as -1 instead of raising.
    label_test[col] = pd.Categorical(
        test[col], categories=label.classes_
    ).codes.astype('int64')

In [None]:
# Fold configuration (the author's note: using 5 splits increased the MSE).
splits = 10
kf = KFold(n_splits=splits, shuffle=True, random_state=0)

# Only the LAST split produced by the KFold generator is kept: the models
# below are trained on that single train/validation partition, not once per
# fold.
train_index, test_index = list(kf.split(label_features, target))[-1]

X_train = label_features.iloc[train_index]
X_valid = label_features.iloc[test_index]
y_train = target.iloc[train_index]
y_valid = target.iloc[test_index]
#-----------------------------------------------------------------------------------------
X_train.head()

# Models

In [None]:
def fit_xgb_and_predict(model_params, X_fit, y_fit, X_eval, y_eval, X_score,
                        early_stopping_rounds=300, n_jobs=4, verbose=1000):
    """Fit one XGBRegressor with early stopping and predict on ``X_score``.

    Parameters
    ----------
    model_params : dict
        Keyword arguments forwarded to ``XGBRegressor``.
    X_fit, y_fit
        Data the model is fitted on.
    X_eval, y_eval
        Evaluation set passed to ``fit`` (scored with RMSE) for early stopping.
    X_score
        Data to generate predictions for after fitting.
    early_stopping_rounds : int
        Forwarded to ``fit`` as ``early_stopping_rounds``.
    n_jobs : int
        Forwarded to the ``XGBRegressor`` constructor.
    verbose : int
        Forwarded to ``fit`` as ``verbose``.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_X_score)``.
    """
    model = XGBRegressor(n_jobs=n_jobs, **model_params)
    model.fit(X_fit, y_fit, verbose=verbose,
              early_stopping_rounds=early_stopping_rounds,
              eval_set=[(X_eval, y_eval)],
              eval_metric='rmse')
    return model, model.predict(X_score)


# Early-stopping patience shared by all three models
# (the trailing "#100" in the original looks like an earlier setting).
early_sr = 300

# ---- Model 1 ---------------------------------------------------------------
xgb_params = {
    'lambda': 67.79737006663706,
    'alpha': 40.12405005448161,
    'colsample_bytree': 0.061613774851329205,
    'subsample': 0.9556736521337416,
    'learning_rate': 0.17024722721525629,
    'n_estimators': 10000,
    'max_depth': 3,
    'booster': 'gbtree',   # was listed twice in the original literal; one entry is equivalent
    'min_child_weight': 155,
    # NOTE(review): both 'seed' and 'random_state' are supplied with different
    # values. Which one XGBoost honours is not visible from this notebook;
    # both are kept so results are unchanged. The author marked
    # 'random_state' as removable (it was not part of the original settings).
    'seed': 38,
    'random_state': 42,
}
XGB_model, predictions_XGBoost_1 = fit_xgb_and_predict(
    xgb_params, X_train, y_train, X_valid, y_valid, label_test,
    early_stopping_rounds=early_sr)

# ---- Model 2 ---------------------------------------------------------------
# Marked by the author as the first parameter trial and the chosen one.
# No random_state is set here (a value of 40 was commented out originally).
params = {
    'learning_rate': 0.07853392035787837,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'max_depth': 3,
    'n_estimators': 5000,
}
XGB_model, predictions_XGBoost_2 = fit_xgb_and_predict(
    params, X_train, y_train, X_valid, y_valid, label_test,
    early_stopping_rounds=early_sr)

# ---- Model 3 ---------------------------------------------------------------
xgb_params = {
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3,
    'booster': 'gbtree',
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'random_state': 42,   # author's note: the random state was originally 40
}
XGB_model, predictions_XGBoost_3 = fit_xgb_and_predict(
    xgb_params, X_train, y_train, X_valid, y_valid, label_test,
    early_stopping_rounds=early_sr)

# Ensembling

In [None]:
# Blend the stored submission with the three XGBoost prediction sets by a
# plain (equal-weight) average. Values are paired by position, and zip stops
# at the shortest input.
ss = sub['target'].tolist()
prediction_sets = (ss, predictions_XGBoost_1, predictions_XGBoost_2, predictions_XGBoost_3)

final_predictions = []
for stored, first, second, third in zip(*prediction_sets):
    final_predictions.append((stored + first + second + third) / 4)

# Submitting

In [None]:
# Build the submission table (test index as 'Id', blended value as 'target')
# and write it to submission.csv without the DataFrame's own index.
submission_columns = {
    'Id': label_test.index,
    'target': final_predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)