In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_engine.selection import DropConstantFeatures
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train_df.shape, test_df.shape

((4209, 378), (4209, 377))

In [5]:
train_ny = train_df.drop(['y'], axis=1)

In [6]:
#Drop Values where Variance is equal to 0
transformer = DropConstantFeatures(missing_values = 'ignore')
transformer.fit(train_ny)

DropConstantFeatures(missing_values='ignore')

In [7]:
test_df = transformer.transform(test_df)
train_df = transformer.fit_transform(train_df)

In [8]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#check for null and unique
train_df.isna().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

In [10]:
test_df.isna().sum()

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 365, dtype: int64

In [11]:
#for x in train_df.columns:
#    print(train_df[x].unique())

In [12]:
#for x in test_df.columns:
#    print(test_df[x].unique())

In [13]:
#Impute Null and etc with MeanMedian imputer
mmi = MeanMedianImputer(imputation_method='mean')
train_df = mmi.fit_transform(train_df)
test_df = mmi.fit_transform(test_df)

In [14]:
train_df.shape, test_df.shape

((4209, 366), (4209, 365))

In [15]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Mean Encoder to replace categories with mean value of target for each category
me = MeanEncoder()
y_train = train_df['y']

train_df = me.fit_transform(train_df, y_train)
test_df = me.fit_transform(test_df, y_train)

In [17]:
train_df.shape, test_df.shape

((4209, 366), (4209, 365))

In [18]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,99.491818,101.412574,104.218333,102.507477,100.658293,130.81,101.165245,97.746933,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,99.491818,93.723226,95.51,100.03319,100.658293,88.53,98.945502,97.746933,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,78.025543,95.764808,83.369927,101.959269,100.658293,78.44,101.165245,98.577238,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,78.025543,93.723226,83.369927,96.564507,100.658293,78.44,98.945502,104.976311,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,78.025543,101.412574,83.369927,96.564507,100.658293,78.02,101.346464,102.194215,...,0,0,0,0,0,0,0,0,0,0


In [19]:
test_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,100.734658,100.699771,101.288319,101.00626,100.667545,130.81,100.060255,99.678542,0,...,0,0,0,1,0,0,0,0,0,0
1,2,102.147816,101.044664,99.727792,100.541912,100.667545,88.53,100.268034,100.49717,0,...,0,0,1,0,0,0,0,0,0,0
2,3,100.734658,100.699771,100.488818,101.00626,100.667545,76.26,100.496108,100.85043,0,...,0,0,0,1,0,0,0,0,0,0
3,4,100.734658,101.160902,101.288319,101.00626,100.667545,80.62,101.200423,99.492754,0,...,0,0,0,1,0,0,0,0,0,0
4,5,100.649747,100.911711,100.488818,100.407037,100.667545,78.02,100.993388,98.967901,0,...,1,0,0,0,0,0,0,0,0,0


In [20]:
train_ny = train_df.drop(['y'], axis=1)
Y = train_df['y']

In [21]:
rs = RobustScaler()
rs.fit(train_ny)

test_df = pd.DataFrame(rs.transform(test_df))
train_df = pd.DataFrame(rs.transform(train_ny))

In [22]:
#Dimensionality Reduction with PCA
pca = PCA(n_components=0.98, svd_solver='full')


In [23]:
train_df.shape, test_df.shape

((4209, 365), (4209, 365))

In [24]:
#split into training and validation
X_train, X_val, y_train, y_val = train_test_split(train_df, Y, train_size = 0.75, test_size = 0.25, random_state = 42)

In [25]:
#Applying Dimensionality Reduction
#pca.fit(train_ny.values)

#X_train = pd.DataFrame(pca.transform(X_train))
#X_val = pd.DataFrame(pca.transform(X_val))
#test_df = pd.DataFrame(pca.transform(test_df))

In [26]:
#creating a model with XGBoost


#Making Model
model = XGBRegressor(max_depth = 3, )

model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print("Training Score: ", score)
                        
#Testing Model with val data
val_pred = model.predict(X_val)
val_mse_score = mean_squared_error(y_val, val_pred)
val_score = model.score(X_val, y_val)
print("Val MSE Score: ", val_mse_score)
print("Val RMSE Score: ", np.sqrt(val_mse_score))
print("Val Score: ", val_score)

Training Score:  0.9231510745883025
Val MSE Score:  80.59620982661961
Val RMSE Score:  8.977539185468343
Val Score:  0.4928997776623998


In [29]:
#Testing Model with Test data
test_pred = model.predict(test_df)
test_mse_score = mean_squared_error(Y, test_pred)
print("Test MSE Score: ", test_mse_score)

Test MSE Score:  168.99181876480168


In [28]:
test_pred

array([117.194374, 104.76067 , 110.93229 , ...,  97.54237 , 105.42842 ,
        95.09031 ], dtype=float32)