In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_engine.selection import DropConstantFeatures
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
train_df.shape, test_df.shape

((4209, 378), (4209, 377))

In [5]:
train_ny = train_df.drop(['y'], axis=1)

In [6]:
#Drop Values where Variance is equal to 0
transformer = DropConstantFeatures(missing_values = 'ignore')
transformer.fit(train_ny)

DropConstantFeatures(missing_values='ignore')

In [7]:
test_df = transformer.transform(test_df)
train_df = transformer.fit_transform(train_df)

In [8]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#check for null and unique
train_df.isna().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

In [10]:
test_df.isna().sum()

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 365, dtype: int64

In [11]:
for x in train_df.columns:
    print(train_df[x].unique())

In [12]:
for x in test_df.columns:
    print(test_df[x].unique())

In [13]:
#Impute Null and etc with MeanMedian imputer
mmi = MeanMedianImputer(imputation_method='mean')
train_df = mmi.fit_transform(train_df)
test_df = mmi.fit_transform(test_df)

In [14]:
train_df.shape, test_df.shape

((4209, 366), (4209, 365))

In [15]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Mean Encoder to replace categories with mean value of target for each category
me = MeanEncoder()
y_train = train_df['y']

train_df = me.fit_transform(train_df, y_train)
test_df = me.fit_transform(test_df, y_train)

In [17]:
train_df.shape, test_df.shape

((4209, 366), (4209, 365))

In [18]:
#Dimensionality Reduction with PCA
pca = PCA(n_components=0.95, svd_solver='full')
train_ny = train_df.drop(['y'], axis=1)

In [19]:
train_df.shape, test_df.shape

((4209, 366), (4209, 365))

In [20]:
#split into training and validation
X_train, X_val, y_train, y_val = train_test_split(train_df.drop(['y'], axis=1), train_df['y'], train_size = 0.75, test_size = 0.25, random_state = 4500)

In [21]:
#Applying Dimensionality Reduction
pca.fit(train_ny)

X_train = pd.DataFrame(pca.transform(X_train))
X_val = pd.DataFrame(pca.transform(X_val))
test_df = pd.DataFrame(pca.transform(test_df))

In [22]:
#creating a model with XGBoost

#Making Model
model = XGBRegressor(objective = 'reg:squarederror')

model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print("Training Score: ", score)
                        
#Testing Model with val data
val_pred = model.predict(X_val)
val_mse_score = mean_squared_error(y_val, val_pred)
print("Val MSE Score: ", val_mse_score)

Training Score:  0.45569904455667676
Val MSE Score:  195.2907791758164


In [25]:
#Testing Model with Test data
test_pred = model.predict(test_df)
test_score = mean_squared_error(train_df['y'], test_pred)
print("Test MSE Score: ", test_score)

Test MSE Score:  188.60449269037898


In [26]:
test_pred

array([ 87.64249,  87.64249,  87.64249, ..., 108.42547,  91.98981,
        91.98981], dtype=float32)