In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#  Importing train & test dataset

In [141]:
#loading the datasets
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')
print(train_data.shape)
print(test_data.shape)

(4209, 378)
(4209, 377)


In [142]:
#Checking the dataset
train_data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [143]:
#drop the ID column as index as it is not needed for prediction
train_data.drop('ID',inplace=True,axis=1)
test_data.drop('ID',inplace=True,axis=1)

In [144]:
print(train_data.columns)
print(test_data.columns)

Index(['y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)
Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10', 'X11',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=376)


# Columns having variance == zero, and remove those variable(s).

In [145]:
Zero_var_col = train_data.var()[train_data.var()==0].index.values
Zero_var_col

array(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'], dtype=object)

In [146]:
#deleting the columns with zero variance
for zero in Zero_var_col:
    if zero in train_data:
        train_data.drop(zero,axis=1,inplace=True)
        
for zero in Zero_var_col:
    if zero in test_data:
        test_data.drop(zero,axis=1,inplace=True)

print(train_data.shape)
print(test_data.shape)

(4209, 365)
(4209, 364)


# Chekcing the NAN values in the datasets

In [147]:
train_data.isnull().sum().any()

False

In [148]:
test_data.isnull().sum().any()

False

# Apply label encoder on train & test datasets

In [149]:
 from sklearn.preprocessing import LabelEncoder

In [150]:
le=LabelEncoder()

In [151]:
object_col=[]
for i in train_data.columns:
    data_type = train_data[i].dtype
    if data_type == 'object':
        print(i)
        object_col.append(i)
object_col

X0
X1
X2
X3
X4
X5
X6
X8


['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [152]:
for col in object_col:
    le.fit(train_data[col].append(test_data[col]).values)
    train_data[col]= le.transform(train_data[col])
    test_data[col]= le.transform(test_data[col])

In [153]:
train_data.columns

Index(['y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=365)

In [154]:
# cross checking if lables are encoded to numbers
print(train_data['X0'].unique())

[37 24 46 11 41 49 36 34 45 40 23 32 50 51  9 10 12 52 43 18 15 48  6  0
 31  8 30 16 29  1 26 17 35 44 25 22 28 47  4 19 39 38 21 14  3 33  2]


# Perform dimensionality reduction.

In [155]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
pca = PCA(n_components = 0.98,svd_solver='full')

In [156]:
X = train_data.drop('y',axis=1)
Y = train_data['y']


In [194]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=12)

In [195]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(2946, 364)
(2946,)
(1263, 364)
(1263,)


In [196]:
pca.fit(X)

PCA(n_components=0.98, svd_solver='full')

In [197]:
pca_X_train = pd.DataFrame(pca.transform(X_train))
pca_X_test = pd.DataFrame(pca.transform(X_test)) 
pca_test = pd.DataFrame(pca.transform(test_data))

In [198]:
pca.n_components_

12

In [199]:
pca.explained_variance_ratio_

array([0.40868988, 0.21758508, 0.13120081, 0.10783522, 0.08165248,
       0.0140934 , 0.00660951, 0.00384659, 0.00260289, 0.00214378,
       0.00209857, 0.00180388])

# Predicting using XGBoost

In [187]:
from sklearn.metrics import r2_score,mean_squared_error
import xgboost as xgb

In [188]:
model = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.4, max_depth = 10, alpha = 6, 
                           n_estimators = 20)


In [200]:
model.fit(pca_X_train,Y_train)



XGBRegressor(alpha=6, base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.4, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=10, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=20, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, ...)

In [201]:
#predict on the validaiton set
pred_y_test = model.predict(pca_X_test)

In [202]:
mean_squared_error(Y_test,pred_y_test)

106.12790026771569

In [203]:
pred_test_data=model.predict(pca_test)

In [193]:
pred_test_data

array([ 81.70047 ,  96.85091 ,  92.18459 , ...,  93.24074 , 105.23896 ,
        89.607704], dtype=float32)