# Import Libraries & Datasets

In [69]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [70]:
df_test = pd.read_csv(r"Datasets\test.csv")
df_train = pd.read_csv(r"Datasets\train.csv")

def print_dataset_info(dataset_name, df):
    print(dataset_name + " Dataset:")
    print("Shape:", df.shape)
    print("Columns:", df.columns)
    print("Head:")
    print(df.head())
    print()

# Print test dataset information
print_dataset_info("Test", df_test)

# Print train dataset information
print_dataset_info("Train", df_train)


Test Dataset:
Shape: (4209, 377)
Columns: Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)
Head:
   ID  X0 X1  X2 X3 X4 X5 X6 X8  X10  ...  X375  X376  X377  X378  X379  X380  \
0   1  az  v   n  f  d  t  a  w    0  ...     0     0     0     1     0     0   
1   2   t  b  ai  a  d  b  g  y    0  ...     0     0     1     0     0     0   
2   3  az  v  as  f  d  a  j  j    0  ...     0     0     0     1     0     0   
3   4  az  l   n  f  d  z  l  n    0  ...     0     0     0     1     0     0   
4   5   w  s  as  c  d  y  i  m    0  ...     1     0     0     0     0     0   

   X382  X383  X384  X385  
0     0     0     0     0  
1     0     0     0     0  
2     0     0     0     0  
3     0     0     0     0  
4     0     0     0     0  

[5 rows x 377 columns]

Train Dataset:
Shape: (4209, 378)
Columns: Index(['ID', 'y', 'X0

# Remove columns with zero variance

In [71]:
test_variances = df_test.var(numeric_only=True)
zero_test_variances_columns = test_variances[test_variances == 0].index
zero_test_variances_columns

Index(['X257', 'X258', 'X295', 'X296', 'X369'], dtype='object')

In [72]:
df_test = df_test.drop(zero_test_variances_columns, axis = 1)

In [73]:
df_test.shape

(4209, 372)

In [74]:
train_variances = df_train.var(numeric_only=True)
zero_train_variances_columns = train_variances[train_variances == 0].index
zero_train_variances_columns

Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')

In [75]:
df_train = df_train.drop(zero_train_variances_columns, axis=1)

In [76]:
df_train.shape

(4209, 366)

# Check for nulls & unique values

In [77]:
test_null_counts = df_test.isnull().sum()
test_unique_counts = df_test.nunique()
print(test_null_counts)
print(test_null_counts)

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 372, dtype: int64
ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 372, dtype: int64


In [78]:
train_null_counts = df_train.isnull().sum()
train_unique_counts = df_train.nunique()
print(train_null_counts)
print(train_unique_counts)

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64
ID      4209
y       2545
X0        47
X1        27
X2        44
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 366, dtype: int64


# Apply label encoder

In [79]:
from sklearn.preprocessing import LabelEncoder

In [80]:
label_encoder = LabelEncoder()
df_train['X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [81]:
df_train['X0'] = label_encoder.fit_transform(df_train['X0'])

In [82]:
df_train['X0'].unique()

array([32, 20, 40,  9, 36, 43, 31, 29, 39, 35, 19, 27, 44, 45,  7,  8, 10,
       46, 37, 15, 12, 42,  5,  0, 26,  6, 25, 13, 24,  1, 22, 14, 30, 38,
       21, 18, 23, 41,  4, 16, 34, 33, 17, 11,  3, 28,  2])

In [83]:
columns_to_encode = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

for column in columns_to_encode:
    df_train[column] = label_encoder.fit_transform(df_train[column])

In [84]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,32,23,17,0,3,24,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,32,21,19,4,3,28,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,20,24,34,2,3,27,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,20,21,34,5,3,27,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,20,23,34,5,3,12,3,13,...,0,0,0,0,0,0,0,0,0,0


In [85]:
print('Feature types:')
df_train.dtypes.value_counts()


Feature types:


int64      357
int32        8
float64      1
dtype: int64

# Dimensionality reduction

In [86]:
from sklearn.decomposition import PCA

In [87]:
sklearn_pca = PCA(n_components=12)
sklearn_pca.fit(df_train)
x_train_transformed = sklearn_pca.transform(df_train)

In [88]:
print(x_train_transformed.shape)

(4209, 12)


In [90]:
df_train.y

0       130.81
1        88.53
2        76.26
3        80.62
4        78.02
         ...  
4204    107.39
4205    108.77
4206    109.22
4207     87.48
4208    110.85
Name: y, Length: 4209, dtype: float64

## Train and Test split on Train dataset

In [91]:
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=0.3,random_state=42)

In [92]:
print(xtrain)
print(xtrain.shape)

        ID  X0  X1  X2  X3  X4  X5  X6  X8  X10  ...  X375  X376  X377  X378  \
370    735  35  13  16   1   3   9   6  19    0  ...     0     0     0     0   
3392  6770  15  10  16   2   3  23   9  16    0  ...     0     0     1     0   
2208  4414  31   3  16   2   3  15   2  21    0  ...     0     0     1     0   
3942  7907  35  20   8   6   3  26   6  14    0  ...     1     0     0     0   
1105  2191  36  13  16   5   3   1   6   0    0  ...     0     0     0     0   
...    ...  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...   ...   ...   ...   ...   
3444  6879  31  10  16   2   3  22  11  17    0  ...     0     0     1     0   
466    898  20  25  25   2   3   9   9   9    0  ...     0     0     0     0   
3092  6214  45  24   3   2   3  21   8   2    0  ...     1     0     0     0   
3772  7558  45  19   8   5   3  25   8   1    0  ...     0     0     0     0   
860   1712  22   1   7   2   3   5   9  17    0  ...     1     0     0     0   

      X379  X380  X382  X383  X384  X38

In [93]:
print(ytrain)
print(ytrain.shape)

370      95.13
3392    117.36
2208    109.01
3942     93.77
1105    103.41
         ...  
3444    109.42
466      78.25
3092     92.18
3772     91.92
860      87.71
Name: y, Length: 2946, dtype: float64
(2946,)


In [94]:
print(xtest)
print(xtest.shape)

        ID  X0  X1  X2  X3  X4  X5  X6  X8  X10  ...  X375  X376  X377  X378  \
1073  2140   9  16   7   5   3   6   9  11    0  ...     0     0     0     0   
144    310  27  13   3   5   3  13   8  22    0  ...     0     0     0     0   
2380  4779  31   1  21   2   3  18  11  14    1  ...     1     0     0     0   
184    385  20  25  22   2   3  13   9  11    0  ...     0     0     0     0   
2587  5180   8  23   8   3   3  17   8  17    0  ...     0     0     0     0   
...    ...  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...   ...   ...   ...   ...   
2493  4997  27  20  16   2   3  18  10   5    0  ...     0     0     1     0   
3388  6760  40  19  24   5   3  23   3  19    0  ...     0     0     0     0   
3997  8016  22   3   7   0   3  26   6  18    0  ...     0     0     1     0   
383    752  40   1  16   6   3   9   8   0    0  ...     1     0     0     0   
3364  6709  27   4  33   2   3  23   6  24    0  ...     0     0     1     0   

      X379  X380  X382  X383  X384  X38

# XGboost

In [95]:
from sklearn.metrics import r2_score
%pip install xgboost
import xgboost as xgb

Note: you may need to restart the kernel to use updated packages.


In [96]:
d_train = xgb.DMatrix(xtrain, label=ytrain)
d_test = xgb.DMatrix(xtest, label=ytest)

In [105]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.1
params['max_depth'] = 3

epochs = 100
XGB_model = xgb.train(params,d_train,epochs)
XGB_prediction=XGB_model.predict(d_test)

from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(ytest, XGB_prediction))

print(rmse)

9.592553589580936
