# Calling necessasry libraries

In [292]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


# Reading the data

In [293]:
data = pd.read_csv("/content/train.csv", index_col='Id')
data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.06,Ideal,I,SI2,61.8,57.0,4270,6.57,6.6,4.07
2,1.51,Premium,G,VVS2,60.9,58.0,15164,7.38,7.42,4.51
3,0.32,Ideal,F,VS2,61.3,56.0,828,4.43,4.41,2.71
4,0.53,Ideal,G,VS2,61.2,56.0,1577,5.19,5.22,3.19
5,0.7,Premium,H,VVS2,61.0,57.0,2596,5.76,5.72,3.5


In [294]:
data.shape

(43152, 10)

In [295]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43152 entries, 1 to 43152
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43152 non-null  float64
 1   cut      43152 non-null  object 
 2   color    43152 non-null  object 
 3   clarity  43152 non-null  object 
 4   depth    43152 non-null  float64
 5   table    43152 non-null  float64
 6   price    43152 non-null  int64  
 7   x        43152 non-null  float64
 8   y        43152 non-null  float64
 9   z        43152 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.6+ MB


There are no null values, thus we do not need to take care of removing nulls or imputing any values.

# 1. Data Pre-Processing

The 0's in x, y, and z columns are meaningless, which mean we have to get rid of them.


In [296]:
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,0.797855,61.747177,57.458347,3929.491912,5.731568,5.735018,3.538568
std,0.473594,1.435454,2.233904,3985.527795,1.121279,1.148809,0.708238
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,947.75,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5312.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


# 1.1 Dealing with numerical data

In [297]:
# drop the records with 0 dimensions
data = data.loc[(data["x"] != 0) & (data["y"] != 0) & (data["z"] != 0)]

# Drop the ID column
# data = data.iloc[:, 1:]



In [298]:
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0,43135.0
mean,0.797578,61.747386,57.457925,3927.668691,5.731757,5.73507,3.539962
std,0.473325,1.435091,2.233401,3983.324154,1.119336,1.147272,0.704884
min,0.2,43.0,43.0,326.0,3.73,3.68,1.07
25%,0.4,61.0,56.0,947.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2400.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5311.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [299]:
data.shape


(43135, 10)

20 records with 0 dimensions are removed.

# 1.2 categorial data

In [300]:

o = (data.dtypes =="object")
print(o)

carat      False
cut         True
color       True
clarity     True
depth      False
table      False
price      False
x          False
y          False
z          False
dtype: bool


In [301]:
# # encoding categorial data using lable encoder

# #data is ordinal, please use ordinal encoder instead(except color)

In [302]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

#Color
encoder = LabelEncoder()
data['color'] = encoder.fit_transform(data['color'])

#Cut
cut_order = 'Fair, Good, Very Good, Premium, Ideal'.split(', ')
cut_enc = OrdinalEncoder(categories=[cut_order])
data['cut'] = cut_enc.fit_transform(data[['cut']])

#Clarity
clarity_order = 'I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF'.split(', ')
clarity_enc = OrdinalEncoder(categories=[clarity_order])
data['clarity'] = clarity_enc.fit_transform(data[['clarity']])



In [303]:
data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,x,y,z
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.06,4.0,5,1.0,61.8,57.0,4270,6.57,6.6,4.07
2,1.51,3.0,3,5.0,60.9,58.0,15164,7.38,7.42,4.51
3,0.32,4.0,2,3.0,61.3,56.0,828,4.43,4.41,2.71
4,0.53,4.0,3,3.0,61.2,56.0,1577,5.19,5.22,3.19
5,0.7,3.0,4,5.0,61.0,57.0,2596,5.76,5.72,3.5


In [304]:
data['size'] = data['x'] * data['y'] * data['z']

In [305]:
data.drop(['x', 'y', 'z'], inplace=True, axis=1)
data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,size
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1.06,4.0,5,1.0,61.8,57.0,4270,176.48334
2,1.51,3.0,3,5.0,60.9,58.0,15164,246.965796
3,0.32,4.0,2,3.0,61.3,56.0,828,52.943373
4,0.53,4.0,3,3.0,61.2,56.0,1577,86.422842
5,0.7,3.0,4,5.0,61.0,57.0,2596,115.3152


# 3.3 Feature Scaling and normalising

In [309]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
scaler.fit(data[['depth']])
data['depth'] = scaler.transform(data[['depth']])

scaler = StandardScaler()
scaler.fit(data[['table']])
data['table'] = scaler.transform(data[['table']])

scaler = StandardScaler()
scaler.fit(data[['carat']])
data['carat'] = scaler.transform(data[['carat']])

In [313]:
data

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price,size
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.554429,4.0,5,1.0,0.036663,-0.205037,4270,176.483340
2,1.505160,3.0,3,5.0,-0.590482,0.242716,15164,246.965796
3,-1.008997,4.0,2,3.0,-0.311751,-0.652790,828,52.943373
4,-0.565322,4.0,3,3.0,-0.381434,-0.652790,1577,86.422842
5,-0.206157,3.0,4,5.0,-0.520800,-0.205037,2596,115.315200
...,...,...,...,...,...,...,...,...
43148,-0.586450,4.0,1,3.0,-0.172386,-0.652790,1760,84.997584
43149,-0.163902,2.0,0,3.0,0.245711,0.690468,3016,115.946550
43150,-0.755469,3.0,5,6.0,-0.172386,0.242716,990,72.801630
43151,-1.030124,3.0,1,4.0,-1.078262,0.242716,734,51.419010


# 4. Splitting traning and test samples

In [375]:
x = data.drop(columns = ['price','table'],axis=1).copy()
y = data['price']


In [376]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15,random_state=42)



# 5. Training models

# 5.1 Linear Regression

In [377]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
lr.fit(x_train,y_train)
lr_y_pred=lr.predict(x_test)
rmse_lr = np.sqrt(mean_squared_error(lr_y_pred,y_test))

accuracy_lr = cross_val_score(estimator=lr,X= x_train, y= y_train,cv=10, verbose=1)
print(accuracy_lr)
print("Root Mean squared error is: ",rmse_lr)


[0.90556373 0.90342829 0.90906476 0.90311091 0.90196538 0.90601107
 0.91042172 0.90252999 0.89304485 0.91323063]
Root Mean squared error is:  1311.1942679752476


# 5.2 Decission Tree Regressor

In [378]:
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(x_train, y_train)

dt_y_pred=dtr.predict(x_test)

accuracy_dtr = cross_val_score(estimator=dtr,X= x_train, y= y_train,cv=10, verbose=1)
rmse_dtr = np.sqrt(mean_squared_error(y_test,dt_y_pred))
print(accuracy_dtr)
print(rmse_dtr)

[0.9662394  0.96571432 0.96720021 0.96948932 0.96666055 0.96373808
 0.96572389 0.96724983 0.9629803  0.9653203 ]
713.6614539335462


# 5.3 Random Forests Regressor

In [379]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100)

rfr.fit(x_train,y_train)
rf_y_pred=rfr.predict(x_test)

accuracy_rfr = cross_val_score(estimator=rfr,X= x_train, y= y_train,cv=10, verbose=1)
rmse_rfr = np.sqrt(mean_squared_error(y_test,rf_y_pred))

print(accuracy_rfr,rmse_rfr)

[0.97975655 0.97876895 0.98055372 0.98070725 0.98179193 0.97979517
 0.98124534 0.97988514 0.98031018 0.97959538] 560.3405594427472


# 5.4 XGBoost Regressor

In [380]:
from xgboost import XGBRegressor

xgb_r = XGBRegressor( n_estimators = 150, seed = 123,  learning_rate=0.25)
xgb_r.fit(x_train, y_train)
xgb_y_pred = xgb_r.predict(x_test)

mean_cross_val_rmse = np.mean(cross_val_score(estimator=xgb_r,X= x_train, y= y_train,cv=4, verbose=1))
rmse_xgb_r = np.sqrt(mean_squared_error(y_test,xgb_y_pred))



In [381]:
print('Mean_cross_val_score: ', mean_cross_val_rmse, 'rmse test',rmse_xgb_r)

Mean_cross_val_score:  0.9798402597934743 rmse test 563.5048825697435


# 5.5 GradientBoosint

In [382]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()

model.fit(x_train, y_train)

gbr_y_pred = model.predict(x_test)

accuracy = cross_val_score(model, x, y, cv=4, scoring='neg_mean_squared_error')
rmse = np.sqrt(mean_squared_error(y_test, gbr_y_pred))
print(accuracy, rmse)

[-390362.08272429 -407888.94558127 -383072.55965542 -381784.36237181] 622.7965278064339


In [383]:
from sklearn.model_selection import GridSearchCV

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

gcv = GridSearchCV(XGBRegressor( n_estimators = 140, seed = 123,  learning_rate=0.3), params, n_jobs=-1)

In [384]:
gcv.fit(x_train, y_train)

In [385]:
model = gcv.best_estimator_

In [408]:
gsv_y_pred = model.predict(x_test)
print('rmse: ', np.sqrt(mean_squared_error(gsv_y_pred, y_test)))

rmse:  557.703296675522


In [456]:
param_cat= {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 1, 5],
    'border_count': [32, 64, 128],
    'eval_metric': ['RMSE']
}

params_LGB = {
        'bagging_fraction': 0.5,
        'bagging_freq': 5,
        'objective' :'binary',
        'metric': 'AUC',
        'feature_fraction': 0.5,
        'max_depth': 3,
        'min_data_in_leaf': 40,
        'num_leaves': 600,
        'sample_pos_weight' : 0.2,
        'num_iterations' :  1000
}


params_XGB = {
    'eval_metric': 'logloss',
    'max_depth':6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'learning_rate': 0.03,
    'n_estimators': 100,
    'seed': 42
}

params_logistic = {
    'C': 1.0,
    'solver': 'liblinear',
    'penalty': 'l2',
    'max_iter': 1000
}

params_RandomForest = {
    'n_estimators': 200,
    'max_depth': 8,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'auto',
    'bootstrap': True
}

params_ADA = {
        'learning_rate' : 0.025,
        'n_estimators' : 1500
}

In [440]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [457]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

results = []

for iteration in param_cat['iterations']:
    param_grid_copy = param_cat.copy()
    param_grid_copy['iterations'] = [iteration]

    cat_grid_search = GridSearchCV(estimator=CatBoostRegressor(),
                               param_grid=param_grid_copy,
                               scoring='neg_root_mean_squared_error',  # Use negative RMSE for optimization
                               cv=3)

    cat_grid_search.fit(x_train, y_train)
    results.append(cat_grid_search.best_params_)

print(results)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0:	learn: 3812.9414052	total: 17.9ms	remaining: 1.77s
1:	learn: 3640.8385401	total: 30.6ms	remaining: 1.5s
2:	learn: 3480.8856707	total: 36ms	remaining: 1.16s
3:	learn: 3327.0195752	total: 46.1ms	remaining: 1.11s
4:	learn: 3188.2074251	total: 51ms	remaining: 969ms
5:	learn: 3058.4042136	total: 60.3ms	remaining: 944ms
6:	learn: 2927.0810513	total: 66.2ms	remaining: 879ms
7:	learn: 2805.5330863	total: 72ms	remaining: 828ms
8:	learn: 2690.3069585	total: 76.9ms	remaining: 778ms
9:	learn: 2578.1284800	total: 85.2ms	remaining: 767ms
10:	learn: 2473.4115752	total: 95.2ms	remaining: 771ms
11:	learn: 2378.8562408	total: 105ms	remaining: 766ms
12:	learn: 2285.4208572	total: 115ms	remaining: 770ms
13:	learn: 2199.1327720	total: 122ms	remaining: 749ms
14:	learn: 2115.5310476	total: 128ms	remaining: 723ms
15:	learn: 2037.5185122	total: 133ms	remaining: 697ms
16:	learn: 1962.3435113	total: 138ms	remaining: 674ms
17:	learn: 1888.7848279

NameError: name 'grid_search' is not defined

In [467]:
cat = CatBoostRegressor(**results[0])
cat.fit(x_train, y_train, verbose=False)
cat_y_pred = cat.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, cat_y_pred))
print("Mean Squared Error:", rmse)

Mean Squared Error: 547.7998087959772


predictions

In [468]:
test_data = pd.read_csv('/content/test.csv', index_col = 'Id')

In [469]:
test_data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.34,Ideal,G,VVS2,61.1,57.0,4.52,4.48,2.75
2,0.71,Premium,E,VS2,62.7,58.0,5.74,5.68,3.58
3,0.44,Very Good,I,VS1,62.8,56.0,4.83,4.88,3.05
4,0.81,Premium,E,SI2,60.1,59.0,6.09,6.03,3.65
5,0.4,Ideal,G,VVS1,61.2,56.0,4.74,4.8,2.92


In [470]:
test_data['clarity'] = clarity_enc.fit_transform(test_data[['clarity']])
test_data['cut'] = cut_enc.fit_transform(test_data[['cut']])
test_data['color'] = encoder.fit_transform(test_data[['color']])


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [471]:
test_data['size'] = test_data['x'] * test_data['y'] * test_data['z']

In [472]:
test_data.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,size
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.34,4.0,3,5.0,61.1,57.0,4.52,4.48,2.75,55.6864
2,0.71,3.0,1,3.0,62.7,58.0,5.74,5.68,3.58,116.719456
3,0.44,2.0,5,4.0,62.8,56.0,4.83,4.88,3.05,71.88972
4,0.81,3.0,1,1.0,60.1,59.0,6.09,6.03,3.65,134.037855
5,0.4,4.0,3,6.0,61.2,56.0,4.74,4.8,2.92,66.43584


In [473]:
scaler = StandardScaler()
scaler.fit(test_data[['depth']])
test_data['depth'] = scaler.transform(test_data[['depth']])

scaler = StandardScaler()
scaler.fit(test_data[['table']])
test_data['table'] = scaler.transform(test_data[['table']])

scaler = StandardScaler()
scaler.fit(test_data[['carat']])
test_data['carat'] = scaler.transform(test_data[['carat']])

In [474]:
test_data.drop([ 'x', 'y', 'z'], inplace=True, axis=1)

In [475]:
lr_y_pred  = lr.predict(test_data.drop(columns = ['table'],axis = 1))
gsv_y_pred = model.predict(test_data.drop(columns = ['table'],axis = 1))
rf_y_pred  = rfr.predict(test_data.drop(columns = ['table'],axis = 1))
xgb_y_pred = xgb_r.predict(test_data.drop(columns = ['table'],axis = 1))
cat_y_pred = cat.predict(test_data.drop(columns = ['table'],axis = 1))

In [477]:
all_preds = [rf_y_pred,xgb_y_pred,gsv_y_pred,cat_y_pred]
from sklearn.metrics import mean_absolute_error
for i in all_preds:
  for j in all_preds:
    if str(i) != str(j):
      print(mean_absolute_error(i, j))
  print('---------------------------------------------------------------------------------')


185.7462772267964
219.4859627797826
166.70960191032702
---------------------------------------------------------------------------------
185.7462772267964
135.71394
162.89287908600863
---------------------------------------------------------------------------------
219.4859627797826
135.71394
181.28499150673537
---------------------------------------------------------------------------------
166.70960191032702
162.89287908600863
181.28499150673537
---------------------------------------------------------------------------------


In [499]:
test_preds = np.average(all_preds, weights=[0.1,0.3,0.2,0.5],axis = 0)
test_preds

array([  843.47491243,  2931.54542279,   850.21261803, ...,
        4136.5190963 ,  4694.83923263, 13739.52050017])

In [500]:
final_df = pd.DataFrame({
    'Id': test_data.index,
    'price': test_preds
}, columns = ['Id', 'price'])

In [501]:
final_df.head()

Unnamed: 0,Id,price
0,1,843.474912
1,2,2931.545423
2,3,850.212618
3,4,2868.190218
4,5,1141.778352


In [502]:
final_df.to_csv('Submission_weighted_classifiers.csv', index=False)