# Models

## Imports

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score


In [2]:
# Training data; the `price` column is used as the regression target further down.
df = pd.read_csv('./train_clean.csv')

In [45]:
# Test-set file that the submission predictions are generated for.
df5 = pd.read_csv('./test_clean.csv')

# Later cells predict on `df2` and `df3`, but no cell in this notebook defines them,
# so a fresh kernel fails with NameError.
# NOTE(review): presumably both were earlier loads of this same test file — confirm.
# Independent copies keep a `price` column written into one frame out of the other.
df2 = df5.copy()
df3 = df5.copy()

## Linear regression model

In [4]:
# Feature matrix: every column of the training frame except the target.
X = df.drop(columns=["price"])

In [5]:
# Regression target.
y = df["price"]

In [6]:
# Hold out 20% of the training data for validation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=111,
)

In [7]:
X_train.shape


(32364, 10)

In [8]:
y_train.shape

(32364,)

In [9]:
X_test.shape

(8091, 10)

In [10]:
y_test.shape

(8091,)

In [11]:
lr = LinearRegression()

In [12]:
lr.fit(X_train, y_train)

LinearRegression()

In [13]:
# Linear-regression predictions for the validation split.
# NOTE(review): rounding to 1 decimal is coarser than the 3-decimal targets shown in
# y_test, so the RMSE computed from y_pred includes that rounding error.
y_pred = lr.predict(X_test).round(1)


In [14]:
len(y_pred)

8091

In [15]:
len(y_test)

8091

In [16]:
y_pred[:10]

array([9.4, 8.8, 6.3, 7.8, 7.1, 7.5, 6.8, 6.1, 7.6, 6.1])

In [17]:
y_test[:10]

11567    9.473
31139    8.805
21007    6.358
25120    7.904
10065    7.197
6433     7.553
31770    6.759
3361     6.043
38916    7.677
25057    6.292
Name: price, dtype: float64

In [18]:
# Validation RMSE of the linear model (square root of the mean squared error).
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse

0.1598780027882294

In [21]:
# Predict on the test frame using exactly the columns the model was trained on.
# Selecting X.columns keeps this cell re-runnable after the following cell has
# added a `price` column to df2 (the extra column would otherwise break predict).
y_pred_test = lr.predict(df2[X.columns]).round(1)

In [22]:
df2["price"] = y_pred_test

In [24]:
# Submission frame: drop the feature columns, keeping what remains (id and predicted price).
df_upload = df2.drop(
    columns=["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]
)

In [25]:
df_upload.head()

Unnamed: 0,id,price
0,0,6.2
1,1,8.6
2,2,9.6
3,3,8.0
4,4,9.2


In [None]:
df_upload.to_csv("first_try.csv",index=False)

### Decision tree

In [None]:
# Regression tree capped at depth 5; random_state is fixed so repeated fits agree.
dt = DecisionTreeRegressor(max_depth=5, random_state=111)

In [None]:
dt.fit(X_train, y_train)

In [None]:
# Preview predictions for the validation split so they line up row-for-row with the
# y_test[0:10] preview in the next cell (the original previewed X_train predictions,
# which belong to different rows).
dt.predict(X_test).round(2)[:10]

In [None]:
y_test[0:10]

In [None]:
# Validation MSE of the decision tree, shown to 3 decimals.
# Built-in round() works whether the metric comes back as a NumPy scalar or a plain
# Python float; a plain float has no .round() method, so the method form can fail.
round(
    mean_squared_error(
        y_true=y_test,
        y_pred=dt.predict(X_test),
    ),
    3,
)

In [None]:
# Validation RMSE of the decision tree.
rmse2 = sqrt(mean_squared_error(y_true=y_test, y_pred=dt.predict(X_test)))

In [None]:
rmse2

In [None]:
# Predict on the test frame using exactly the training feature columns, so the cell
# still works once a `price` column has been written into df3.
y_pred_test2 = dt.predict(df3[X.columns]).round(1)

In [None]:
df3["price"] = y_pred_test2

In [None]:
df3.head()

In [None]:
# Submission frame for the decision tree: feature columns removed, the rest kept.
df_upload2 = df3.drop(
    columns=["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]
)

In [None]:
df_upload2.head()

In [None]:
df_upload2.to_csv("second_try.csv",index=False)

### Random Forest

In [None]:
# Random forest: 100 trees, depth <= 10, at most 9 features tried per split, and a
# node needs at least 50 samples to be split further.
# random_state is fixed (same seed as the train/test split) so the fitted forest and
# every score derived from it are reproducible across runs.
rf = RandomForestRegressor(
    n_estimators=100,
    max_features=9,
    max_depth=10,
    min_samples_split=50,
    random_state=111,
)

In [None]:
rf.fit(X_train, y_train)

In [None]:
# Validation RMSE of the first random forest.
rmse3 = sqrt(mean_squared_error(y_true=y_test, y_pred=rf.predict(X_test)))

In [None]:
rmse3

In [None]:
# Predict using only the training feature columns: by this point df3 already carries
# the `price` column written by the decision-tree section, and passing that extra
# column to predict would not match the features the forest was fitted on.
y_pred_test3 = rf.predict(df3[X.columns]).round(1)

In [None]:
df3["price"] = y_pred_test3

In [None]:
df3.head()

In [None]:
# Submission frame for the first random forest: feature columns removed, the rest kept.
df_upload3 = df3.drop(
    columns=["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]
)

In [None]:
df_upload3.head()

In [None]:
df_upload3.to_csv("third_try.csv",index=False)

### Random Forest (200 trees, max depth 12)

In [49]:
# Larger random forest: 200 trees, depth <= 12, otherwise the same settings as `rf`.
# random_state is fixed (same seed as the train/test split) so the validation and
# cross-validation scores for this model are reproducible across runs.
rf2 = RandomForestRegressor(
    n_estimators=200,
    max_features=9,
    max_depth=12,
    min_samples_split=50,
    random_state=111,
)

In [50]:
rf2.fit(X_train, y_train)

RandomForestRegressor(max_depth=12, max_features=9, min_samples_split=50,
                      n_estimators=200)

### Trying cross validation

In [60]:
# 5-fold cross-validation of the rf2 configuration on the full training data (X, y).
# The 'neg_root_mean_squared_error' scorer reports RMSE with the sign flipped, which
# is why the values displayed for `scores` are negative.
scores = cross_val_score(rf2, X, y, scoring='neg_root_mean_squared_error', cv=5)

In [61]:
scores

array([-0.10635327, -0.10539858, -0.10164894, -0.10117233, -0.10075843])

In [62]:
scores.mean()

-0.1030663078669453

In [51]:
# Validation RMSE of the larger random forest.
rmse4 = sqrt(mean_squared_error(y_true=y_test, y_pred=rf2.predict(X_test)))

In [52]:
rmse4

0.10204868566219691

In [53]:
# Predict using only the training feature columns, so a `price` column already
# written into df3 by an earlier section does not get passed to the model.
y_pred_test5 = rf2.predict(df3[X.columns]).round(1)

In [54]:
df3["price"] = y_pred_test5

In [63]:
df3.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.32,5,2,3,60.5,58.0,4.43,4.49,2.7,6.2
1,1,1.24,4,2,3,62.9,60.0,6.8,6.74,4.26,8.5
2,2,1.66,4,7,3,62.0,59.0,7.55,7.6,4.7,9.5
3,3,0.75,4,7,2,60.6,56.0,5.94,5.9,3.59,7.8
4,4,1.5,1,6,2,64.8,55.0,7.26,7.15,4.67,8.9


In [None]:
# Submission frame for the larger random forest: feature columns removed, the rest kept.
df_upload5 = df3.drop(
    columns=["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]
)

In [None]:
# Show only the first rows, like the other submission previews, instead of dumping the whole frame.
df_upload5.head()

In [None]:
df_upload5.to_csv("fifth_try.csv",index=False)

### XGBoost

In [26]:
xg = XGBRegressor()

In [27]:
xg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [92]:
# XGBoost predictions for the validation split, rounded to 1 decimal before scoring.
xg_pred = xg.predict(X_test).round(1)

In [93]:
# Validation RMSE of the XGBoost model (computed on the rounded predictions).
rmse5 = sqrt(mean_squared_error(y_true=y_test, y_pred=xg_pred))

In [94]:
rmse5


0.09563909597439087

In [95]:
xg_pred_0 = xg.predict(df3).round(1)

ValueError: Feature shape mismatch, expected: 10, got 13485

In [None]:
df_upload4 = df3.drop(["carat","cut","color","clarity","depth","table","x","y","z",], axis = 1)

In [None]:
# Show only the first rows, like the other submission previews, instead of dumping the whole frame.
df_upload4.head()

In [None]:
df_upload4.to_csv("fourth_try.csv",index=False)

### CatBoost

In [150]:
# CatBoost regressor: 110 boosting iterations, learning rate 1, tree depth 7.
cb = CatBoostRegressor(
    iterations=110,
    learning_rate=1,
    depth=7,
)

In [162]:
# Fit on the training split. This call was commented out, which left `cb` unfitted on
# a fresh kernel, so every cb.predict below it failed. verbose=False suppresses the
# per-iteration training log so the cell can stay enabled without flooding the output.
cb.fit(X_train, y_train, verbose=False)

In [153]:
# CatBoost predictions for the validation split, rounded to 1 decimal before scoring.
pred_cb = cb.predict(X_test).round(1)

In [154]:
# Validation RMSE of the CatBoost model (computed on the rounded predictions).
rmse6 = sqrt(mean_squared_error(y_true=y_test, y_pred=pred_cb))
rmse6

0.09657658531469644

In [None]:
pred_cb[15:30]

In [None]:
y_test[15:30]

In [155]:
# Predict using only the training feature columns, so a `price` column already
# written into df3 by an earlier section does not get passed to the model.
y_pred_test10 = cb.predict(df3[X.columns]).round(1)

In [156]:
df3["price"] = y_pred_test10

In [157]:
y_pred_test10[0:10]

array([6.1, 8.5, 9.4, 7.8, 9. , 8.1, 7.1, 6.9, 6.4, 6.8])

In [158]:
# Submission frame for CatBoost: feature columns removed, the rest kept.
df_upload10 = df3.drop(
    columns=["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]
)

In [160]:
df_upload10.head()

Unnamed: 0,id,price
0,0,6.1
1,1,8.5
2,2,9.4
3,3,7.8
4,4,9.0


In [161]:
df_upload10.to_csv("catboost2.csv",index=False)

### Trying cross-validation (CatBoost)

In [106]:
# 5-fold cross-validation of the CatBoost configuration on the full training data.
# This assignment was commented out, so `scores2.mean()` in the next cell raised
# NameError on a fresh kernel.
# The scorer is negated mean absolute error, so the result is not directly comparable
# with the RMSE figures computed elsewhere in this notebook.
# NOTE(review): each fold may print CatBoost's training log.
scores2 = cross_val_score(cb, X, y, scoring='neg_mean_absolute_error', cv=5)

In [100]:
scores2.mean()

-0.0702187421867361

In [101]:
# Predict using only the training feature columns: df3 already holds a `price`
# column from the earlier CatBoost submission, which must not be fed to the model.
y_pred_test15 = cb.predict(df3[X.columns]).round(1)

In [102]:
df3["price"] = y_pred_test15

In [107]:
df3[30:50]

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
30,30,0.34,5,3,8,60.8,56.0,4.56,4.55,2.77,6.9
31,31,1.11,5,3,2,61.3,60.0,6.68,6.66,4.09,8.4
32,32,1.04,4,2,3,61.8,60.0,6.51,6.47,4.01,8.3
33,33,0.3,4,3,3,61.3,61.0,4.27,4.25,2.61,6.3
34,34,2.08,3,7,2,60.1,59.0,8.29,8.37,5.01,9.8
35,35,1.0,3,3,4,64.7,56.0,6.23,6.29,4.05,8.6
36,36,1.58,3,4,2,60.0,53.0,7.59,7.64,4.57,9.2
37,37,0.7,3,6,2,63.2,61.0,5.62,5.59,3.54,7.7
38,38,1.66,5,3,8,61.5,57.0,7.59,7.62,4.68,9.6
39,39,2.03,4,4,3,61.3,59.0,8.11,8.08,4.96,9.7
