# Modeling, Prediction & Evaluation

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.tree import plot_tree
from sklearn import metrics
import numpy as np

In [2]:
df = pd.read_csv("../Data/diamonds_train_clean.csv")

In [3]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.14,5,4,6,61.0,56.0,9013
1,0.76,5,3,4,62.7,57.0,2692
2,0.84,5,4,5,61.4,56.0,4372
3,1.55,5,3,5,62.0,57.0,13665
4,0.3,5,4,2,61.9,57.0,422


In [4]:
X = df.drop(columns = 'price', axis = 1)

In [5]:
X.shape

(40455, 6)

In [7]:
y = df.price

In [8]:
y.shape

(40455,)

In [9]:
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

### 1. Trying different models to find the best one

In [15]:
models = {
    'lin': LinReg(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
}

In [16]:
for name, model in models.items():
    print("TRAINING: ", name)
    model.fit(X_train, y_train)

TRAINING:  lin
TRAINING:  ridge
TRAINING:  lasso
TRAINING:  sgd
TRAINING:  knn
TRAINING:  grad


In [19]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------lin------
MAE -  854.5346539129436
MSE -  1527805.0360724356
RMSE -  1236.0441076565332
R2 -  0.9049273099044264
------ridge------
MAE -  854.4504338920148
MSE -  1527813.2470820385
RMSE -  1236.047429139367
R2 -  0.9049267989473652
------lasso------
MAE -  854.0590499981647
MSE -  1527824.2574420278
RMSE -  1236.0518829895564
R2 -  0.904926113791525
------sgd------
MAE -  37117468.41567463
MSE -  2099524016663359.5
RMSE -  45820563.25126699
R2 -  -130649781.83982153
------knn------
MAE -  1107.1528612038069
MSE -  3548843.1178989
RMSE -  1883.8373384926047
R2 -  0.7791615723344071
------grad------
MAE -  342.9747822723377
MSE -  390891.18194472865
RMSE -  625.2129092914898
R2 -  0.975675511387461


### 2. Now I try with Decision Tree

In [37]:
model = DecisionTreeRegressor(max_depth=11)

In [38]:
model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=11)

In [39]:
y_pred = model.predict(X_test)

In [40]:
metrics.mean_squared_error(y_train, model.predict(X_train)).round(3)

222788.639