# Modeling, Prediction & Evaluation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.tree import plot_tree
from sklearn import metrics
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../Data/diamonds_train_clean.csv")

In [3]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.14,5,4,6,61.0,56.0,9013
1,0.76,5,3,4,62.7,57.0,2692
2,0.84,5,4,5,61.4,56.0,4372
3,1.55,5,3,5,62.0,57.0,13665
4,0.3,5,4,2,61.9,57.0,422


In [4]:
X = df.drop(columns = 'price', axis = 1)

In [5]:
X.shape

(40455, 6)

In [6]:
y = df.price

In [7]:
y.shape

(40455,)

In [8]:
X_train, X_test, y_train, y_test = tts(X,y, test_size=0.2)

### 1. Trying different models to find the best one

In [9]:
models = {
    'lin': LinReg(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'sgd': SGDRegressor(),
    'knn': KNeighborsRegressor(),
    'grad': GradientBoostingRegressor(),
    'rfc': RandomForestRegressor()
}

In [10]:
for name, model in models.items():
    print("TRAINING: ", name)
    model.fit(X_train, y_train)

TRAINING:  lin
TRAINING:  ridge
TRAINING:  lasso
TRAINING:  sgd
TRAINING:  knn
TRAINING:  grad
TRAINING:  rfc


In [11]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------lin------
MAE -  849.7072576263424
MSE -  1463964.7539093392
RMSE -  1209.944111894983
R2 -  0.9059661859367857
------ridge------
MAE -  849.6187271381824
MSE -  1463961.2793561576
RMSE -  1209.9426760620347
R2 -  0.9059664091153061
------lasso------
MAE -  849.2474959923169
MSE -  1463999.245847013
RMSE -  1209.9583653361851
R2 -  0.9059639704405141
------sgd------
MAE -  245136809.22809717
MSE -  7.959545330635123e+16
RMSE -  282126661.81407106
R2 -  -5112598534.244655
------knn------
MAE -  1086.3781238413053
MSE -  3464037.302123347
RMSE -  1861.1924409161313
R2 -  0.7774969385662698
------grad------
MAE -  340.4847995355614
MSE -  373233.4919637326
RMSE -  610.9283852987456
R2 -  0.9760263567194766
------rfc------
MAE -  288.88555329087757
MSE -  300020.6804049377
RMSE -  547.741435720302
R2 -  0.9807289835353071


### 2. Now I try with Decision Tree

In [12]:
model = DecisionTreeRegressor(max_depth=10)

In [13]:
model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10)

In [14]:
y_pred = model.predict(X_test)

In [15]:
metrics.mean_squared_error(y_train, model.predict(X_train)).round(3)

281734.387

The model with the lowest RMSE is Random Forest Regressor, so I will continue with that model

### 3. Applying the model to the test dataset

In [16]:
test = pd.read_csv("../Data/diamonds_test_clean.csv")

In [17]:
test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table
0,1.0,1,1,2,56.3,64.0
1,0.83,4,4,3,62.3,58.0
2,1.0,1,6,2,67.0,53.0
3,1.0,1,3,2,66.5,62.0
4,1.2,3,2,3,62.6,57.0


I apply the RandomForestRegressor model as it is the one with the lowest RMSE from all the models tried.

In [18]:
forest = RandomForestRegressor()

In [19]:
forest.fit(X_train, y_train)

RandomForestRegressor()

In [20]:
y_pred = forest.predict(X_test)

In [21]:
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE:  547.320624528149


To improve my results, I will use GridSearchCV to find the best parameters in order to reduce a little bit more my RMSE

In [22]:
params = {'max_depth': [10, 200],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 7],
 'n_estimators': [300, 350, 400]}


In [23]:
grid = GridSearchCV(forest, params, verbose=1)

In [24]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [10, 200],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 7],
                         'n_estimators': [300, 350, 400]},
             verbose=1)

In [25]:
print(grid.best_params_)

{'max_depth': 200, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 400}


Now that I have my best parameters, I will try the model again with those ones.

In [26]:
forest = RandomForestRegressor(max_depth = 100, min_samples_leaf = 3, n_estimators = 200)

In [27]:
forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=100, min_samples_leaf=3, n_estimators=200)

In [28]:
y_pred = forest.predict(X_test)

In [29]:
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE:  537.8436603558305


In [30]:
model2 = forest.fit(X,y)

Finally, time to apply this model to my test dataset

In [31]:
price = model2.predict(test)

In [32]:
test['price'] = price

In [33]:
test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.0,1,1,2,56.3,64.0,3293.828971
1,0.83,4,4,3,62.3,58.0,3062.364947
2,1.0,1,6,2,67.0,53.0,3335.361612
3,1.0,1,3,2,66.5,62.0,3164.384607
4,1.2,3,2,3,62.6,57.0,5381.241642


In [34]:
test2 = test.drop(['carat', 'cut', 'color', 'clarity', 'depth', 'table'], axis=1)

In [35]:
test2.index.rename('id', inplace=True)

In [36]:
test2.sample()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
8700,554.997991


In [37]:
test2.to_csv('../Data/predictions.csv')