# Create a model to predict the price of diamonds
preprocessing data

evaluate the model

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.base import clone

In [2]:
diamonds = sns.load_dataset('diamonds')
print(diamonds.shape)
diamonds.head()

(37809, 10)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
diamonds.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [4]:
X = diamonds[['carat', 'cut', 'color', 'clarity', 'depth', 'table']]
y = diamonds['price']

In [6]:
sns.scatterplot(X)



ValueError: If using all scalar values, you must pass an index

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= .2, random_state= 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [17]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

column_transformer = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, ['carat','depth', 'table']),
    ('categoric', categorical_transformer, ['cut', 'color', 'clarity'] )
])

In [20]:
def mse(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

1582710.1630482445

In [19]:
model_1 = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('model', LinearRegression())
])

mse(model_1)

In [21]:
model_2 = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('model', LinearRegression())
])

mse(model_2)

In [23]:
model_3 = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('model', Ridge())
])

mse(model_3)

In [32]:
model_4 = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('poly_features', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=0.5))
])

mse(model_4)

In [None]:
def gs(model):
    gridsearch = GridSearchCV(estimator=model,
                         param_grid={
                             'model__alpha':[.01, .1, 1., 10., 100]
                         })
    gridsearch.fit(X_train, y_train);
    
    best_gs_model = clone(gridsearch.best_estimator_)
    return mse(best_gs_model), gridsearch.best_params_


In [None]:
gs(model_4)