In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('../data/diamonds_train.csv')

In [3]:
predict = pd.read_csv('../data/diamonds_test.csv')

In [4]:
train['area_xyz'] = train['z'] * train['x'] * train['y']

In [5]:
predict['area_xyz'] = predict['z'] * predict['x'] * predict['y']

In [10]:
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,area_xyz
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,197.096725
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,52.39575
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,113.43689
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,66.2688
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,168.429975


In [11]:
TARGET = 'price'

CAT_FEATURES = ['cut', 'color', 'clarity']
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z', 'area_xyz']

for categorical_feature in CAT_FEATURES:
    
    train[categorical_feature] = train[categorical_feature].astype('category') 
    predict[categorical_feature] = predict[categorical_feature].astype('category')

In [12]:
categorical_train_df = pd.get_dummies(train[CAT_FEATURES])
numerical_train_df = train[NUM_FEATURES]

train_df = pd.concat([categorical_train_df, numerical_train_df], axis=1)

In [13]:
categorical_predict_df = pd.get_dummies(predict[CAT_FEATURES])
numerical_predict_df = predict[NUM_FEATURES]

predict_df = pd.concat([categorical_predict_df, numerical_predict_df], axis=1)

In [14]:
FEATURES = categorical_train_df.columns.tolist() + numerical_train_df.columns.tolist()
FEATURES

['cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'area_xyz']

In [15]:
train_df[FEATURES]

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z,area_xyz
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25,197.096725
1,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75,52.395750
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65,113.436890
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0.41,63.8,56.0,4.68,4.72,3.00,66.268800
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95,168.429975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1.34,62.7,57.0,7.10,7.04,4.43,221.429120
40451,0,1,0,0,0,0,0,1,0,0,...,0,0,0,2.02,57.1,60.0,8.31,8.25,4.73,324.276975
40452,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1.01,62.7,56.0,6.37,6.42,4.01,163.990554
40453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.33,61.9,54.3,4.45,4.47,2.76,54.900540


In [16]:
train[TARGET]

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
X = scaler.fit_transform(train_df[FEATURES])
X

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.92198533,
         1.02265738,  0.85287564],
       [-0.17611318, -0.3155308 , -0.81809533, ..., -1.17981558,
        -1.1292594 , -0.98103352],
       [ 5.67816675, -0.3155308 , -0.81809533, ..., -0.17688154,
         0.16189067, -0.20741119],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.5993022 ,
         0.6783507 ,  0.43329512],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.10132509,
        -1.11491329, -0.94928835],
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  1.00047582,
         1.02265738,  0.88598559]])

In [20]:
y = train[TARGET]
y

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [69]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [74]:
max_depths = range(1, 20)
training_error = []
for max_depth in max_depths:
    model_1 = DecisionTreeRegressor(max_depth=max_depth)
    model_1.fit(X, y)
    training_error.append(mse(y, model_1.predict(X)))

In [79]:
training_error

[6238241.459971933,
 2731472.972565481,
 1995185.4450105845,
 1639327.2336345243,
 1360204.6230938043,
 1111299.613147137,
 896946.2614768189,
 731528.62880177,
 577153.721959999,
 455947.73763881926,
 358596.2088956144,
 275186.11988127604,
 214307.81664974222,
 165478.1483884433,
 124922.34300360462,
 92287.50517160787,
 66610.43386366882,
 46939.104378640535,
 31799.5214498413]

In [75]:
X_predict = scaler.fit_transform(predict_df[FEATURES])
X_predict

array([[-0.17325799, -0.31864666, -0.80857254, ...,  0.13323602,
         0.17309121, -0.05488267],
       [-0.17325799, -0.31864666,  1.23674742, ...,  1.01939517,
         0.87078717,  0.86756363],
       [-0.17325799, -0.31864666, -0.80857254, ...,  1.40044361,
         1.40431938,  1.53326171],
       ...,
       [-0.17325799, -0.31864666,  1.23674742, ..., -0.02627263,
        -0.01843317, -0.18485144],
       [-0.17325799, -0.31864666, -0.80857254, ...,  0.13323602,
        -0.12787567, -0.14581152],
       [-0.17325799, -0.31864666,  1.23674742, ..., -0.89470861,
        -0.82557163, -0.84779795]])

In [80]:
predictions = model.predict(X_predict)

In [81]:
from sklearn.metrics import mean_squared_error

In [82]:
diamond_id = predict['id']

In [83]:
submission = pd.DataFrame({'id': diamond_id, 'price': predictions})

In [84]:
submission.to_csv('../submissions/submission_3.csv', index=False)

In [85]:
submission.shape

(13485, 2)