In [1]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly.express as px

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## 1. Data Loading

In [3]:
train = pd.read_csv('../data/diamonds_train.csv')

In [4]:
predict = pd.read_csv('../data/diamonds_test.csv')

## 2. EDA

In [5]:
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [6]:
train.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [7]:
train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797706,61.752841,57.446133,3928.444469,5.729392,5.732819,3.537154
std,0.475544,1.431725,2.233535,3992.416147,1.124453,1.14665,0.697062
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2397.0,5.69,5.71,3.52
75%,1.04,62.5,59.0,5331.0,6.54,6.54,4.035
max,4.5,79.0,95.0,18823.0,10.23,58.9,8.06


In [8]:
train['area_xyz'] = 2 * train['z'] / train['x'] + train['y']

In [9]:
predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19


In [10]:
predict['area_xyz'] = 2 * predict['z'] / predict['x'] + predict['y']

In [11]:
#train['price/carat'] = train['price'] / train['carat']

In [12]:
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,area_xyz
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,8.03451
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,5.644368
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,6.828932
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,6.002051
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,7.716107


## 3. modeling

In [13]:
TARGET = 'price'

CAT_FEATURES = ['cut', 'color', 'clarity']
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z', 'area_xyz']

for categorical_feature in CAT_FEATURES:
    
    train[categorical_feature] = train[categorical_feature].astype('category') 
    predict[categorical_feature] = predict[categorical_feature].astype('category')

In [14]:
categorical_train_df = pd.get_dummies(train[CAT_FEATURES])
numerical_train_df = train[NUM_FEATURES]

train_df = pd.concat([categorical_train_df, numerical_train_df], axis=1)

In [15]:
categorical_predict_df = pd.get_dummies(predict[CAT_FEATURES])
numerical_predict_df = predict[NUM_FEATURES]

predict_df = pd.concat([categorical_predict_df, numerical_predict_df], axis=1)

In [16]:
train_df.shape

(40455, 27)

In [17]:
predict_df.shape

(13485, 27)

In [18]:
FEATURES = categorical_train_df.columns.tolist() + numerical_train_df.columns.tolist()
FEATURES

['cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'area_xyz']

In [19]:
train_df[FEATURES]

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z,area_xyz
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25,8.034510
1,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75,5.644368
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65,6.828932
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0.41,63.8,56.0,4.68,4.72,3.00,6.002051
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95,7.716107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1.34,62.7,57.0,7.10,7.04,4.43,8.287887
40451,0,1,0,0,0,0,0,1,0,0,...,0,0,0,2.02,57.1,60.0,8.31,8.25,4.73,9.388387
40452,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1.01,62.7,56.0,6.37,6.42,4.01,7.679027
40453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0.33,61.9,54.3,4.45,4.47,2.76,5.710449


In [20]:
train[TARGET]

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
scaler = StandardScaler()

In [23]:
X = scaler.fit_transform(train_df[FEATURES])
X

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.92198533,
         1.02265738,  0.93119836],
       [-0.17611318, -0.3155308 , -0.81809533, ..., -1.17981558,
        -1.1292594 , -1.15676979],
       [ 5.67816675, -0.3155308 , -0.81809533, ..., -0.17688154,
         0.16189067, -0.12196374],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.5993022 ,
         0.6783507 ,  0.62065743],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.10132509,
        -1.11491329, -1.09904253],
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  1.00047582,
         1.02265738,  1.00982011]])

In [24]:
y = train[TARGET]

In [25]:
X.shape

(40455, 27)

In [26]:
y.shape

(40455,)

In [36]:
train_df.head()

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z,area_xyz
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25,8.03451
1,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75,5.644368
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65,6.828932
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0.41,63.8,56.0,4.68,4.72,3.0,6.002051
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95,7.716107


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
diamonds_train, diamonds_test = train_test_split(train_df)

In [40]:
print(diamonds_train.shape)
print(diamonds_test.shape)

(30341, 27)
(10114, 27)


In [43]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [49]:
model = DecisionTreeRegressor(max_depth=4)

In [68]:
np.where(np.isnan(X))

(array([ 6465, 14815, 26192, 28029, 34423]), array([26, 26, 26, 26, 26]))

In [69]:
np.nan_to_num(X)

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.92198533,
         1.02265738,  0.93119836],
       [-0.17611318, -0.3155308 , -0.81809533, ..., -1.17981558,
        -1.1292594 , -1.15676979],
       [ 5.67816675, -0.3155308 , -0.81809533, ..., -0.17688154,
         0.16189067, -0.12196374],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.5993022 ,
         0.6783507 ,  0.62065743],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.10132509,
        -1.11491329, -1.09904253],
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  1.00047582,
         1.02265738,  1.00982011]])

In [70]:
np.where(np.isnan(X))

(array([ 6465, 14815, 26192, 28029, 34423]), array([26, 26, 26, 26, 26]))

In [72]:
X.fillna(X.mean(), inplace=True)

AttributeError: 'numpy.ndarray' object has no attribute 'fillna'