In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [10]:
train = pd.read_csv('../data/diamonds_train.csv')

In [11]:
predict = pd.read_csv('../data/diamonds_test.csv')

In [12]:
train.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.1+ MB


In [13]:
train.memory_usage(deep=True)

Index          128
carat       323640
cut        2560230
color      2346390
clarity    2431983
depth       323640
table       323640
price       323640
x           323640
y           323640
z           323640
dtype: int64

predict['z'].isna().sum()
predict.dropna(inplace=True)
predict.shape

In [14]:
train['area_xyz'] = train['z'] * train['x'] * train['y']

In [15]:
predict['area_xyz'] = predict['z'] * predict['x'] * predict['y']

In [16]:
train['area_xyz'].isnull().any()

False

In [17]:
TARGET = 'price'

CAT_FEATURES = ['cut', 'color', 'clarity']
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']

for categorical_feature in CAT_FEATURES:
    
    train[categorical_feature] = train[categorical_feature].astype('category') 
    predict[categorical_feature] = predict[categorical_feature].astype('category')

In [18]:
categorical_train_df = pd.get_dummies(train[CAT_FEATURES])
numerical_train_df = train[NUM_FEATURES]

train_df = pd.concat([categorical_train_df, numerical_train_df], axis=1)

In [19]:
categorical_predict_df = pd.get_dummies(predict[CAT_FEATURES])
numerical_predict_df = predict[NUM_FEATURES]

predict_df = pd.concat([categorical_predict_df, numerical_predict_df], axis=1)

In [20]:
train_df.shape

(40455, 26)

In [21]:
predict_df.shape

(13485, 26)

In [22]:
FEATURES = categorical_train_df.columns.tolist() + numerical_train_df.columns.tolist()
FEATURES

['cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z']

In [23]:
train_df[FEATURES]

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25
1,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75
2,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0.41,63.8,56.0,4.68,4.72,3.00
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1.34,62.7,57.0,7.10,7.04,4.43
40451,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2.02,57.1,60.0,8.31,8.25,4.73
40452,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1.01,62.7,56.0,6.37,6.42,4.01
40453,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0.33,61.9,54.3,4.45,4.47,2.76


In [24]:
train[TARGET]

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
scaler = StandardScaler()

In [27]:
X = scaler.fit_transform(train_df[FEATURES])
X

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.97880679,
         0.92198533,  1.02265738],
       [-0.17611318, -0.3155308 , -0.81809533, ..., -1.22673789,
        -1.17981558, -1.1292594 ],
       [ 5.67816675, -0.3155308 , -0.81809533, ..., -0.09728557,
        -0.17688154,  0.16189067],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.56971383,
         0.5993022 ,  0.6783507 ],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.13780463,
        -1.10132509, -1.11491329],
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.97880679,
         1.00047582,  1.02265738]])

In [28]:
y = train[TARGET]
y

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [29]:
X.shape

(40455, 26)

In [30]:
y.shape

(40455,)

In [31]:
train_df.head()

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25
1,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75
2,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0.41,63.8,56.0,4.68,4.72,3.0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95


In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [33]:
model = GradientBoostingClassifier()

In [34]:
model

GradientBoostingClassifier()

In [35]:
#errors = cross_val_score(model, X, y, cv=4, verbose=2, scoring='neg_mean_squared_error')

In [36]:
#errors

In [37]:
#total_errors = errors / 4
#total_errors

In [None]:
model.fit(X, y)

In [33]:
X_predict = scaler.fit_transform(predict_df[FEATURES])
X_predict

array([[-0.17325799, -0.31864666, -0.80857254, ...,  0.07502152,
         0.13323602,  0.17309121],
       [-0.17325799, -0.31864666,  1.23674742, ...,  0.96400677,
         1.01939517,  0.87078717],
       [-0.17325799, -0.31864666, -0.80857254, ...,  1.47584677,
         1.40044361,  1.40431938],
       ...,
       [-0.17325799, -0.31864666,  1.23674742, ...,  0.01216398,
        -0.02627263, -0.01843317],
       [-0.17325799, -0.31864666, -0.80857254, ...,  0.10196047,
         0.13323602, -0.12787567],
       [-0.17325799, -0.31864666,  1.23674742, ..., -0.93069917,
        -0.89470861, -0.82557163]])

In [34]:
predictions = model.predict(X_predict)

In [38]:
diamond_id = predict['id']

In [39]:
submission = pd.DataFrame({'id': diamond_id, 'price': predictions})

In [46]:
submission.to_csv('../submissions/submission_test.csv', index=False)

In [47]:
submission.shape

(13485, 2)