In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier
from IPython.display import set_matplotlib_formats
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [4]:
train = pd.read_csv('../data/diamonds_train.csv')

In [5]:
predict = pd.read_csv('../data/diamonds_test.csv')

In [6]:
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [7]:
train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797706,61.752841,57.446133,3928.444469,5.729392,5.732819,3.537154
std,0.475544,1.431725,2.233535,3992.416147,1.124453,1.14665,0.697062
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2397.0,5.69,5.71,3.52
75%,1.04,62.5,59.0,5331.0,6.54,6.54,4.035
max,4.5,79.0,95.0,18823.0,10.23,58.9,8.06


predict['z'].isna().sum()
predict.dropna(inplace=True)
predict.shape

In [8]:
train['area_xyz'] = train['z'] * train['x'] * train['y']

In [9]:
predict['area_xyz'] = predict['z'] * predict['x'] * predict['y']

In [10]:
train['area_xyz'].isnull().any()

False

In [11]:
TARGET = 'price'

CAT_FEATURES = ['cut', 'color', 'clarity']
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']

for categorical_feature in CAT_FEATURES:
    
    train[categorical_feature] = train[categorical_feature].astype('category') 
    predict[categorical_feature] = predict[categorical_feature].astype('category')

In [12]:
categorical_train_df = pd.get_dummies(train[CAT_FEATURES])
numerical_train_df = train[NUM_FEATURES]

train_df = pd.concat([categorical_train_df, numerical_train_df], axis=1)

In [13]:
categorical_predict_df = pd.get_dummies(predict[CAT_FEATURES])
numerical_predict_df = predict[NUM_FEATURES]

predict_df = pd.concat([categorical_predict_df, numerical_predict_df], axis=1)

In [14]:
train_df.shape

(40455, 26)

In [15]:
predict_df.shape

(13485, 26)

In [16]:
FEATURES = categorical_train_df.columns.tolist() + numerical_train_df.columns.tolist()
FEATURES

['cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z']

In [17]:
train_df[FEATURES]

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25
1,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75
2,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0.41,63.8,56.0,4.68,4.72,3.00
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1.34,62.7,57.0,7.10,7.04,4.43
40451,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2.02,57.1,60.0,8.31,8.25,4.73
40452,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,1.01,62.7,56.0,6.37,6.42,4.01
40453,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0.33,61.9,54.3,4.45,4.47,2.76


In [18]:
train[TARGET]

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()

In [21]:
X = scaler.fit_transform(train_df[FEATURES])
X

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.97880679,
         0.92198533,  1.02265738],
       [-0.17611318, -0.3155308 , -0.81809533, ..., -1.22673789,
        -1.17981558, -1.1292594 ],
       [ 5.67816675, -0.3155308 , -0.81809533, ..., -0.09728557,
        -0.17688154,  0.16189067],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.56971383,
         0.5993022 ,  0.6783507 ],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.13780463,
        -1.10132509, -1.11491329],
       [-0.17611318, -0.3155308 ,  1.22235144, ...,  0.97880679,
         1.00047582,  1.02265738]])

In [22]:
y = train[TARGET]
y

0         4268
1          505
2         2686
3          738
4         4882
         ...  
40450    10070
40451    12615
40452     5457
40453      456
40454     6232
Name: price, Length: 40455, dtype: int64

In [23]:
X.shape

(40455, 26)

In [24]:
y.shape

(40455,)

In [25]:
train_df.head()

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,carat,depth,table,x,y,z
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1.21,62.4,58.0,6.83,6.79,4.25
1,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0.32,63.0,57.0,4.35,4.38,2.75
2,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0.71,65.5,55.0,5.62,5.53,3.65
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0.41,63.8,56.0,4.68,4.72,3.0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1.02,60.5,59.0,6.55,6.51,3.95


In [26]:
from sklearn.model_selection import cross_val_score

In [27]:
model = LGBMClassifier()

In [28]:
model

LGBMClassifier()

In [29]:
#errors = cross_val_score(model, X, y, cv=4, verbose=2, scoring='neg_mean_squared_error')

In [30]:
#errors

In [31]:
#total_errors = errors / 4
#total_errors

In [None]:
model.fit(X, y)
plot_decision_boundary(X, y, model)

In [32]:
X_predict = scaler.fit_transform(predict_df[FEATURES])
X_predict

array([[-0.17325799, -0.31864666, -0.80857254, ...,  0.07502152,
         0.13323602,  0.17309121],
       [-0.17325799, -0.31864666,  1.23674742, ...,  0.96400677,
         1.01939517,  0.87078717],
       [-0.17325799, -0.31864666, -0.80857254, ...,  1.47584677,
         1.40044361,  1.40431938],
       ...,
       [-0.17325799, -0.31864666,  1.23674742, ...,  0.01216398,
        -0.02627263, -0.01843317],
       [-0.17325799, -0.31864666, -0.80857254, ...,  0.10196047,
         0.13323602, -0.12787567],
       [-0.17325799, -0.31864666,  1.23674742, ..., -0.93069917,
        -0.89470861, -0.82557163]])

In [2]:
predictions = model.predict(X_predict)

NameError: name 'model' is not defined

In [38]:
diamond_id = predict['id']

In [39]:
submission = pd.DataFrame({'id': diamond_id, 'price': predictions})

In [46]:
submission.to_csv('../submissions/submission_test.csv', index=False)

In [47]:
submission.shape

(13485, 2)