In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler

In [2]:
# Read the train and test CSV files, using each file's "id" column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

#### Encode categorical data

In [3]:
def encode_categorical_data(dataframe):
    """Replace the ``cut``, ``color`` and ``clarity`` labels with integer codes, in place.

    Each column is mapped through the fixed lookup table defined below
    (for example ``'Fair'`` -> 1 ... ``'Ideal'`` -> 5 for ``cut``).
    Missing values (NaN) stay NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``cut``, ``color`` and ``clarity`` columns.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If any of the three columns holds a non-null value that has no entry
        in its lookup table. The check runs for every column *before* any
        column is overwritten, so a failed call leaves ``dataframe`` untouched.
        This also catches an accidental second run on an already encoded
        frame, which would otherwise silently turn every value into NaN.
    """
    ordinal_maps = {
        'cut': {
            'Ideal': 5,
            'Premium': 4,
            'Very Good': 3,
            'Good': 2,
            'Fair': 1,
        },
        'color': {
            'D': 7,
            'E': 6,
            'F': 5,
            'G': 4,
            'H': 3,
            'I': 2,
            'J': 1,
        },
        'clarity': {
            'IF': 8,
            'VVS1': 7,
            'VVS2': 6,
            'VS1': 5,
            'VS2': 4,
            'SI1': 3,
            'SI2': 2,
            'I1': 1,
        },
    }

    # First pass: encode into temporaries and validate, without touching the frame.
    encoded_columns = {}
    for column, mapping in ordinal_maps.items():
        original = dataframe[column]
        encoded = original.map(mapping)
        # A value that was present but came back NaN has no code in the table.
        unmapped = original[encoded.isna() & original.notna()]
        if not unmapped.empty:
            unknown_values = sorted(unmapped.unique().tolist(), key=str)
            raise ValueError(
                f"Column {column!r} contains values with no ordinal code: "
                f"{unknown_values}. Has this dataframe already been encoded?"
            )
        encoded_columns[column] = encoded

    # Second pass: everything validated, now commit the encoded columns.
    for column, encoded in encoded_columns.items():
        dataframe[column] = encoded

    return None

In [4]:
# Apply the in-place label encoding to both splits.
for frame in (df_train, df_test):
    encode_categorical_data(frame)

#### Drop columns that don't provide information

In [5]:
# Pairwise Pearson correlations between all training columns,
# displayed as a colour-graded table (last expression of the cell).
corr_ = df_train.corr(method='pearson')
corr_styler = pd.DataFrame(corr_).style
corr_styler.background_gradient(cmap='coolwarm')

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
carat,1.0,-0.136191,-0.291,-0.354075,0.029877,0.182316,0.976032,0.968603,0.948812,0.920101
cut,-0.136191,1.0,0.020565,0.192416,-0.219379,-0.433781,-0.127371,-0.125646,-0.150608,-0.089483
color,-0.291,0.020565,1.0,-0.027502,-0.045797,-0.026552,-0.269683,-0.267296,-0.265799,-0.153779
clarity,-0.354075,0.192416,-0.027502,1.0,-0.068549,-0.162996,-0.373931,-0.365906,-0.366564,-0.215906
depth,0.029877,-0.219379,-0.045797,-0.068549,1.0,-0.2923,-0.023856,-0.027099,0.095304,0.002542
table,0.182316,-0.433781,-0.026552,-0.162996,-0.2923,1.0,0.195966,0.188074,0.151454,0.158621
x,0.976032,-0.127371,-0.269683,-0.373931,-0.023856,0.195966,1.0,0.991368,0.965504,0.958779
y,0.968603,-0.125646,-0.267296,-0.365906,-0.027099,0.188074,0.991368,1.0,0.959607,0.953428
z,0.948812,-0.150608,-0.265799,-0.366564,0.095304,0.151454,0.965504,0.959607,1.0,0.930799
price,0.920101,-0.089483,-0.153779,-0.215906,0.002542,0.158621,0.958779,0.953428,0.930799,1.0


In [6]:
# Remove the depth and table columns from both splits, in place.
columns_to_drop = ['depth', 'table']
for frame in (df_train, df_test):
    frame.drop(columns=columns_to_drop, inplace=True)

#### Standardization of the data

In [7]:
def standarize_data(dataframe, scaler=None, fit=True):
    """Standardize the feature columns of ``dataframe`` in place.

    The columns ``carat``, ``cut``, ``color``, ``clarity``, ``x``, ``y`` and
    ``z`` are passed through a scaler and overwritten with the result.

    Called as ``standarize_data(df)`` this fits a fresh ``StandardScaler`` on
    ``df`` itself. To scale a second frame (e.g. a test split) with the
    statistics learned from a first one, create the scaler yourself, pass it
    with the default ``fit=True`` for the first frame, and pass the same
    object with ``fit=False`` for the second.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns listed above. It is modified in place.
    scaler : object, optional
        Scaler exposing ``fit_transform`` / ``transform`` on 2-D arrays.
        When omitted, a new ``StandardScaler`` is created.
    fit : bool, default True
        If True the scaler is (re)fitted on ``dataframe`` before transforming;
        if False the scaler is only applied and must already be fitted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` was supplied.
    """
    to_standarize = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z']

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()

    # Scale all columns in one call; the scaler works column-wise, so this is
    # equivalent to fitting each column separately.
    values = dataframe[to_standarize].to_numpy()
    scaled = scaler.fit_transform(values) if fit else scaler.transform(values)
    dataframe[to_standarize] = scaled
    return None

In [8]:
# Fit the scaler on the training split only, then apply those same statistics
# to both splits. Scaling the test split with its own mean/std would put its
# features on a different scale from the one the model is trained on.
scaled_columns = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z']
feature_scaler = StandardScaler().fit(df_train[scaled_columns].to_numpy())
for frame in (df_train, df_test):
    frame[scaled_columns] = feature_scaler.transform(frame[scaled_columns].to_numpy())

#### Training the model and predicting

In [24]:
# Separate the features from the target column.
X = df_train.drop(columns="price")
y = df_train['price']

# Hold out 20% of the rows for validation. The fixed seed makes the split,
# and therefore the reported RMSE, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [25]:
# Random forest evaluated on the validation split.
# random_state pins the bootstrap sampling and feature selection so that
# re-running the cell yields the same forest and the same score.
model = RandomForestRegressor(n_estimators=400,
                              max_depth=80,
                              min_samples_split=8,
                              min_samples_leaf=3,
                              bootstrap=True,
                              n_jobs=-1,
                              random_state=42)

model.fit(X_train, y_train)

In [26]:
# Predict the target for the held-out validation rows (X_test).
y_pred = model.predict(X_test)

In [27]:
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
# gives the same RMSE on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('RMSE - ', rmse)

RMSE -  0.08777272685773295


#### Generate file for Kaggle submission

In [31]:
# Refit on the full training set before predicting the competition test set.
X = df_train.drop(columns="price")
y = df_train['price']

# NOTE(review): min_samples_leaf is 4 here but the model scored on the
# validation split used 3, so the reported RMSE does not describe exactly
# this configuration. Confirm which value is intended.
# random_state makes the submitted predictions reproducible.
model = RandomForestRegressor(n_estimators=400,
                              max_depth=80,
                              min_samples_split=8,
                              min_samples_leaf=4,
                              bootstrap=True,
                              n_jobs=-1,
                              random_state=42)

model.fit(X, y)
# Select the test columns in the training order so the features line up with
# what the model was fitted on.
y_pred = model.predict(df_test[X.columns])

In [32]:
# Pair each prediction with the real id of its test row. df_test was loaded
# with index_col="id", so its index carries the ids from the CSV. Building the
# column from a fresh 0..n-1 index would only be right if the ids in the file
# happened to be exactly that sequence.
df_submission = pd.DataFrame({"id": df_test.index, "price": y_pred})

In [33]:
# Write the submission table to disk; index=False keeps the row index out of the file.
df_submission.to_csv("../data/submission.csv", index=False)