In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import math
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
#error metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
def error_metrics(y_true, y_pred):
    "give some error metrics base on a y_true and y _predict"
    mean_abs = "Mean Absolute Error: {}".format(mean_absolute_error(y_true, y_pred))
    mean_squared = "Mean Square Error: {}".format(mean_squared_error(y_true, y_pred))
    root_mean_squared = "Root Mean Square Error: {}".format(mean_squared_error(y_true, y_pred,squared=False))
    r2 = "r2 score: {}".format(r2_score(y_true, y_pred))
    return mean_abs, mean_squared, r2,root_mean_squared

# Data Analysing

In [3]:
diamonds = pd.read_csv('../data/diamonds_train.csv')
diamonds_predict = pd.read_csv('../data/diamonds_predict.csv')

In [4]:
#Drop Duplicates
diamonds.drop_duplicates(inplace=True)

# Add new data and Data Cleaning

We observed some data with 0 as size dimension, so let's get rid of them

In [5]:
diamonds_predict['x'] = diamonds_predict['x'].apply(lambda x: diamonds_predict['x'].median() if x==0 else x)
diamonds_predict['y'] = diamonds_predict['y'].apply(lambda x: diamonds_predict['y'].median() if x==0 else x)
diamonds_predict['z'] = diamonds_predict['z'].apply(lambda x: diamonds_predict['z'].median() if x==0 else x)

For better prediction, we will delete those diamonds that are so big or size's outlayer

In [6]:
diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0) | (diamonds['z']>0)]
diamonds = diamonds.loc[~((diamonds['y'] > 20) | (diamonds['z'] > 20))]

Let's calculate ratio, carat log and volume as sintetyc variables for our prediction

In [7]:
diamonds['ratio_lw'] = diamonds['x']/diamonds['y']
diamonds_predict['ratio_lw'] = diamonds_predict['x']/diamonds_predict['y']

diamonds['carat_log'] = diamonds['carat'].apply(lambda x : math.log(x))
diamonds_predict['carat_log'] = diamonds_predict['carat'].apply(lambda x : math.log(x))

diamonds['volumen'] = (((diamonds['x']**2) * diamonds['z'] ) / 3)
diamonds_predict['volumen'] = (((diamonds_predict['x']**2) * diamonds_predict['z'] ) / 3)

In [8]:
def diamond_shape(df):
    dshape = []
    for i in df['table'].index:
        if 54<df['table'][i]<57 and 61<df['depth'][i]<62.5:
            dshape.append('round')
        elif 52<df['table'][i]<60 and 60<df['depth'][i]<68:
            dshape.append('oval')
        elif 63<df['table'][i]<69 and 69<df['depth'][i]<76:
            dshape.append('princess')
        elif 58<df['table'][i]<63 and 58<df['depth'][i]<66:
            dshape.append('cushion')
        else:
            dshape.append('others')
    return dshape

diamonds['shape'] = diamond_shape(diamonds)
diamonds_predict['shape'] = diamond_shape(diamonds_predict)

# Transform your data

We will rate those categorical variables. The top characteristic is rate with the higher punctuation

In [9]:
diamonds['cut']=diamonds['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4})
diamonds['color']=diamonds['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6})
diamonds['clarity']=diamonds['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7})
diamonds['shape']=diamonds['shape'].map({'round':4, 'oval': 1, 'princess': 3, 'cushion':2, 'others':0})

In [10]:
diamonds_predict['cut']=diamonds_predict['cut'].map({'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4})
diamonds_predict['color']=diamonds_predict['color'].map({'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6})
diamonds_predict['clarity']=diamonds_predict['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7})
diamonds_predict['shape']=diamonds_predict['shape'].map({'round':4, 'oval': 1, 'princess': 3, 'cushion':2, 'others':0})

# Training your Supervised Model

In [11]:
scaler = StandardScaler()

In [12]:
NUM_FEATS = ['ratio_lw','carat_log','volumen']
CAT_FEATS = ['cut','color', 'clarity','shape']
FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'price'

In [13]:
X_raw = diamonds[FEATS]
y_raw = diamonds[TARGET]

In [14]:
X = scaler.fit_transform(X_raw)
y = y_raw

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
model = lgb.LGBMRegressor()


In [17]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error',cv=10)

In [18]:
np.mean(-scores)

521.5589759288007

In [19]:
param_grid = {
    'n_estimators':[115, 116,117], 
    'max_depth:':['auto', 10],
    'num_leaves': [40, 60,73],
    'learning_rate': [0.005, 0.1,0.2],
    'bagging_fraction': [0.70, 0.75],
    'max_bin': [128, 256],
    'feature_fraction' : [0.67,0.75,],
    'bagging_frequency' : [0.70, 0.80, 0.85],
    'min_data_in_leaf': [20, 25]
}


grid_search = GridSearchCV(model, 
                           param_grid=param_grid,
                           cv=10, 
                           verbose=10, 
                           scoring='neg_root_mean_squared_error',
                            n_jobs=-1)

In [20]:
grid_search.fit(X, y)

Fitting 10 folds for each of 3456 candidates, totalling 34560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=10, estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'bagging_fraction': [0.7, 0.75],
                         'bagging_frequency': [0.7, 0.8],
                         'feature_fraction': [0.67, 0.75, 0.8],
                         'learning_rate': [0.005, 0.1, 0.2],
                         'max_bin': [128, 256], 'max_depth:': ['auto', 1],
                         'min_data_in_leaf': [20, 25],
                         'n_estimators': [114, 115, 116],
                         'num_leaves': [31, 40, 60, 73]},
             scoring='neg_root_mean_squared_error', verbose=10)

In [21]:
grid_search.best_estimator_

LGBMRegressor(bagging_fraction=0.7, bagging_frequency=0.7, feature_fraction=0.8,
              max_bin=256, max_depth:='auto', min_data_in_leaf=25,
              n_estimators=115, num_leaves=40)

In [22]:
grid_search.best_score_

-520.3540576554394

In [23]:
from sklearn.metrics import mean_squared_error
y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)
print(f"test error: {mean_squared_error(y_pred=y_test_pred, y_true=y_test, squared=False)}")
print(f"train error: {mean_squared_error(y_pred=y_train_pred, y_true=y_train, squared=False)}")

test error: 456.27373321328406
train error: 451.2072747087676


In [30]:
diamonds_predict

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,ratio_lw,carat_log,volumen,shape
0,0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115,-0.235722,41.437236,2
1,1,1.20,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389,0.182322,64.617366,1
2,2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197,0.451076,82.967436,2
3,3,0.90,2,4,2,63.8,54.0,6.09,6.13,3.90,0.993475,-0.105361,48.214530,1
4,4,0.50,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141,-0.693147,27.117658,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,4,5,2,61.9,56.0,5.35,5.32,3.30,1.005639,-0.562119,31.484750,4
13481,13481,0.71,4,1,3,62.2,55.0,5.71,5.73,3.56,0.996510,-0.342490,38.690199,4
13482,13482,0.70,4,4,4,61.6,55.0,5.75,5.71,3.53,1.007005,-0.356675,38.903542,4
13483,13483,0.70,2,4,1,58.8,57.0,5.85,5.89,3.45,0.993209,-0.356675,39.355875,0


In [31]:
X_test = scaler.transform(diamonds_predict[FEATS])

In [32]:
preds = grid_search.predict(X_test)

In [33]:
from datetime import datetime
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d-%H-%M")

In [34]:
output = pd.DataFrame({'id': diamonds_predict['id'], 'price': preds})


In [35]:
output

Unnamed: 0,id,price
0,0,2900.628711
1,1,5826.519210
2,2,9386.050677
3,3,4032.168131
4,4,1596.410054
...,...,...
13480,13480,1682.407747
13481,13481,2545.651645
13482,13482,3129.805837
13483,13483,2178.139254


In [36]:
output.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3951.725175
std,3892.928525,3955.212518
min,0.0,314.426112
25%,3371.0,939.828934
50%,6742.0,2462.401611
75%,10113.0,5335.140108
max,13484.0,18358.884993


In [37]:
# output.price.clip(350, 19000, inplace=True)

In [38]:
output.to_csv(f'../solutions_v2/good-lgbm-3v-{dt_string}.csv', index=False)