In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
#scale

def scaling_feat(df, to_scale, not_to_scale, scale=1):
    """Return a frame with ``to_scale`` columns rescaled and the rest untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    to_scale : list of str
        Columns passed through the scaler.
    not_to_scale : list of str
        Columns copied to the result as they are.
    scale : int, default 1
        Scaler selector: ``2`` -> StandardScaler, ``3`` -> RobustScaler,
        any other value -> MinMaxScaler.

    Returns
    -------
    pandas.DataFrame
        ``not_to_scale`` columns followed by the scaled ``to_scale`` columns,
        carrying the same index as ``df``.
    """
    if scale == 2:
        scaler = StandardScaler()
    elif scale == 3:
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()

    # Reuse df's own index: fit_transform returns a bare array, and rebuilding
    # it with a fresh 0..n-1 index would misalign rows whenever df is not
    # indexed that way (e.g. after filtering or concatenation).
    scaled = pd.DataFrame(
        scaler.fit_transform(df[to_scale]),
        columns=to_scale,
        index=df.index,
    )

    # Both pieces share an identical index, so this is a plain side-by-side glue.
    return pd.concat([df[not_to_scale], scaled], axis=1)

In [3]:
def carat_to_bins(df, carat):
    """Write a carat-band code into the ``bin`` column of ``df`` and return ``df``.

    ``carat`` is the name of the column holding the carat value. Each band is
    closed on the left and open on the right. Rows whose value falls in no
    band (below 0, or 10 and above) are not assigned a code. ``df`` is
    modified in place.
    """
    # (lower bound, upper bound, code written to 'bin')
    bands = (
        (0, .5, 1.0),
        (.5, 1.0, 1.44),
        (1.0, 1.5, 2.3),
        (1.5, 2.0, 3.21),
        (2.0, 3.0, 4.12),
        (3.0, 4.0, 5.53),
        (4.0, 10.0, 8.39),
    )

    for lower, upper, code in bands:
        in_band = df[carat].between(lower, upper, 'left')
        df.loc[in_band, 'bin'] = code

    return df

In [4]:
# Read the raw train / test CSVs and discard each file's row-identifier column.
pre_x_train = (
    pd.read_csv('../data/diamonds_train.csv')
    .drop(columns=['Unnamed: 0'])
)
pre_x_test = (
    pd.read_csv('../data/diamonds_test.csv')
    .drop(columns=['id'])
)

In [5]:
# Append the test rows beneath the training rows so the scaler applied later
# is fit over both files at once; the index is renumbered 0..n-1.

pre_x_train = pd.concat([pre_x_train, pre_x_test], ignore_index=True)

In [6]:
# Add the 'bin' column (carat-band code) to the combined train+test frame.

pre_x_train = carat_to_bins(pre_x_train, 'carat')

In [8]:
# Rescale the numeric columns (default scaler of scaling_feat) and carry the
# categorical columns and the price through unchanged.

feat_to_scale = [
    'carat', 'depth', 'table',
    'x', 'y', 'z',
    'bin',
]
others = [
    'cut', 'color', 'clarity',
    'city', 'price',
]
pre_x_scaled = scaling_feat(df=pre_x_train, to_scale=feat_to_scale, not_to_scale=others)

In [9]:
#drop test rows
# Rows without a price are removed. .copy() detaches the result from the
# unfiltered frame, so columns can be added to it later without pandas
# raising SettingWithCopyWarning.
pre_x_scaled = pre_x_scaled[pre_x_scaled['price'].notna()].copy()

In [10]:
# Lookup tables holding the label assigned to each clarity / cut / color
# category. Each file carries a leftover 'Unnamed: 0' column, discarded here.

clar_lab_mm = (
    pd.read_csv('../data/scales/clar_lab_mm.csv')
    .drop(columns=['Unnamed: 0'])
)
cut_lab_mm = (
    pd.read_csv('../data/scales/cut_lab_mm.csv')
    .drop(columns=['Unnamed: 0'])
)
col_lab_mm = (
    pd.read_csv('../data/scales/col_lab_mm.csv')
    .drop(columns=['Unnamed: 0'])
)

In [11]:
# labeling cut, color & clarity

# Work on an independent copy: adding columns then neither triggers
# SettingWithCopyWarning nor alters pre_x_scaled, so this cell can be re-run.
pre_x_scaled2 = pre_x_scaled.copy()

# new column -> (source column, lookup table with that column and 'label')
label_sources = {
    'cut2': ('cut', cut_lab_mm),
    'color2': ('color', col_lab_mm),
    'clar2': ('clarity', clar_lab_mm),
}

for new_col, (src_col, table) in label_sources.items():
    # One vectorised mapping per column instead of a table scan per row.
    # If a category appears more than once in the table, its first row wins.
    lookup = table.drop_duplicates(subset=src_col).set_index(src_col)['label']

    # Fail loudly on categories the table does not know, rather than
    # silently producing NaN labels.
    unknown = ~pre_x_scaled2[src_col].isin(lookup.index)
    if unknown.any():
        raise KeyError(
            f"no label for {src_col} value(s): "
            f"{sorted(pre_x_scaled2.loc[unknown, src_col].astype(str).unique())}"
        )

    pre_x_scaled2[new_col] = pre_x_scaled2[src_col].map(lookup)

In [12]:
# Remove the raw categorical columns now that their encoded versions exist.
# Reassigning (rather than dropping in place) leaves any other reference to
# the same frame intact, so the encoding cell above can be re-run.
pre_x_scaled2 = pre_x_scaled2.drop(columns=['cut', 'color', 'clarity'])

In [13]:
# Drop the city column; it is not among the features used for modelling.
pre_x_scaled2 = pre_x_scaled2.drop(columns='city')

In [15]:
# Display the columns left after encoding and dropping.
pre_x_scaled2.columns

Index(['price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'cut2',
       'color2', 'clar2'],
      dtype='object')

In [16]:
# Model inputs: scaled numeric columns plus the encoded categorical labels.
feat = [
    'carat',
    'bin',
    'depth',
    'table',
    'clar2',
    'cut2',
    'color2',
]

# Column to predict, kept as a one-element list.
target = ['price']

In [17]:
# Feature matrix and target frame used by every model below.
X = pre_x_scaled2[feat]
y = pre_x_scaled2[target]

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
%%time
# XGBoost regression

regressor=xgb.XGBRegressor(learning_rate = 0.01,
                           n_estimators  = 900,
                           max_depth     = 6,
                           eval_metric='rmsle')
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(type(regressor), '\n')
print(rmse, '\n')

<class 'xgboost.sklearn.XGBRegressor'> 

563.548204959016 

CPU times: total: 1min 25s
Wall time: 11.6 s


In [36]:
## lightgbm regression

# Booster settings; anything not listed here is left at the library default.
params = dict(
    task='train',
    boosting='gbdt',
    objective='regression',
    num_leaves=15,
    metric={'l2', 'l1'},
    verbose=-1,
)

# Train on the whole of X / y (no hold-out split).
lgb_train = lgb.Dataset(X, y)

model_lightgbm = lgb.train(params, train_set=lgb_train)

In [43]:
%%time

## lightgbm regression

params = {
    'task': 'train', 
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 15,
#     'learning_rate': 0.05,
    'metric': {'l2','l1'},
    'verbose': -1
#         'task': 'train', 
#     'boosting': 'gbdt',
#     'objective': 'regression',
#     'num_leaves': 25,
#     'max_depth' :10,
#     'num_iterations': 500,
#     'min_data_in_leaf':25,
#     'metric': {'l2','l1'},
#     'verbose': -1
}


lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

model_lightgbm = lgb.train(params,
                 train_set=lgb_train,
                 valid_sets=lgb_eval,
                 early_stopping_rounds=30)


prediction = model_lightgbm.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(rmse, '\n')

[1]	valid_0's l2: 1.30653e+07	valid_0's l1: 2731.97
Training until validation scores don't improve for 30 rounds
[2]	valid_0's l2: 1.07785e+07	valid_0's l1: 2472.77
[3]	valid_0's l2: 8.92065e+06	valid_0's l1: 2242.62
[4]	valid_0's l2: 7.40731e+06	valid_0's l1: 2033.24
[5]	valid_0's l2: 6.18327e+06	valid_0's l1: 1848.44
[6]	valid_0's l2: 5.18525e+06	valid_0's l1: 1681.75
[7]	valid_0's l2: 4.36761e+06	valid_0's l1: 1533.07
[8]	valid_0's l2: 3.69084e+06	valid_0's l1: 1399.48
[9]	valid_0's l2: 3.1428e+06	valid_0's l1: 1283.31
[10]	valid_0's l2: 2.69341e+06	valid_0's l1: 1179.12
[11]	valid_0's l2: 2.32234e+06	valid_0's l1: 1085.54
[12]	valid_0's l2: 2.01309e+06	valid_0's l1: 1003.58
[13]	valid_0's l2: 1.76323e+06	valid_0's l1: 932.842
[14]	valid_0's l2: 1.55838e+06	valid_0's l1: 868.985
[15]	valid_0's l2: 1.38042e+06	valid_0's l1: 811.231
[16]	valid_0's l2: 1.2359e+06	valid_0's l1: 761.291
[17]	valid_0's l2: 1.10929e+06	valid_0's l1: 716.45
[18]	valid_0's l2: 1.00884e+06	valid_0's l1: 676.9



## Preparing TEST DF

In [27]:
#preparing test df

# Raw test rows with the carat-band column added.
to_predict = pd.read_csv('../data/diamonds_test.csv').drop(columns=['id'])

#carat to bins
to_predict = carat_to_bins(to_predict, 'carat')

#scaling columns
feat_to_scale = ['carat', 'depth', 'table', 'x', 'y', 'z', 'bin']
others = ['cut', 'color', 'clarity', 'city']

# The models were trained on features scaled by a scaler fit on the combined
# train+test frame (pre_x_train). Fitting a new scaler on the test rows alone
# would use different statistics and feed the models inputs on another scale
# than the one they learned, so the test rows are taken from that same
# combined scaling instead.
combined_scaled = scaling_feat(pre_x_train, feat_to_scale, others + ['price'])

# Test rows are the ones without a price, mirroring how the training rows
# were selected (price not null).
is_test_row = combined_scaled['price'].isna()
to_predict_s = (
    combined_scaled[is_test_row]
    .drop(columns=['price'])
    .reset_index(drop=True)
)

# Guards against stale kernel state (e.g. the concat cell having run twice).
assert len(to_predict_s) == len(to_predict), (
    "rows without a price in pre_x_train do not match the test file; "
    "restart the kernel and run all cells in order"
)

In [28]:
#labeling cut, color, clarity

# new column -> (source column, lookup table with that column and 'label')
label_sources = {
    'cut2': ('cut', cut_lab_mm),
    'color2': ('color', col_lab_mm),
    'clar2': ('clarity', clar_lab_mm),
}

for new_col, (src_col, table) in label_sources.items():
    # One vectorised mapping per column instead of a table scan per row.
    # If a category appears more than once in the table, its first row wins.
    lookup = table.drop_duplicates(subset=src_col).set_index(src_col)['label']

    # Fail loudly on categories the table does not know, rather than
    # silently producing NaN labels.
    unknown = ~to_predict_s[src_col].isin(lookup.index)
    if unknown.any():
        raise KeyError(
            f"no label for {src_col} value(s): "
            f"{sorted(to_predict_s.loc[unknown, src_col].astype(str).unique())}"
        )

    to_predict_s[new_col] = to_predict_s[src_col].map(lookup)

# The raw categorical columns are no longer needed once encoded.
to_predict_s = to_predict_s.drop(columns=['cut', 'color', 'clarity'])

In [29]:
# Remove the city column, which is not among the model features.
to_predict_s = to_predict_s.drop(columns=['city'])

In [30]:
# Display the columns of the prepared test frame.
to_predict_s.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'cut2', 'color2',
       'clar2'],
      dtype='object')

In [31]:
# Keep only the feature columns listed in `feat`, in that order.
to_predict_X = to_predict_s[feat]

### Prediction

In [32]:
# Predict the target for the prepared test rows with the LightGBM booster.
light_pred = model_lightgbm.predict(to_predict_X)

## Save prediction

In [None]:
# Write the LightGBM predictions as an (id, Price) table, where id is the
# 0-based row position of each prediction.
# The previous version read `rforest_pred`, a name never defined in this
# notebook; the predictions produced above live in `light_pred`.
prices = pd.DataFrame(light_pred, columns=['Price']).rename_axis('id')
prices.to_csv('../data/prices_lightgbm.csv')