In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
#scale

def scaling_feat(df, to_scale, not_to_scale, scale=1):
    
    scalable = df[to_scale]
    not_scalable = df[not_to_scale]

    if scale == 2:
        scaler = StandardScaler()
    elif scale ==3:
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()

    scaled = pd.DataFrame(scaler.fit_transform(scalable), columns=to_scale)
    
    return not_scalable.join(scaled)

In [3]:
def carat_to_bins(df, carat):
    
    df.loc[df[carat].between(0, .5, 'left'), 'bin'] = 1.0
    df.loc[df[carat].between(.5, 1.0, 'left'), 'bin'] = 1.44
    df.loc[df[carat].between(1.0, 1.5, 'left'), 'bin'] = 2.3
    df.loc[df[carat].between(1.5, 2.0, 'left'), 'bin'] = 3.21
    df.loc[df[carat].between(2.0, 3.0, 'left'), 'bin'] = 4.12
    df.loc[df[carat].between(3.0, 4.0, 'left'), 'bin'] = 5.53
    df.loc[df[carat].between(4.0, 10.0, 'left'), 'bin'] = 8.39
    
    return df

In [4]:
pre_x_train = pd.read_csv('../data/diamonds_train.csv')
pre_x_test = pd.read_csv('../data/diamonds_test.csv')
pre_x_train.drop(columns=['Unnamed: 0'], inplace=True)
pre_x_test.drop(columns=['id'], inplace=True)

In [5]:
#unify with test to scale features

pre_x_train = pd.concat([pre_x_train, pre_x_test])
pre_x_train.reset_index(inplace=True)
pre_x_train.drop(columns=['index'], inplace=True)

In [6]:
#columns bin for carat

pre_x_train = carat_to_bins(pre_x_train, 'carat')

In [7]:
#top size
pre_x_train['xy'] = pre_x_train['x']*pre_x_train['y']

In [8]:
#scaling features

feat_to_scale = ['carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'xy']
others = ['cut', 'color', 'clarity', 'city', 'price']
pre_x_scaled = scaling_feat(pre_x_train, feat_to_scale, others)

In [9]:
#drop test rows
pre_x_scaled = pre_x_scaled[pre_x_scaled['price'].notna()]

In [10]:
#labeling cut, color, clarity

clar_lab_mm = pd.read_csv('../data/scales/clar_lab_mm.csv')
cut_lab_mm = pd.read_csv('../data/scales/cut_lab_mm.csv')
col_lab_mm = pd.read_csv('../data/scales/col_lab_mm.csv')

# clar_lab_rob = pd.read_csv('../data/scales/clar_lab_mm.csv')
# cut_lab_rob = pd.read_csv('../data/scales/cut_lab_mm.csv')
# col_lab_rob = pd.read_csv('../data/scales/col_lab_mm.csv')

clar_lab_mm.drop(columns=['Unnamed: 0'], inplace=True)
cut_lab_mm.drop(columns=['Unnamed: 0'], inplace=True)
col_lab_mm.drop(columns=['Unnamed: 0'], inplace=True)

# clar_lab_rob.drop(columns=['Unnamed: 0'], inplace=True)
# cut_lab_rob.drop(columns=['Unnamed: 0'], inplace=True)
# col_lab_rob.drop(columns=['Unnamed: 0'], inplace=True)

In [11]:
pre_x_scaled2 = pre_x_scaled

cut2 = [cut_lab_mm[cut_lab_mm['cut'] == c]['label'].iloc[0] for c in pre_x_scaled2['cut']]
color2 = [col_lab_mm[col_lab_mm['color'] == c]['label'].iloc[0] for c in pre_x_scaled2['color']]
clarity2 = [clar_lab_mm[clar_lab_mm['clarity'] == c]['label'].iloc[0] for c in pre_x_scaled2['clarity']]

pre_x_scaled2['cut2'] = cut2
pre_x_scaled2['color2'] = color2
pre_x_scaled2['clar2'] = clarity2

In [12]:
pre_x_scaled2.drop(columns=['cut', 'color', 'clarity'], inplace=True)

In [13]:
#droping city column
pre_x_scaled2 = pre_x_scaled2.drop(columns='city')

In [14]:
pre_x_scaled2

Unnamed: 0,price,carat,depth,table,x,y,z,bin,xy,cut2,color2,clar2
0,4268.0,0.209979,0.538889,0.288462,0.635940,0.115280,0.133648,0.175913,0.097326,0.604337,0.007570,0.524187
1,505.0,0.024948,0.555556,0.269231,0.405028,0.074363,0.086478,0.000000,0.039985,0.540255,0.442759,0.524187
2,2686.0,0.106029,0.625000,0.230769,0.523277,0.093888,0.114780,0.059540,0.065223,0.130465,0.590861,0.596888
3,738.0,0.043659,0.577778,0.250000,0.435754,0.080136,0.094340,0.000000,0.046358,0.349414,0.982139,0.323826
4,4882.0,0.170478,0.486111,0.307692,0.609870,0.110526,0.124214,0.175913,0.089487,0.908900,0.590861,0.323826
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,5850.0,0.182952,0.525000,0.211538,0.618250,0.112224,0.128931,0.175913,0.092110,0.908900,0.442759,0.524187
40451,6300.0,0.197505,0.522222,0.230769,0.626629,0.114771,0.131132,0.175913,0.095477,0.908900,0.226905,0.323826
40452,1800.0,0.068607,0.511111,0.269231,0.482309,0.088285,0.100314,0.059540,0.056529,0.908900,0.739817,0.596888
40453,2368.0,0.095634,0.494444,0.288462,0.527933,0.095756,0.108176,0.059540,0.067112,0.604337,0.739817,0.596888


In [15]:
pre_x_scaled2.columns

Index(['price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'xy', 'cut2',
       'color2', 'clar2'],
      dtype='object')

In [16]:
feat = ['carat', 'bin', 'depth', 'table', 'clar2', 'cut2', 'color2']
# feat = ['price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'clar2', 'cut2', 'color2']

target = ['price']

In [17]:
X, y = pre_x_scaled2[feat], pre_x_scaled2[target]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
model_rmse = RandomForestRegressor(n_estimators= 512, max_depth= 16,  random_state = 42)


model_rmse.fit(X, y)

  model_rmse.fit(X, y)


In [None]:
%%time
# XGBoost regression

regressor=xgb.XGBRegressor(eval_metric='rmsle')
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(type(regressor), '\n')
print(rmse, '\n')

In [None]:
%%time

param_grid = {"max_depth":    [6, 8, 16],
              "n_estimators": [512, 700, 900],
              "learning_rate": [0.01, 0.015]}

search = GridSearchCV(regressor, param_grid, cv=5).fit(X_train, y_train)
print("The best hyperparameters are ",search.best_params_)

In [None]:
%%time

regressor=xgb.XGBRegressor(learning_rate = 0.01,
                           n_estimators  = 900,
                           max_depth     = 6,
                           eval_metric='rmsle')

regressor.fit(X_train, y_train)

prediction = regressor.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(type(regressor), '\n')
print(rmse, '\n')

In [None]:
%%time

#train with whole X

regressor=xgb.XGBRegressor(learning_rate = 0.01,
                           n_estimators  = 900,
                           max_depth     = 6,
                           eval_metric='rmsle')



regressor.fit(X, y)

In [None]:
## RandomForest regression
model_rmse = RandomForestRegressor(n_estimators= 512, max_depth= 16,  random_state = 42)

model_rmse.fit(X, y)

In [None]:
## lightgbm regression

params = {
    'task': 'train', 
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 10,
    'learnnig_rate': 0.05,
    'metric': {'l2','l1'},
    'verbose': -1
#     'task': 'train', 
#     'boosting': 'gbdt',
#     'objective': 'regression',
#     'num_leaves': 25,
#     'max_depth' :10,
#     'num_iterations': 500,
#     'min_data_in_leaf':25,
#     'metric': {'l2','l1'},
#     'verbose': -1

}


lgb_train = lgb.Dataset(X, y)

model_lightgbm = lgb.train(params,
                 train_set=lgb_train)

In [None]:
%%time

## lightgbm regression

params = {
    'task': 'train', 
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 15,
    'learnnig_rage': 0.05,
    'metric': {'l2','l1'},
    'verbose': -1
#         'task': 'train', 
#     'boosting': 'gbdt',
#     'objective': 'regression',
#     'num_leaves': 25,
#     'max_depth' :10,
#     'num_iterations': 500,
#     'min_data_in_leaf':25,
#     'metric': {'l2','l1'},
#     'verbose': -1
}


lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

model_lightgbm = lgb.train(params,
                 train_set=lgb_train,
                 valid_sets=lgb_eval,
                 early_stopping_rounds=30)


prediction = model_lightgbm.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(rmse, '\n')

In [None]:
%%time
# grid search  hyperparameter tuning
params = {
    'task': 'train', 
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 25,
    'max_depth' :10,
    'num_iterations': 500,
    'min_data_in_leaf':25,
    'metric': {'l2','l1'},
    'verbose': -1
}


lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

model_lightgbm = lgb.train(params,
                 train_set=lgb_train,
                 valid_sets=lgb_eval,
                 early_stopping_rounds=30)


prediction = model_lightgbm.predict(X_test)

rmse = mean_squared_error(y_test, prediction)**0.5

print(rmse, '\n')

## Preparing TEST DF

In [20]:
#preparing test df

to_predict = pre_x_test = pd.read_csv('../data/diamonds_test.csv')
to_predict.drop(columns=['id'], inplace=True)


#carat to bins
to_predict = carat_to_bins(to_predict, 'carat')

#scaling columns
feat_to_scale = ['carat', 'depth', 'table', 'x', 'y', 'z', 'bin']
others = ['cut', 'color', 'clarity', 'city']
to_predict_s = scaling_feat(to_predict, feat_to_scale, others)

In [21]:
#labeling cut, color, clarity

cut2 = [cut_lab_mm[cut_lab_mm['cut'] == c]['label'].iloc[0] for c in to_predict_s['cut']]
color2 = [col_lab_mm[col_lab_mm['color'] == c]['label'].iloc[0] for c in to_predict_s['color']]
clarity2 = [clar_lab_mm[clar_lab_mm['clarity'] == c]['label'].iloc[0] for c in to_predict_s['clarity']]

to_predict_s['cut2'] = cut2
to_predict_s['color2'] = color2
to_predict_s['clar2'] = clarity2

to_predict_s.drop(columns=['cut', 'color', 'clarity'], inplace=True)

In [22]:
#drop city column
to_predict_s.drop(columns=['city'], inplace=True)

In [23]:
to_predict_s.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'bin', 'cut2', 'color2',
       'clar2'],
      dtype='object')

In [24]:
to_predict_X = to_predict_s[feat]

In [26]:
#predictions
# final_prediction = regressor.predict(to_predict_X)

# #lightgbm predictions
# light_pred = model_lightgbm.predict(to_predict_X)

# #catboost predictions
# cat_pred = model_cat.predict(to_predict_X)

#RandonForest predictions
rforest_pred = model_rmse.predict(to_predict_X)

In [27]:
prices = pd.DataFrame(rforest_pred, columns=['Price'])
prices.reset_index(inplace=True)
prices.rename(columns={"index": "id"}, inplace=True)
prices = prices.set_index('id')
prices.to_csv('../data/prices_015.csv')

prices

Unnamed: 0_level_0,Price
id,Unnamed: 1_level_1
0,2955.979272
1,5377.005074
2,10408.099352
3,4168.651188
4,1656.621385
...,...
13480,1481.577051
13481,2882.972504
13482,3096.961191
13483,2127.313137


In [None]:
def classify_diamond_shape(x, y, z, table, depth):
    # Calculate ratios
    length_to_width_ratio = x / y
    length_to_depth_ratio = x / z

    # Define shape classification ranges
    shape_ranges = {
        'Round Brilliant': {
            'length_to_width_ratio': (0.95, 1.05),
            'length_to_depth_ratio': (1.35, 1.65),
            'table_percentage': (52, 62),
            'depth_percentage': (59, 62)
        },
        'Princess': {
            'length_to_width_ratio': (0.90, 1.10),
            'length_to_depth_ratio': (0.90, 1.10),
            'table_percentage': (60, 80),
            'depth_percentage': (58, 77)
        },
        'Emerald': {
            'length_to_width_ratio': (1.20, 1.60),
            'length_to_depth_ratio': (1.30, 1.60),
            'table_percentage': (60, 69),
            'depth_percentage': (60, 75)
        },
        'Asscher': {
            'length_to_width_ratio': (1.00, 1.05),
            'length_to_depth_ratio': (1.00, 1.05),
            'table_percentage': (60, 68),
            'depth_percentage': (60, 75)
        },
        'Radiant': {
            'length_to_width_ratio': (1.00, 1.30),
            'length_to_depth_ratio': (1.00, 1.30),
            'table_percentage': (60, 69),
            'depth_percentage': (60, 75)
        },
        'Pear': {
            'length_to_width_ratio': (1.40, 2.00),
            'length_to_depth_ratio': (1.30, 1.70),
            'table_percentage': (53, 63),
            'depth_percentage': (58, 66)
        },
        'Marquise': {
            'length_to_width_ratio': (1.60, 2.10),
            'length_to_depth_ratio': (1.30, 2.20),
            'table_percentage': (53, 63),
            'depth_percentage': (58, 66)
        },
        'Oval': {
            'length_to_width_ratio': (1.20, 1.70),
            'length_to_depth_ratio': (1.30, 1.70),
            'table_percentage': (53, 63),
            'depth_percentage': (58, 66)
        },
        'Heart': {
            'length_to_width_ratio': (0.90, 1.10),
            'length_to_depth_ratio': (0.90, 1.10),
            'table_percentage': (53, 63),
            'depth_percentage': (58, 66)
        },
        'Cushion': {
            'length_to_width_ratio': (1.00, 1.10),
            'length_to_depth_ratio': (1.00, 1.10),
            'table_percentage': (53, 63),
            'depth_percentage': (58, 66)
        }
    }

    # Classify the diamond shape
    for shape, ranges in shape_ranges.items():
        if (ranges['length_to_width_ratio'][0] <= length_to_width_ratio <= ranges['length_to_width_ratio'][1] and
                ranges['length_to_depth_ratio'][0] <= length_to_depth_ratio <= ranges['length_to_depth_ratio'][1] and
                ranges['table_percentage'][0] <= table <= ranges['table_percentage'][1] and
                ranges['depth_percentage'][0] <= depth <= ranges['depth_percentage'][1]):
            return shape

    return 'Unknown'  # If no shape match is found

# Example usage
x = 6.1
y = 6.1
z = 3.7
table = 58
depth = 61

diamond_shape = classify_diamond_shape(x, y, z, table, depth)
print("The diamond shape is:", diamond_shape)


In [None]:
len(pre_x_train)

In [None]:
shapes = []

for i in range(len(pre_x_train)):
    x = pre_x_train['x'].iloc[i]
    y = pre_x_train['y'].iloc[i]
    z = pre_x_train['z'].iloc[i]
    t = pre_x_train['table'].iloc[i]
    d = pre_x_train['depth'].iloc[i]

    shapes.append(classify_diamond_shape(x, y, z, t, d))
    
shapes

In [None]:
pre_x_train

In [None]:
xy = 6.83/6.79
xz = 6.83/4.25
# xy
xz