In [109]:
import pandas as pd

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [15]:
def carat_to_bins(df, carat):
    
    df.loc[df[carat].between(0, .5, 'left'), 'bin'] = 1.0
    df.loc[df[carat].between(.5, 1.0, 'left'), 'bin'] = 1.44
    df.loc[df[carat].between(1.0, 1.5, 'left'), 'bin'] = 2.3
    df.loc[df[carat].between(1.5, 2.0, 'left'), 'bin'] = 3.21
    df.loc[df[carat].between(2.0, 3.0, 'left'), 'bin'] = 4.12
    df.loc[df[carat].between(3.0, 4.0, 'left'), 'bin'] = 5.53
    df.loc[df[carat].between(4.0, 10.0, 'left'), 'bin'] = 8.39
    
    return df

In [16]:
#scale

def scaling_feat(df, to_scale, not_to_scale, scale=1):
    
    scalable = df[to_scale]
    not_scalable = df[not_to_scale]

    if scale == 2:
        scaler = StandardScaler()
    elif scale ==3:
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()

    scaled = pd.DataFrame(scaler.fit_transform(scalable), columns=to_scale)
    
    return not_scalable.join(scaled)

In [3]:
df = pd.read_csv('../data/diamonds_train.csv')
pre_x_test = pd.read_csv('../data/diamonds_test.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
pre_x_test.drop(columns=['id'], inplace=True)

In [12]:
zeros = df[df['z'] == 0].index

In [13]:
df.drop(labels=zeros, axis=0, inplace=True)

In [17]:
#columns bin for carat

df = carat_to_bins(df, 'carat')

In [37]:
def classify_diamond_shape(x, y, z):
    # Calculate ratios
    length_to_width_ratio = x / y
    length_to_depth_ratio = x / z

    # Define shape classification ranges
    shape_ranges = {
        'Round Brilliant': {
            'length_to_width_ratio': (0.95, 1.05),
            'length_to_depth_ratio': (1.35, 1.75)
        },
        'Princess': {
            'length_to_width_ratio': (0.90, 1.10),
            'length_to_depth_ratio': (0.90, 1.10)
        },
        'Emerald': {
            'length_to_width_ratio': (1.20, 1.60),
            'length_to_depth_ratio': (1.30, 1.60)
        },
        'Radiant': {
            'length_to_width_ratio': (1.00, 1.30),
            'length_to_depth_ratio': (1.00, 1.30)
        },
        'Pear': {
            'length_to_width_ratio': (1.40, 2.00),
            'length_to_depth_ratio': (1.30, 1.70)
        },
        'Marquise': {
            'length_to_width_ratio': (1.60, 2.10),
            'length_to_depth_ratio': (1.30, 2.20)
        },
        'Oval': {
            'length_to_width_ratio': (1.20, 1.70),
            'length_to_depth_ratio': (1.30, 1.70)
        }
    }

    # Classify the diamond shape
    for shape, ranges in shape_ranges.items():
        if (ranges['length_to_width_ratio'][0] <= length_to_width_ratio <= ranges['length_to_width_ratio'][1]
                and ranges['length_to_depth_ratio'][0] <= length_to_depth_ratio <= ranges['length_to_depth_ratio'][1]):
            return shape

    return 'Unknown'  # If no shape match is found

In [38]:
shapes = []

for i in range(len(df)):
    x = df['x'].iloc[i]
    y = df['y'].iloc[i]
    z = df['z'].iloc[i]

    shapes.append(classify_diamond_shape(x, y, z))
    
shapes

['Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Unknown',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Unknown',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brilliant',
 'Round Brillian

In [39]:
shapes_df = pd.DataFrame(shapes, columns=['shape'])

In [43]:
unk = shapes_df[shapes_df['shape'] == 'Unknown'].index

In [46]:
shapes_df['shape'].value_counts()

Round Brilliant    40223
Unknown              211
Radiant                3
Princess               1
Marquise               1
Name: shape, dtype: int64

In [45]:
df.iloc[unk].loc[df['xy']<1.050]

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price,bin,xy,xz
10,1.02,Premium,I,SI1,58.0,58.0,6.84,6.60,3.90,Tel Aviv,4284,2.30,1.036364,1.753846
41,1.00,Fair,H,SI1,55.2,64.0,6.69,6.64,3.68,New York City,3830,2.30,1.007530,1.817935
164,0.88,Fair,F,SI1,56.6,65.0,6.39,6.32,3.60,Madrid,2882,1.44,1.011076,1.775000
928,0.99,Fair,J,SI1,55.0,61.0,6.72,6.67,3.68,Dubai,2812,1.44,1.007496,1.826087
956,1.50,Good,G,I1,57.4,62.0,7.56,7.39,4.29,Kimberly,3179,3.21,1.023004,1.762238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39874,1.00,Premium,F,VS1,59.2,60.0,6.53,6.48,3.58,Dubai,7056,2.30,1.007716,1.824022
40145,0.38,Good,F,VS2,56.2,61.0,4.84,4.87,2.73,Tel Aviv,873,1.00,0.993840,1.772894
40202,0.39,Good,F,VVS2,57.7,61.0,4.91,4.80,2.80,Las Vegas,1058,1.00,1.022917,1.753571
40270,2.02,Good,F,SI2,57.1,60.0,8.31,8.25,4.73,Madrid,12615,4.12,1.007273,1.756871


In [25]:
df['xy'] = df['x'] / df['y']
df['xz'] = df['x'] / df['z']

In [26]:
df['xy'].max()

1.6155717761557176

In [27]:
df['xz'].max()

6.186915887850467

In [29]:
df[df['xz'] > 2]

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price,bin,xy,xz
7956,0.55,Very Good,E,VS2,61.0,59.0,5.31,5.34,2.25,Luxembourg,1831,1.44,0.994382,2.36
7967,1.07,Ideal,F,SI1,60.6,57.0,6.62,6.67,1.07,London,5909,2.3,0.992504,6.186916
12924,1.53,Ideal,I,SI1,61.9,54.0,7.43,7.5,1.53,Las Vegas,8971,3.21,0.990667,4.856209
16544,1.57,Fair,H,VS1,67.3,66.0,7.85,5.75,3.87,Antwerp,8133,3.21,1.365217,2.028424
20014,0.38,Ideal,E,VVS2,60.9,56.0,4.71,4.68,2.06,Luxembourg,1068,1.0,1.00641,2.286408
35619,0.34,Good,E,VS2,57.0,61.0,6.65,4.61,2.64,Zurich,816,1.0,1.442516,2.518939


In [63]:
xx = pre_x_test.groupby(['carat', 'cut', 'color', 'clarity']).sum().reset_index()[['carat', 'cut', 'color', 'clarity']]

  xx = pre_x_test.groupby(['carat', 'cut', 'color', 'clarity']).sum().reset_index()[['carat', 'cut', 'color', 'clarity']]


In [66]:
yy = df.groupby(['carat', 'cut', 'color', 'clarity']).sum().reset_index()[['carat', 'cut', 'color', 'clarity']]

  yy = df.groupby(['carat', 'cut', 'color', 'clarity']).sum().reset_index()[['carat', 'cut', 'color', 'clarity']]


In [73]:
xx['concat'] = str(xx['carat']) + xx['cut'] + xx['color'] + xx['clarity']

In [74]:
xx

Unnamed: 0,carat,cut,color,clarity,concat
0,0.20,Ideal,E,VS2,0 0.20\n1 0.20\n2 0.20\n3 ...
1,0.20,Premium,D,VS2,0 0.20\n1 0.20\n2 0.20\n3 ...
2,0.20,Premium,E,VS2,0 0.20\n1 0.20\n2 0.20\n3 ...
3,0.20,Premium,F,VS2,0 0.20\n1 0.20\n2 0.20\n3 ...
4,0.20,Very Good,E,VS2,0 0.20\n1 0.20\n2 0.20\n3 ...
...,...,...,...,...,...
6871,3.01,Premium,J,SI2,0 0.20\n1 0.20\n2 0.20\n3 ...
6872,3.04,Very Good,I,SI2,0 0.20\n1 0.20\n2 0.20\n3 ...
6873,3.65,Fair,H,I1,0 0.20\n1 0.20\n2 0.20\n3 ...
6874,3.67,Premium,I,I1,0 0.20\n1 0.20\n2 0.20\n3 ...


In [132]:
final = pd.read_csv('../data/prices_013.csv', sep=';')

In [133]:
final.reset_index(drop=True, inplace=True)

In [134]:
final = final.set_index('id')

In [135]:
final.to_csv('../data/prices_014.csv')

In [131]:
final

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,3165
1,5287
2,9585
3,4001
4,1657
...,...
13480,1716
13481,2290
13482,3109
13483,1997
