In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

In [2]:
# Load the training data and drop the 'Unnamed: 0' column before previewing the first rows.
df = pd.read_csv('../data/diamonds_train.csv').drop(columns=['Unnamed: 0'])
df.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city,price
0,1.21,Premium,J,VS2,62.4,58.0,6.83,6.79,4.25,Dubai,4268
1,0.32,Very Good,H,VS2,63.0,57.0,4.35,4.38,2.75,Kimberly,505
2,0.71,Fair,G,VS1,65.5,55.0,5.62,5.53,3.65,Las Vegas,2686
3,0.41,Good,D,SI1,63.8,56.0,4.68,4.72,3.0,Kimberly,738
4,1.02,Ideal,G,SI1,60.5,59.0,6.55,6.51,3.95,Dubai,4882


In [3]:
# Distinct values of each categorical column, in the order unique() returns them.
cuts, colors, clarities, cities = (
    list(df[column].unique()) for column in ('cut', 'color', 'clarity', 'city')
)
# Separate clarity list with the 'I1' grade removed (raises if 'I1' is absent).
clar_no_i1 = list(clarities)
clar_no_i1.remove('I1')

In [4]:
# The ten carat values that occur most often, together with their row counts.
top_10_carat = (
    df[['carat']]
    .value_counts()
    .head(10)
    .reset_index()
)

In [5]:
# value_counts() labels its count column 0 on pandas < 2.0 and 'count' on pandas >= 2.0;
# map both so the column is called 'sample' on either version (missing keys are ignored).
top_10_carat = top_10_carat.rename(columns={0: 'sample', 'count': 'sample'})

In [6]:
# Plain Python list of the most frequent carat values.
carats = top_10_carat['carat'].tolist()

In [7]:
# Working frame restricted to the columns used to build the label tables.
df_2 = df.loc[:, ['cut', 'color', 'clarity', 'price', 'city', 'carat']]

In [8]:
# Keep only the rows whose carat is one of the most frequent values.
df_2 = df_2.loc[df_2['carat'].isin(carats)]

### Function to create a scaled label for cut, color & clarity

In [10]:
def classificatoration(df, col1, col2, target, values, scale=1, carat_values=None):
    """Rank the categories of ``target`` by their average scaled price per carat.

    For every carat value and every (``col1``, ``col2``) pair found in ``df``,
    the matching rows are used only when the distinct ``target`` values they
    contain are exactly ``values``. For each such combination, ``price`` and
    ``carat`` are summed per ``target`` category, the ratio of the two sums is
    scaled with the selected scaler (re-fitted on that combination alone), and
    the scaled values are finally averaged per category over all combinations.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'carat'``, ``'price'``, ``col1``, ``col2`` and ``target``.
    col1, col2 : str
        Columns that, together with carat, define one combination.
    target : str
        Column whose categories receive a label.
    values : list
        Complete set of ``target`` categories a combination must contain to be
        used. The list is not modified.
    scale : int, default 1
        2 selects ``StandardScaler``, 3 selects ``RobustScaler``; any other
        value selects ``MinMaxScaler``.
    carat_values : iterable, optional
        Carat values to scan. Defaults to the distinct carat values of ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``target`` and ``'label'``, sorted by ``'label'`` descending.
        Empty (same two columns) when no combination contains every value.

    Also prints a one-line summary with the number of combinations used.
    """
    # Sort a copy: comparing sorted lists makes the check order-independent
    # without reordering the caller's list.
    expected = sorted(values)

    if scale == 2:
        scaler = StandardScaler()
    elif scale == 3:
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()

    if carat_values is None:
        carat_values = df['carat'].unique()

    pieces = []
    for car in carat_values:
        by_carat = df.loc[df['carat'] == car]
        for x in by_carat[col1].unique():
            by_col1 = by_carat.loc[by_carat[col1] == x]
            for y in by_col1[col2].unique():
                combo = by_col1.loc[by_col1[col2] == y]
                # Only combinations that contain every category are comparable.
                if sorted(combo[target].unique()) != expected:
                    continue

                agg_df = combo.groupby([col1, col2, target]).sum(numeric_only=True).reset_index()
                agg_df['pxc'] = agg_df['price'] / agg_df['carat']
                # The scaler is re-fitted per combination, so labels are
                # relative to the other categories of the same combination.
                agg_df['label'] = scaler.fit_transform(agg_df[['pxc']])
                pieces.append(agg_df[[target, 'label']])

    print(f'Results based in {len(pieces)} combinations. Scaler: {scaler}.')

    if not pieces:
        # Nothing to average; return the expected shape instead of failing.
        return pd.DataFrame(columns=[target, 'label'])

    # Concatenate once instead of growing a frame inside the loop.
    df_corr = pd.concat(pieces, axis=0)
    return df_corr.groupby(target).mean(numeric_only=True).sort_values('label', ascending=False).reset_index()
                

### Create & save labeling tables

In [11]:
# Clarity labels within each (cut, color) pair, once per scaler:
# scale 1 -> MinMax, 2 -> Standard, 3 -> Robust.
clar_lab_mm, clar_lab_std, clar_lab_rob = (
    classificatoration(df_2, 'cut', 'color', 'clarity', clarities, scale=mode)
    for mode in (1, 2, 3)
)

Results based in 14 combinations. Scaler: MinMaxScaler().
Results based in 14 combinations. Scaler: StandardScaler().
Results based in 14 combinations. Scaler: RobustScaler().


In [12]:
# Write the three clarity label tables, one CSV per scaler.
for label_table, csv_path in (
    (clar_lab_mm, '../data/scales/clar_lab_mm.csv'),
    (clar_lab_std, '../data/scales/clar_lab_std.csv'),
    (clar_lab_rob, '../data/scales/clar_lab_rob.csv'),
):
    label_table.to_csv(csv_path)

In [13]:
# Cut labels within each (color, clarity) pair, once per scaler:
# scale 1 -> MinMax, 2 -> Standard, 3 -> Robust.
cut_lab_mm, cut_lab_std, cut_lab_rob = (
    classificatoration(df_2, 'color', 'clarity', 'cut', cuts, scale=mode)
    for mode in (1, 2, 3)
)

Results based in 134 combinations. Scaler: MinMaxScaler().
Results based in 134 combinations. Scaler: StandardScaler().
Results based in 134 combinations. Scaler: RobustScaler().


In [14]:
# Write the three cut label tables, one CSV per scaler.
for label_table, csv_path in (
    (cut_lab_mm, '../data/scales/cut_lab_mm.csv'),
    (cut_lab_std, '../data/scales/cut_lab_std.csv'),
    (cut_lab_rob, '../data/scales/cut_lab_rob.csv'),
):
    label_table.to_csv(csv_path)

In [15]:
# Color labels within each (cut, clarity) pair, once per scaler:
# scale 1 -> MinMax, 2 -> Standard, 3 -> Robust.
col_lab_mm, col_lab_std, col_lab_rob = (
    classificatoration(df_2, 'cut', 'clarity', 'color', colors, scale=mode)
    for mode in (1, 2, 3)
)

Results based in 116 combinations. Scaler: MinMaxScaler().
Results based in 116 combinations. Scaler: StandardScaler().
Results based in 116 combinations. Scaler: RobustScaler().


In [16]:
# Write the three color label tables, one CSV per scaler.
for label_table, csv_path in (
    (col_lab_mm, '../data/scales/col_lab_mm.csv'),
    (col_lab_std, '../data/scales/col_lab_std.csv'),
    (col_lab_rob, '../data/scales/col_lab_rob.csv'),
):
    label_table.to_csv(csv_path)