In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm

In [8]:
# Load the raw diamonds table; later cells read from `df` and reassign it.
df = pd.read_csv(filepath_or_buffer='diamonds.csv')

In [9]:
# Preview the first five rows to check the columns that were read.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
1,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
2,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
3,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
4,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48


# Build and test the default (baseline) model

In [10]:
#create log columns
def create_log_columns(df,param):
    """Return a copy of ``df`` with ordinal-encoded, log-transformed columns added.

    The categorical columns ``clarity``, ``color`` and ``cut`` are replaced by
    the natural log of their 1-based rank on a fixed scale, and the numeric
    columns ``carat``, ``depth`` and ``table`` by their natural log. The new
    columns are ``clarity_log``, ``color_log``, ``cut_log``, ``carat_log``,
    ``depth_log``, ``table_log`` and, when requested, ``price_log``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``clarity``, ``color``, ``cut``, ``carat``, ``depth`` and
        ``table`` (and ``price`` when ``param == 1``). It is not modified.
    param : int
        ``1`` to also add ``price_log = log(price)``; any other value skips it
        (used for frames that have no ``price`` column).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the ``*_log`` columns.

    Raises
    ------
    ValueError
        If a categorical column holds a value that is not on its scale
        (missing values included).
    """
    clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
    # Fixed scale, equal to sorted(D..J, reverse=True). Deriving the order from
    # the colours present in `df` would give the same colour a different rank
    # in a frame that happens to lack some colour, so training and prediction
    # frames could be encoded inconsistently.
    color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
    cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()

    def log_rank(column, order):
        """Return log(1-based position in ``order``) for every value of ``out[column]``."""
        rank = {level: position for position, level in enumerate(order, start=1)}
        unknown = [value for value in out[column].unique() if value not in rank]
        if unknown:
            raise ValueError(f"unexpected {column} value(s): {unknown!r}")
        return np.log(out[column].map(rank))

    out['clarity_log'] = log_rank('clarity', clarity_order)
    out['color_log'] = log_rank('color', color_order)
    out['cut_log'] = log_rank('cut', cut_order)
    if param == 1:
        out['price_log'] = np.log(out['price'])
    out['carat_log'] = np.log(out['carat'])
    out['depth_log'] = np.log(out['depth'])
    out['table_log'] = np.log(out['table'])
    return out

In [11]:
# Turn the categorical columns into ordinal ranks ordered by their importance
# for the price, and add the log-transformed columns (including the target).
df = create_log_columns(df, 1)
X = df[['carat_log','cut_log','color_log','clarity_log','depth_log','table_log']]
y = df['price_log']
# fit() returns the estimator itself, so this trains and binds it in one step.
modelo_default = LinearRegression().fit(X, y)
modelo_default

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
def test_default(df_test):
    X= df_test[['carat_log','cut_log','color_log','clarity_log','depth_log','table_log']]
    y_predict=modelo_default.predict(X)
    y_predict=np.exp(y_predict)
    y_real = df_test['price']
    return np.sqrt(metrics.mean_squared_error(y_real,y_predict))

In [13]:
# RMSE of the default model on the data it was fitted on (price units).
print(test_default(df_test=df))

812.4096125576648


# Build one model per category combination (cut, color, clarity)

In [14]:
# Distinct levels of each categorical column, in order of first appearance.
cuts_lst, colors_lst, clars_lst = (df[column].unique() for column in ('cut', 'color', 'clarity'))

# Every [cut, color, clarity] triple; cut varies fastest, clarity slowest.
# The triples stay lists because do_predict looks them up with list.index.
combo_lst = [
    [cut, color, clar]
    for clar in clars_lst
    for color in colors_lst
    for cut in cuts_lst
]

# One sub-frame per triple, aligned position by position with combo_lst.
# A triple that never occurs in the data yields an empty frame.
dfs = list(map(
    lambda x: df.query(f'(cut == "{x[0]}") & (color =="{x[1]}") & (clarity == "{x[2]}")'),
    combo_lst,
))

In [15]:
def ger_model(df):
    """Fit a linear price model on ``df`` and keep it only if it beats the default.

    A ``LinearRegression`` is fitted on the raw ``carat``, ``depth``,
    ``table``, ``x``, ``y`` and ``z`` columns against ``price``. Its RMSE on
    ``df`` is compared with the default model's RMSE on the same rows (via
    ``test_default``), and the fitted model is returned only when it is
    strictly better.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to fit on (in this notebook, one cut/color/clarity subset). It
        needs the columns listed above plus the ``*_log`` columns read by
        ``test_default``.

    Returns
    -------
    LinearRegression or None
        The fitted model, or ``None`` when the subset is empty, the fit or
        scoring fails, or the default model is at least as good.
    """
    try:
        if df.empty:
            # A combination that never occurs in the data has nothing to fit.
            return None
        y = df['price']
        X = df[['carat','depth','table','x','y','z']]
        modelo = LinearRegression()
        modelo.fit(X, y)
        # NOTE(review): both errors are measured on the rows used for fitting,
        # so a subset with very few rows can look better than it generalizes.
        result_model = np.sqrt(metrics.mean_squared_error(y, modelo.predict(X)))
        result_default = test_default(df)
        return modelo if result_model < result_default else None
    except Exception:
        # Best effort: any failure means "no dedicated model for this subset".
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long run can still be interrupted.
        return None

In [16]:
# One entry per sub-frame in dfs (so aligned with combo_lst): the dedicated
# model, or None when the default model should be used for that combination.
modelos = list(map(ger_model, dfs))

In [17]:
def do_predict(row):
    """Predict the price of one diamond, preferring its combination's model.

    The row's ``[cut, color, clarity]`` triple is looked up in ``combo_lst``.
    If ``modelos`` holds a model at that position, it predicts from the raw
    ``carat``, ``depth``, ``table``, ``x``, ``y`` and ``z`` values. Otherwise
    (no dedicated model, unknown combination, or any error on that path) the
    default log-log model is used and its output is exponentiated back to
    price units.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the six ``*_log`` columns used by the fallback; the raw
        columns are only needed for the dedicated model.

    Returns
    -------
    array-like of length 1
        The predicted price, as returned by the model's ``predict`` for a
        single sample; callers take ``[0]`` to get the scalar.
    """
    try:
        combo = [row['cut'], row['color'], row['clarity']]
        model = modelos[combo_lst.index(combo)]
        if model is not None:
            X = [[row['carat'], row['depth'], row['table'], row['x'], row['y'], row['z']]]
            return model.predict(X)
    except Exception:
        # Deliberate best effort: an unknown combination (ValueError from
        # list.index) or any failure of the dedicated model falls back to the
        # default model below. Exception (not a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        pass
    X = [[row['carat_log'], row['cut_log'], row['color_log'],
          row['clarity_log'], row['depth_log'], row['table_log']]]
    return np.exp(modelo_default.predict(X))

In [18]:
# do_predict returns a one-element array per row; take the scalar so that
# y_predict is a numeric Series (as the export cell does) instead of an
# object Series of arrays that the metrics would have to coerce.
y_predict = df.apply(lambda row: do_predict(row)[0], axis=1)
y_real = df['price']
# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float (which has no .round method).
round(metrics.r2_score(y_real, y_predict), 3)

0.977

In [19]:
# RMSE of the combined predictions against the true prices, in price units.
np.sqrt(metrics.mean_squared_error(y_true=y_real, y_pred=y_predict))

599.5966571289708

In [102]:
rick_diamonds = pd.read_csv('rick_diamonds.csv')
# The log/rank helper columns go on a copy so they never reach the exported
# file; param=0 skips price_log.
rick_copy = create_log_columns(rick_diamonds.copy(), 0)
# do_predict returns a one-element array per row; [0] extracts the scalar price.
y_predict = rick_copy.apply(lambda row: do_predict(row)[0], axis=1)
rick_diamonds['price_predicted'] = y_predict
rick_diamonds.to_csv('./Deliverables/rick_diamonds_mean.csv')