In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS AND CONFIG

In [None]:
# USO BÁSICO
import pandas as pd
import numpy as np

#PREPROCESAMIENTO
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#DIVIDIR MODELO
from sklearn.model_selection import train_test_split

#MODELOS
from sklearn.linear_model import LassoCV,RidgeCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
import xgboost as xgb

#ANALISIS
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

#EXPORTMODEL
import pickle

# GENERATE DATASET

## Declaring Functions

In [None]:
def encoding_categorical_feats (df):
  """Replace the `color`, `clarity` and `cut` labels with integer ranks.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the string columns `color`, `clarity` and `cut`.

  Returns
  -------
  pandas.DataFrame
      A new frame with the three columns mapped through the tables below.
      Labels that are absent from a table become NaN.

  The input frame is left untouched. Encoding a copy avoids pandas'
  chained-assignment warning when `df` is a filtered slice, and it means that
  calling the function again on the same raw frame gives the same result.
  """
  color_categorization = {'J':1,'I':2,'H':3,'G':4,
                          'F':5,'E':6,'D':7}

  clarity_categorization = {'I1':1,'SI2':2,'SI1':3,'VS2':4,
                            'VS1':5,'VVS2':6,'VVS1':7,'IF':8}

  cut_categorization = {'Fair':1,'Good':2,'Very Good':3,
                        'Premium':4,'Ideal':5}

  # Never write into the caller's object.
  df = df.copy()

  df['color'] = df['color'].map(color_categorization)
  df['clarity'] = df['clarity'].map(clarity_categorization)
  df['cut'] = df['cut'].map(cut_categorization)

  return df


def errase_outliers (df):
  """Keep only the rows whose measurements are strictly below fixed upper bounds.

  The bounds are hard-coded. According to the author's notes they were read
  from `diamonds.describe(percentiles=[0.999])`, i.e. values above the 99.9th
  percentile are removed.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with the columns `carat`, `depth`, `table`, `x`, `y` and `z`.

  Returns
  -------
  pandas.DataFrame
      The surviving rows, with their original index labels.
  """
  upper_bounds = (
      ('carat', 2.68),
      ('depth', 68.800),
      ('table', 67),
      ('x', 8.9),
      ('y', 8.885460),
      ('z', 5.54),
  )

  kept = df
  for column, bound in upper_bounds:
    # Each pass narrows the rows that survived the previous bound.
    kept = kept[kept[column] < bound]

  return kept

## CREATING AND CLEANING DATA SET

In [None]:
# Load the training CSV of the diamonds competition from the mounted Drive.
diamonds = pd.read_csv(
    '/content/drive/My Drive/00_IRONHACK/diamonds_comp/data/diamonds_train.csv'
)

In [None]:
# Report the raw frame, clean it, then report what changed.
# NOTE: this cell overwrites `diamonds`. Running it twice without reloading the
# CSV maps the already-encoded integers again, which turns them into NaN.
initial_rows = len(diamonds)
print(f'Initial df: {diamonds.shape}')
display(diamonds.head())

# Outliers are removed first, then the categorical columns are encoded.
diamonds = encoding_categorical_feats(errase_outliers(diamonds))

print(f'\n\nPost cleaning df: {diamonds.shape}')
display(diamonds.head())
print(f'{initial_rows - diamonds.shape[0]} outliers deleted')

Initial df: (40455, 10)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95




Post cleaning df: (40291, 10)


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,4,1,4,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,3,3,4,63.0,57.0,505,4.35,4.38,2.75
2,0.71,1,4,5,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,2,7,3,63.8,56.0,738,4.68,4.72,3.0
4,1.02,5,4,3,60.5,59.0,4882,6.55,6.51,3.95


164 outliers deleted


In [None]:
# Predictor columns handed to the model, and the column to be predicted.
working_features = [
    'carat',
    'cut', 'color', 'clarity',
    'depth', 'table',
    'x', 'y', 'z',
]
target = 'price'

In [None]:
# Hold out 20% of the cleaned rows. The fixed seed makes the split, and every
# metric computed from it, reproducible after a kernel restart.
diamonds_train, diamonds_test = train_test_split(diamonds, test_size=0.2, random_state=42)
print(f'TRAIN: {diamonds_train.shape}')
print(f'TEST: {diamonds_test.shape}')

TRAIN(32232, 10)
TEST: (8059, 10)


## PREPROCESSOR

In [None]:
class CustomNumericalTransformer (BaseEstimator,TransformerMixin):
  """Pipeline step that swaps the x/y/z measurements for a carat-per-volume ratio.

  `transform` expects a pandas DataFrame holding at least the columns
  `carat`, `x`, `y` and `z`, and returns a NumPy array with:

  * every input column except `x`, `y` and `z`, in their original order;
  * a trailing `carat_size` column equal to `carat / (x * y * z)`.

  Infinite values are converted to NaN so that a later imputer can fill them
  (for example when `x * y * z` is 0).

  The step is stateless: `fit` learns nothing.
  """

  def __init__(self):
      # No hyper-parameters. The double-underscore name is what makes Python
      # (and scikit-learn's parameter introspection) treat this as the constructor.
      pass

  def fit (self, X, y = None):
      """Nothing to learn; returns `self` so the step can be chained."""
      return self

  def transform (self, X, y = None):
      """Build the engineered features and return them as a NumPy array."""
      # Work on a copy so the caller's frame never gains or loses columns.
      X = X.copy()

      # Product of the three measurements. Kept in a local variable because
      # it is only an intermediate value and is not part of the output.
      dimensions = X['x'] * X['y'] * X['z']

      # carat per unit of x*y*z
      X['carat_size'] = X['carat'] / dimensions

      # Drop the redundant columns. `drop` returns a new frame, so the result
      # has to be assigned for the columns to actually disappear.
      X = X.drop(columns=['x', 'y', 'z'])

      # Convert any infinite value in the dataset to NaN
      X = X.replace([np.inf, -np.inf], np.nan)

      return X.values



# Numeric branch: engineered features -> median imputation -> standardisation.
numeric_feats_pipeline = Pipeline([
    ('custom_changes', CustomNumericalTransformer()),
    ('imputer', SimpleImputer(strategy='median')),
    ('standar_scaler', StandardScaler()),
])

# Every working feature is routed through the numeric branch.
preprocesor = ColumnTransformer([
    ('numeric_pipeline', numeric_feats_pipeline, working_features),
])

# TRAIN / TEST / EVAL SPLIT

In [None]:
# Three-way split: 20% of the rows go to test, then 10% of the remainder to eval.
# Both splits are seeded so the partitions, and the metrics computed on them,
# are the same on every run.
diamonds_train, diamonds_test = train_test_split(diamonds, test_size=0.2, random_state=42)
diamonds_train, diamonds_eval = train_test_split(diamonds_train, test_size=0.1, random_state=42)

print(f'TRAIN: {diamonds_train.shape}')
print(f'TEST: {diamonds_test.shape}')
print(f'EVAL: {diamonds_eval.shape}')


# MODEL DEFINITION

In [None]:
# Full modelling pipeline: preprocessing followed by a gradient-boosted regressor.
ml_model = Pipeline([
    ('preprocesor', preprocesor),
    ('selected_model', GradientBoostingRegressor(max_depth=8, n_estimators=2000)),
])

In [None]:
# Fit the whole pipeline on the training partition only.
ml_model.fit(
    X=diamonds_train[working_features],
    y=diamonds_train[target],
)


In [None]:
# Error on the training partition
y_train_pred = ml_model.predict(diamonds_train[working_features])
y_train_real = diamonds_train[target]
# RMSE is taken as sqrt(MSE). The `squared=False` flag is deprecated and has
# been removed from recent scikit-learn releases; this form works on all of them.
np.sqrt(mean_squared_error(y_true=y_train_real, y_pred=y_train_pred))

359.9290763276678

In [None]:
# Error on the held-out test partition
y_test_pred = ml_model.predict(diamonds_test[working_features])
y_test_real = diamonds_test[target]
# RMSE is taken as sqrt(MSE). The `squared=False` flag is deprecated and has
# been removed from recent scikit-learn releases; this form works on all of them.
np.sqrt(mean_squared_error(y_true=y_test_real, y_pred=y_test_pred))

577.7541419597225

In [None]:
# 5-fold cross-validation of the full pipeline on the whole cleaned dataset,
# using all available cores. One negated RMSE is returned per fold.
scores = cross_val_score(
    estimator=ml_model,
    X=diamonds[working_features],
    y=diamonds[target],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [None]:
# Average of the per-fold scores. The scorer used above is
# 'neg_root_mean_squared_error', so the printed value is a negated RMSE.
print(f'model score: {np.mean(scores)}')

_________________________________________

In [None]:
# Refit the pipeline on every cleaned row (no held-out data).
ml_model.fit(
    X=diamonds[working_features],
    y=diamonds[target],
)

In [None]:
# Search space for the hyper-parameter search over `ml_model`.
# Keys are `<step>__<param>` paths and must use the step names declared in the
# pipelines: 'preprocesor' -> 'numeric_pipeline' -> 'imputer', and
# 'selected_model' for the regressor. Every value is a list of candidates.
param_to_optimize = {
    'preprocesor__numeric_pipeline__imputer__strategy': ['mean', 'median'],
    'selected_model__n_estimators': [2000],
    'selected_model__max_depth': [8, 16],
    'selected_model__random_state': [2, 10, 42],
    # None means "use all features". That is what 'auto' meant for this
    # regressor, and 'auto' is no longer accepted by recent scikit-learn.
    'selected_model__max_features': [None],
    'selected_model__min_samples_split': [5, 10, 20],
    'selected_model__min_samples_leaf': [15, 30],
    }

In [None]:
# Randomised hyper-parameter search: 32 sampled candidates, each scored with
# 5-fold cross-validation on negated RMSE, run on all available cores.
grid_search = RandomizedSearchCV(
    estimator=ml_model,
    param_distributions=param_to_optimize,
    n_iter=32,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
)