In [14]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px

In [15]:
from sklearn import neighbors, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# Show every column when a DataFrame is displayed (None removes the limit).
pd.set_option('display.max_columns', None)

Load the cleaned dataset


In [17]:
# Load the cleaned policy data; the path is relative to the notebook's working directory.
df = pd.read_csv('DB_Valor_Prima_Anual_Cleaned.csv')

Cleaned dataset

In [18]:
df.head(3)

Unnamed: 0,Tipo_poliza,Valor_Asegurado,Fecha_Emisión,Fecha_Inicio,Fecha_fin,Valor_prima_Anual,Valor_asegurado_Vehiculo,Ciudad,DEPARTAMENTO,OCUPACION,EDAD,MARCA,REF1,REF2,REF3,CLASE,Modelo_del_Vehiculo,COLOR,GENERO
0,Renovacion,684689000,2011-10-10,2011-11-21,2012-11-21,782949.0,18800000,CARTAGENA,BOLIVAR,EMPLEADO(A),45.0,CHEVROLET,AVEO,FAMILY,MT 1500CC 4P AA,AUTOMOVIL,2010.0,NEGRO EBONY,MASCULINO
1,Nuevo,965205600,2011-10-10,2011-10-05,2012-10-05,715824.0,24900000,BOGOTA D.C.,CUNDINAMARCA,EMPLEADO(A),41.0,CHEVROLET,AVEO EMOTION,1.6L,MT 1600CC AA 2AB ABS,AUTOMOVIL,2009.0,BLANCO ARCO BICAPA,MASCULINO
2,Nuevo,640605600,2011-10-10,2011-10-03,2012-10-03,740816.0,12600000,MEDELLIN,ANTIOQUIA,INDEPENDIENTE,33.0,RENAULT,CLIO II,F.II EXPRESSION,MT 1400CC PACK,AUTOMOVIL,2002.0,BEIGE CARRARA,FEMENINO


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120822 entries, 0 to 120821
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Tipo_poliza               120822 non-null  object 
 1   Valor_Asegurado           120822 non-null  int64  
 2   Fecha_Emisión             120822 non-null  object 
 3   Fecha_Inicio              120822 non-null  object 
 4   Fecha_fin                 120822 non-null  object 
 5   Valor_prima_Anual         120822 non-null  float64
 6   Valor_asegurado_Vehiculo  120822 non-null  int64  
 7   Ciudad                    120822 non-null  object 
 8   DEPARTAMENTO              120822 non-null  object 
 9   OCUPACION                 120822 non-null  object 
 10  EDAD                      120822 non-null  float64
 11  MARCA                     120822 non-null  object 
 12  REF1                      120822 non-null  object 
 13  REF2                      120822 non-null  o

## 1st FEATURE ENGINEERING - CREATE NEW DATA COLUMNS

I need to create new features that improve the modeling, so I decided to extract the year and month from the date columns.

In [20]:
# Add a year column (Y_<name>) and a month column (M_<name>) for each policy
# date column. Each column is parsed once and both parts are read from the
# same DatetimeIndex.
for date_col in ('Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'):
    parsed_dates = pd.DatetimeIndex(df[date_col])
    df['Y_' + date_col] = parsed_dates.year
    df['M_' + date_col] = parsed_dates.month

In [21]:
df.head(3)

Unnamed: 0,Tipo_poliza,Valor_Asegurado,Fecha_Emisión,Fecha_Inicio,Fecha_fin,Valor_prima_Anual,Valor_asegurado_Vehiculo,Ciudad,DEPARTAMENTO,OCUPACION,EDAD,MARCA,REF1,REF2,REF3,CLASE,Modelo_del_Vehiculo,COLOR,GENERO,Y_Fecha_Emisión,M_Fecha_Emisión,Y_Fecha_Inicio,M_Fecha_Inicio,Y_Fecha_fin,M_Fecha_fin
0,Renovacion,684689000,2011-10-10,2011-11-21,2012-11-21,782949.0,18800000,CARTAGENA,BOLIVAR,EMPLEADO(A),45.0,CHEVROLET,AVEO,FAMILY,MT 1500CC 4P AA,AUTOMOVIL,2010.0,NEGRO EBONY,MASCULINO,2011,10,2011,11,2012,11
1,Nuevo,965205600,2011-10-10,2011-10-05,2012-10-05,715824.0,24900000,BOGOTA D.C.,CUNDINAMARCA,EMPLEADO(A),41.0,CHEVROLET,AVEO EMOTION,1.6L,MT 1600CC AA 2AB ABS,AUTOMOVIL,2009.0,BLANCO ARCO BICAPA,MASCULINO,2011,10,2011,10,2012,10
2,Nuevo,640605600,2011-10-10,2011-10-03,2012-10-03,740816.0,12600000,MEDELLIN,ANTIOQUIA,INDEPENDIENTE,33.0,RENAULT,CLIO II,F.II EXPRESSION,MT 1400CC PACK,AUTOMOVIL,2002.0,BEIGE CARRARA,FEMENINO,2011,10,2011,10,2012,10


# 1. TRAIN TEST SPLIT

Target variable = Valor_prima_Anual

In [None]:
# Feature matrix: everything except the target, the REF1-REF3 reference
# columns, MARCA and the raw date strings (their year/month parts were
# extracted above).
X = df.drop(columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3', 'MARCA',
                     'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'])
Y = df['Valor_prima_Anual']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
X_train.head(2)

# PIPELINE TO TRANSFORM VARIABLES

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Numeric columns: median imputation, degree-2 polynomial expansion, then
# standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler()),
])

# Categorical columns: missing values become the literal category 'missing',
# then one-hot encoding; categories unseen during fit are ignored at predict
# time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Select every numeric dtype, not only int64/float64. With pandas >= 2.0 the
# year/month columns derived from DatetimeIndex are int32; restricting the
# selection to int64/float64 would leave them in neither list, and
# ColumnTransformer (remainder='drop' by default) would silently discard them.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Route each group of columns through its own transformer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# 2. MODEL CREATION

## RUN #1

X = df.drop(['Valor_prima_Anual', 'REF1', 'REF2', 'REF3', 'MARCA', 'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'], axis=1) <br>
Y = df['Valor_prima_Anual']

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Only these two regressors are evaluated. The other imported candidates
# (KNN, SVR, decision tree, random forest, AdaBoost, gradient boosting,
# XGBoost) are left out; the notebook's conclusions cite their training time.
regressors = [LinearRegression(), LGBMRegressor()]

# Label/metric pairs, printed in this order for every regressor.
metric_reports = (
    ("model MAE : ", mean_absolute_error),
    ("model MSE : ", mean_squared_error),
    ("model R2  : ", r2_score),
)

for regressor in regressors:
    # Preprocessing and the estimator are fitted together on the training
    # split and scored on the held-out split.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(regressor)
    for label, metric in metric_reports:
        print(label, metric(y_test, y_pred))

## RUN #2

## 2nd FEATURE ENGINEERING - CREATE NEW DATA COLUMNS

In [22]:
# Keep the first two characters of REF3 as a new column (e.g. 'MT' / 'AT' in
# the output below), then display it.
df['MT_AT_TP_REF3'] = df['REF3'].astype(str).str.slice(0, 2)
df['MT_AT_TP_REF3']

0         MT
1         MT
2         MT
3         MT
4         MT
          ..
120817    MT
120818    MT
120819    AT
120820    MT
120821    MT
Name: MT_AT_TP_REF3, Length: 120822, dtype: object

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120822 entries, 0 to 120821
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Tipo_poliza               120822 non-null  object 
 1   Valor_Asegurado           120822 non-null  int64  
 2   Fecha_Emisión             120822 non-null  object 
 3   Fecha_Inicio              120822 non-null  object 
 4   Fecha_fin                 120822 non-null  object 
 5   Valor_prima_Anual         120822 non-null  float64
 6   Valor_asegurado_Vehiculo  120822 non-null  int64  
 7   Ciudad                    120822 non-null  object 
 8   DEPARTAMENTO              120822 non-null  object 
 9   OCUPACION                 120822 non-null  object 
 10  EDAD                      120822 non-null  float64
 11  MARCA                     120822 non-null  object 
 12  REF1                      120822 non-null  object 
 13  REF2                      120822 non-null  o

In [24]:
# Rebuild the feature matrix so it includes the new MT_AT_TP_REF3 column; the
# dropped columns and the split settings are the same as in RUN #1.
excluded_columns = ['Valor_prima_Anual', 'REF1', 'REF2', 'REF3', 'MARCA',
                    'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin']
X = df.drop(columns=excluded_columns)
Y = df['Valor_prima_Anual']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
X_train.head(2)

In [None]:
from sklearn.compose import ColumnTransformer

# Select every numeric dtype, not only int64/float64. With pandas >= 2.0 the
# year/month columns derived from DatetimeIndex are int32; restricting the
# selection to int64/float64 would leave them in neither list, and
# ColumnTransformer (remainder='drop' by default) would silently discard them.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Rebuilt for this run because X_train now has an extra categorical column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
numeric_features

In [None]:
categorical_features

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same two candidates as RUN #1; the remaining imported regressors stay
# disabled.
regressors = [LinearRegression(), LGBMRegressor()]

# Label/metric pairs, printed in this order for every regressor.
metric_reports = (
    ("model MAE : ", mean_absolute_error),
    ("model MSE : ", mean_squared_error),
    ("model R2  : ", r2_score),
)

for regressor in regressors:
    # Fit preprocessing + estimator on the training split, score on the test split.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(regressor)
    for label, metric in metric_reports:
        print(label, metric(y_test, y_pred))

## RUN #3

## 3rd FEATURE ENGINEERING - CREATE NEW DATA COLUMNS

In [25]:
# First run of digits found in REF3, stored as an integer column
# (e.g. 'MT 1500CC 4P AA' -> 1500; presumably the engine displacement in cc).
# - The pattern is a raw string: '\d' in a plain literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - expand=False returns a Series instead of a one-column DataFrame.
# - astype('int64') raises if any REF3 value contains no digits, which
#   surfaces unexpected rows instead of hiding them as NaN.
df['CC_REF3'] = df['REF3'].str.extract(r'(\d+)', expand=False).astype('int64')

In [None]:
# Rebuild the feature matrix so it includes CC_REF3; the dropped columns and
# the split settings are unchanged from the previous runs.
Y = df['Valor_prima_Anual']
X = df.drop(
    columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3', 'MARCA',
             'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin']
)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
X_train.head(2)

In [None]:
from sklearn.compose import ColumnTransformer

# Select every numeric dtype, not only int64/float64. With pandas >= 2.0 the
# year/month columns derived from DatetimeIndex are int32; restricting the
# selection to int64/float64 would leave them in neither list, and
# ColumnTransformer (remainder='drop' by default) would silently discard them.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Rebuilt for this run because X_train now has an extra numeric column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same two candidates as the earlier runs; the remaining imported regressors
# stay disabled.
regressors = [LinearRegression(), LGBMRegressor()]

# Label/metric pairs, printed in this order for every regressor.
metric_reports = (
    ("model MAE : ", mean_absolute_error),
    ("model MSE : ", mean_squared_error),
    ("model R2  : ", r2_score),
)

for regressor in regressors:
    # Fit preprocessing + estimator on the training split, score on the test split.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(regressor)
    for label, metric in metric_reports:
        print(label, metric(y_test, y_pred))

## RUN #4

## 4th FEATURE ENGINEERING - KEEPING THE "MARCA" COLUMN AS A FEATURE

In [26]:
df.head(2)

Unnamed: 0,Tipo_poliza,Valor_Asegurado,Fecha_Emisión,Fecha_Inicio,Fecha_fin,Valor_prima_Anual,Valor_asegurado_Vehiculo,Ciudad,DEPARTAMENTO,OCUPACION,EDAD,MARCA,REF1,REF2,REF3,CLASE,Modelo_del_Vehiculo,COLOR,GENERO,Y_Fecha_Emisión,M_Fecha_Emisión,Y_Fecha_Inicio,M_Fecha_Inicio,Y_Fecha_fin,M_Fecha_fin,MT_AT_TP_REF3,CC_REF3
0,Renovacion,684689000,2011-10-10,2011-11-21,2012-11-21,782949.0,18800000,CARTAGENA,BOLIVAR,EMPLEADO(A),45.0,CHEVROLET,AVEO,FAMILY,MT 1500CC 4P AA,AUTOMOVIL,2010.0,NEGRO EBONY,MASCULINO,2011,10,2011,11,2012,11,MT,1500
1,Nuevo,965205600,2011-10-10,2011-10-05,2012-10-05,715824.0,24900000,BOGOTA D.C.,CUNDINAMARCA,EMPLEADO(A),41.0,CHEVROLET,AVEO EMOTION,1.6L,MT 1600CC AA 2AB ABS,AUTOMOVIL,2009.0,BLANCO ARCO BICAPA,MASCULINO,2011,10,2011,10,2012,10,MT,1600


In [None]:
# Unlike the earlier runs, MARCA is no longer dropped, so it becomes a
# (categorical) feature. Split settings are unchanged.
X = df.drop(columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3',
                     'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'])
Y = df['Valor_prima_Anual']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
X_train.head(2)

In [None]:
from sklearn.compose import ColumnTransformer

# Select every numeric dtype, not only int64/float64. With pandas >= 2.0 the
# year/month columns derived from DatetimeIndex are int32; restricting the
# selection to int64/float64 would leave them in neither list, and
# ColumnTransformer (remainder='drop' by default) would silently discard them.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Rebuilt for this run because X_train now also contains MARCA.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same two candidates as the earlier runs; the remaining imported regressors
# stay disabled.
regressors = [LinearRegression(), LGBMRegressor()]

# Label/metric pairs, printed in this order for every regressor.
metric_reports = (
    ("model MAE : ", mean_absolute_error),
    ("model MSE : ", mean_squared_error),
    ("model R2  : ", r2_score),
)

for regressor in regressors:
    # Fit preprocessing + estimator on the training split, score on the test split.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(regressor)
    for label, metric in metric_reports:
        print(label, metric(y_test, y_pred))

# CONCLUSION OF MODELING AND TESTING

1. The first step in improving a prediction model is good feature engineering; only after that is hyperparameter tuning worth trying.
2. With the other models, training took very long (about 2 hours) because of the number of unique values in certain columns.
3. Even after applying some transformations to those columns, training was still too slow, so I decided to work only with LinearRegression (LR) and LGBMRegressor (LGBMR).
4. Feature engineering on columns such as REF, the date columns (FECHAS) and others increased R2 from 0.8 to 0.9.

# BEST MODEL = LGBMRegressor
model MAE :  92779.1713959528 <br>
model MSE :  24875340462.45637 <br>
model R2  :  0.9004703342263928

In [None]:
LGBMRegressor()

# BONUS = Let's add a new feature, "ANTIGUEDAD", and see how it behaves

# RUN #5

In [27]:
# Load the variant of the cleaned dataset whose file name indicates an added
# "antiguedad" column; the path is relative to the notebook's working directory.
df2 = pd.read_csv('DB_Valor_Prima_Anual_Cleaned_antiguedadCol.csv')

In [28]:
# Same year/month extraction as for `df`, applied to `df2`: each date column
# is parsed once and contributes a Y_<name> and an M_<name> column.
for source_col in ('Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'):
    stamps = pd.DatetimeIndex(df2[source_col])
    for prefix, date_part in (('Y', stamps.year), ('M', stamps.month)):
        df2[f'{prefix}_{source_col}'] = date_part

In [29]:
# First two characters of REF3, as done for `df` in RUN #2.
df2['MT_AT_TP_REF3'] = df2['REF3'].astype(str).str.slice(stop=2)

In [30]:
# First run of digits found in REF3, stored as an integer column (same
# derivation as CC_REF3 on `df`).
# - The pattern is a raw string: '\d' in a plain literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - expand=False returns a Series instead of a one-column DataFrame.
# - astype('int64') raises if any REF3 value contains no digits, which
#   surfaces unexpected rows instead of hiding them as NaN.
df2['CC_REF3'] = df2['REF3'].str.extract(r'(\d+)', expand=False).astype('int64')

In [31]:
# RUN #5 uses `df2`; the dropped columns match RUN #4 (MARCA is kept) and the
# split settings are unchanged.
Y = df2['Valor_prima_Anual']
X = df2.drop(columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3',
                      'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'])

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
X_train.head(3)

In [None]:
from sklearn.compose import ColumnTransformer

# Select every numeric dtype, not only int64/float64. With pandas >= 2.0 the
# year/month columns derived from DatetimeIndex are int32; restricting the
# selection to int64/float64 would leave them in neither list, and
# ColumnTransformer (remainder='drop' by default) would silently discard them.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Rebuilt for this run because X_train now comes from `df2`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Same two candidates as the earlier runs; the remaining imported regressors
# stay disabled.
regressors = [LinearRegression(), LGBMRegressor()]

# Label/metric pairs, printed in this order for every regressor.
metric_reports = (
    ("model MAE : ", mean_absolute_error),
    ("model MSE : ", mean_squared_error),
    ("model R2  : ", r2_score),
)

for regressor in regressors:
    # Fit preprocessing + estimator on the training split, score on the test split.
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(regressor)
    for label, metric in metric_reports:
        print(label, metric(y_test, y_pred))

The score increases, but only slightly.

# FEATURE IMPORTANCE

## Feature importance shows which variables matter most to the model, i.e. the ones we should pay the most attention to.

In [None]:
import lightgbm

In [None]:

# Features and target for the native LightGBM run (same dropped columns as
# RUN #4). `target` is kept as a one-column DataFrame.
feature = df.drop(columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3',
                           'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'])
target = df[['Valor_prima_Anual']]

# random_state pins the 88/12 split; without it every execution trains and
# validates on different rows, so the results could not be reproduced.
feature_train, feature_test, target_train, target_test = train_test_split(
    feature, target, test_size=0.12, random_state=42
)

In [None]:
# Report the number of rows in each part of the split.
for description, split_part in (
    ('total feature training features: ', feature_train),
    ('total feature testing features: ', feature_test),
    ('total target training features: ', target_train),
    ('total target testing features: ', target_test),
):
    print(description, len(split_part))

In [None]:
feature_train.shape , target_train.shape



In [None]:
# Derive the categorical columns from this cell's own training frame. The
# global `categorical_features` was last computed for a different frame
# (RUN #5 on df2) and is not guaranteed to match `feature_train`.
lgb_categorical_cols = feature_train.select_dtypes(
    include=['object', 'category']).columns.tolist()

# LightGBM cannot ingest object (string) columns from a DataFrame, so they are
# cast to `category`. The categories are taken from the training split and
# reused for the test split so both are encoded identically; test values never
# seen in training become NaN.
lgb_category_dtypes = {
    col: pd.CategoricalDtype(feature_train[col].dropna().unique())
    for col in lgb_categorical_cols
}

train_data = lightgbm.Dataset(
    feature_train.astype(lgb_category_dtypes),
    label=target_train,
    categorical_feature=lgb_categorical_cols,
)
# reference=train_data makes the validation set reuse the training set's
# feature binning, which LightGBM expects for validation data.
test_data = lightgbm.Dataset(
    feature_test.astype(lgb_category_dtypes),
    label=target_test,
    reference=train_data,
)

In [None]:
#basic parameter:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

In [None]:
# Train with early stopping on the validation set. Early stopping is passed as
# a callback because the `early_stopping_rounds` keyword of lightgbm.train was
# removed in LightGBM 4.0; the callback form also works on older releases.
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=[test_data],
                       num_boost_round=5000,
                       callbacks=[lightgbm.early_stopping(stopping_rounds=100)])

# Show what this section is about: features ranked by total gain contributed
# to the trained model's splits.
feature_importance = pd.Series(
    model.feature_importance(importance_type='gain'),
    index=model.feature_name(),
).sort_values(ascending=False)
feature_importance.head(20)

# HYPERPARAMETER TUNING

In [None]:
# Back to `df` for tuning: same dropped columns as RUN #4 (MARCA kept) and the
# same 80/20 seeded split.
X = df.drop(columns=['Valor_prima_Anual', 'REF1', 'REF2', 'REF3',
                     'Fecha_Emisión', 'Fecha_Inicio', 'Fecha_fin'])
Y = df['Valor_prima_Anual']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Recompute the column lists from the current X_train. The lists left in
# memory were last built in RUN #5 from `df2`, so they may name columns that
# the `df`-based X_train does not have (or miss ones it does).
# include='number' also keeps int32 columns, which pandas >= 2.0 produces for
# the year/month features.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
X_train.head(2)

In [None]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [None]:
# Tune LightGBM inside the preprocessing pipeline. A bare LGBMRegressor cannot
# be fitted on X_train directly because X_train still contains raw string
# (object) columns; the pipeline imputes/encodes them first, and does so inside
# each CV fold.
LGBMR = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LGBMRegressor())])

# Grid keys carry the 'regressor__' prefix so they reach the pipeline's final
# step.
LGBMR_param_grid = {
    'regressor__objective': ['regression'],
    # 'dart' and 'rf' are boosting types, not objectives, so they are searched
    # through `boosting_type` instead of `objective`.
    # NOTE(review): 'goss' from the original list is left out because LightGBM
    # is expected to reject GOSS combined with the bagging settings below --
    # confirm against the installed LightGBM version before re-adding it.
    'regressor__boosting_type': ['gbdt', 'dart', 'rf'],
    'regressor__num_leaves': [7],
    'regressor__learning_rate': [0.01],
    'regressor__n_estimators': [3300],
    'regressor__max_depth': [4],
    'regressor__max_bin': [65],
    'regressor__bagging_fraction': [0.6],
    'regressor__bagging_freq': [9],
    # `colsample_bytree` was removed: it is an alias of `feature_fraction`
    # (already set here) and its value of 0 is outside the valid range.
    'regressor__feature_fraction': [0.1],
    'regressor__feature_fraction_seed': [1],
    'regressor__bagging_seed': [14],
    'regressor__min_data_in_leaf': [5],
    'regressor__min_sum_hessian_in_leaf': [5],
    'regressor__reg_alpha': [0.2],
    'regressor__reg_lambda': [0.1],
}

# 10-fold CV scored by negative MSE (higher is better); n_jobs=-1 uses all cores.
gsLGBMR = GridSearchCV(LGBMR,
                       param_grid=LGBMR_param_grid,
                       cv=10,
                       scoring="neg_mean_squared_error",
                       n_jobs=-1,
                       verbose=1)

gsLGBMR.fit(X_train, y_train)

# Best pipeline (preprocessing + tuned regressor), refitted on all of X_train.
LGBMR_best = gsLGBMR.best_estimator_

gsLGBMR.best_score_

# CONCLUSION HYPERTUNING AND FEATURING IMPORTANCES

1. Optimal hyperparameter tuning is necessary, but due to lack of time I could not finish it.
2. Looking at the feature importances allows us to focus on the activities that boost the business.
3. It would be a great idea to develop a Power BI or Tableau project.

# HYPERPARAMETER TUNING WITH PIPELINE

In [33]:
from lightgbm import LGBMRegressor

# Preprocessing followed by a default LGBMRegressor.
# NOTE: the final step is named 'classifier' although it holds a regressor;
# the name is kept because the parameter grid in the next cell addresses it
# through the 'classifier__' prefix.
LGBMR = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMRegressor()),
])

In [42]:
from sklearn.model_selection import GridSearchCV

# 2 x 2 grid over the LightGBM step of the pipeline.
param_grid = {
    'classifier__n_estimators': [600, 900],
    'classifier__max_depth': [9, 11],
}

# No `scoring`/`cv` is given, so GridSearchCV falls back to its defaults (the
# estimator's own score method); n_jobs=1 runs the fits sequentially.
CV = GridSearchCV(LGBMR, param_grid, n_jobs=1)
CV.fit(X_train, y_train)

print(CV.best_params_)
print(CV.best_score_)

{'classifier__max_depth': 9, 'classifier__n_estimators': 900}
0.899200756272832
