In [332]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

In [333]:
# Load the pre-processed dataset and discard the "Unnamed: 0" column
# that comes along with the CSV, then preview the first rows.
dados = pd.read_csv("dados_pre_processados.csv").drop(columns="Unnamed: 0")
dados.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
0,672051,8.0,B,2015-01-16 07:35:31.770,6248051,11080,02:41:00,2.720.000,28,N,...,30.0,20.83,31.81,340.0,3.5,33.321754,52.712434,59.970014,0.535294,2015-01-16
1,672051,8.0,B,2015-01-16 08:20:03.153,6248051,87079,02:46:00,2.720.000,28,N,...,30.0,19.62,31.21,340.0,3.5,31.799067,59.072375,62.074756,0.535294,2015-01-16
2,672051,7.0,B,2015-01-16 10:05:20.293,6248051,43080,01:32:00,2.380.000,28,N,...,30.0,21.97,36.68,340.0,3.5,34.733698,66.954939,58.096029,0.535294,2015-01-16
3,672051,8.0,B,2015-01-20 18:52:57.373,12343051,43080,01:10:00,2.720.000,28,N,...,30.0,28.21,40.21,340.0,3.5,42.118539,42.538107,49.303577,0.535294,2015-01-23
4,672051,6.0,B,2015-01-21 07:08:54.847,6248051,42080,00:35:00,2.040.000,28,N,...,30.0,25.8,38.74,340.0,3.5,39.330637,50.155039,52.444329,0.535294,2015-01-23


In [334]:
# Keep only the rows whose CodigoAglomerante equals 75051.
mask_aglomerante = dados["CodigoAglomerante"] == 75051

# Take an explicit copy: this subset is modified by later cells, and
# mutating a boolean-mask slice of `dados` raises SettingWithCopyWarning
# and leaves it ambiguous whether `dados` itself is affected.
dados_aglomerante = dados.loc[mask_aglomerante].copy()
dados_aglomerante.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
38008,33488051,6.0,B,2017-03-08 14:43:53.140,133258051,812051,00:54:00,2.172.000,28,N,...,30.0,25.73,35.08,362.0,6.0,35.882105,36.338904,39.456298,0.538674,2017-03-10
38066,33535051,8.0,B,2017-11-13 08:06:27.303,225643998,85079,01:05:00,2.560.000,28,N,...,25.0,24.26,33.4,320.0,6.0,28.834552,37.675185,18.856358,0.559375,2017-11-17
38067,33535051,8.0,B,2017-11-21 07:35:58.153,225643998,624051,01:39:00,2.560.000,28,N,...,25.0,24.24,31.72,320.0,6.0,28.817159,30.858086,18.882667,0.678125,2017-11-24
38068,33535051,8.0,B,2017-12-11 08:50:55.937,225643998,731051,01:09:00,2.560.000,28,N,...,25.0,23.45,31.12,320.0,6.0,28.125844,32.707889,19.939631,0.678125,2017-12-15
38069,33535051,8.0,B,2017-12-12 08:02:35.123,225643998,340051,01:28:00,2.560.000,28,N,...,25.0,20.88,28.39,320.0,6.0,25.816563,35.967433,23.642543,0.678125,2017-12-15


In [335]:
# Number of rows left after filtering.
dados_aglomerante.shape[0]

466

In [336]:
# Distribution of Rompimento28Dias for each TipoLancamento value.
fig = px.box(
    data_frame=dados_aglomerante,
    x="TipoLancamento",
    y="Rompimento28Dias",
    title='Tipo de Lançamento x Rompimento 28 Dias',
)
fig.show()

In [337]:
# Distribution of Rompimento28Dias for each TipoTraco value.
fig = px.box(
    data_frame=dados_aglomerante,
    x="TipoTraco",
    y="Rompimento28Dias",
    title='Tipo de Traço x Rompimento 28 Dias',
)
fig.show()

In [338]:
# Distribution of Rompimento28Dias for each FCK value.
fig = px.box(
    data_frame=dados_aglomerante,
    x="FCK",
    y="Rompimento28Dias",
    title='FCK x Rompimento 28 Dias',
)
fig.show()

In [339]:
# Encode the two categorical columns as 0/1 flags:
#   TipoLancamento: 'C' -> 0, 'B' -> 1
#   TipoTraco:      'N' -> 0, 'E' -> 1
#
# The frame is rebuilt with `assign` instead of calling
# `Series.replace(..., inplace=True)` on a column pulled out of it. That
# chained in-place form emits SettingWithCopyWarning and does not update
# the parent frame at all once pandas copy-on-write is enabled.
# Values other than the listed codes are left untouched, so re-running
# this cell is harmless.
dados_aglomerante = dados_aglomerante.assign(
    TipoLancamento=dados_aglomerante["TipoLancamento"].replace({"C": 0, "B": 1}),
    TipoTraco=dados_aglomerante["TipoTraco"].replace({"N": 0, "E": 1}),
)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [340]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype
        Dtype specification forwarded to ``DataFrame.select_dtypes``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        ``X`` must be a pandas DataFrame; anything else fails the assertion.
        """
        assert isinstance(X, pd.DataFrame)
        dtypes_desejados = [self.dtype]
        return X.select_dtypes(include=dtypes_desejados)
    
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through transformer that prints the data flowing through it.

    Parameters
    ----------
    dtype
        Stored on the instance but not used by ``fit`` or ``transform``;
        kept so existing constructor calls continue to work.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Print ``X`` and return it unchanged.

        Returning ``X`` is what makes this usable as a pipeline step:
        without it the next step would receive ``None``.
        """
        print(X)
        return X

# Single-step pipeline wrapping the regressor used in the fit/predict cell.
#
# Alternatives that were left disabled in the original cell:
#   * a 'features' FeatureUnion step applying MinMaxScaler to the numericals
#   * ('model', DecisionTreeRegressor())
#     NOTE(review): DecisionTreeRegressor is not among the notebook's
#     imports; add the import before re-enabling it.
#   * ('model', xgb.XGBRegressor())
modelo = Pipeline(
    steps=[
        (
            'model',
            GradientBoostingRegressor(
                learning_rate=0.22,
                n_estimators=12,
                random_state=1,
            ),
        ),
    ]
)

In [341]:
# Preview the subset again after the 0/1 encoding of TipoLancamento and TipoTraco.
dados_aglomerante.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
38008,33488051,6.0,1,2017-03-08 14:43:53.140,133258051,812051,00:54:00,2.172.000,28,0,...,30.0,25.73,35.08,362.0,6.0,35.882105,36.338904,39.456298,0.538674,2017-03-10
38066,33535051,8.0,1,2017-11-13 08:06:27.303,225643998,85079,01:05:00,2.560.000,28,0,...,25.0,24.26,33.4,320.0,6.0,28.834552,37.675185,18.856358,0.559375,2017-11-17
38067,33535051,8.0,1,2017-11-21 07:35:58.153,225643998,624051,01:39:00,2.560.000,28,0,...,25.0,24.24,31.72,320.0,6.0,28.817159,30.858086,18.882667,0.678125,2017-11-24
38068,33535051,8.0,1,2017-12-11 08:50:55.937,225643998,731051,01:09:00,2.560.000,28,0,...,25.0,23.45,31.12,320.0,6.0,28.125844,32.707889,19.939631,0.678125,2017-12-15
38069,33535051,8.0,1,2017-12-12 08:02:35.123,225643998,340051,01:28:00,2.560.000,28,0,...,25.0,20.88,28.39,320.0,6.0,25.816563,35.967433,23.642543,0.678125,2017-12-15


In [342]:
# Restrict the subset to the three columns the experiment uses.
dados_aglomerante_modelo = dados_aglomerante.loc[
    :, ["Rompimento7Dias", "Rompimento28Dias", "ResultadoProjetado"]
]

# Features keep ResultadoProjetado alongside Rompimento7Dias so the projected
# value stays row-aligned with each test sample after the split.
# The target is Rompimento28Dias.
X = dados_aglomerante_modelo.loc[:, ["Rompimento7Dias", "ResultadoProjetado"]]
Y = dados_aglomerante_modelo["Rompimento28Dias"].to_numpy()

# 67% / 33% split with a fixed seed so the results are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=1,
)

In [343]:
# Turn both target arrays into single-column 2-D arrays of shape (n, 1).
# The feature frames are left as they are.
y_train, y_test = (alvo.reshape(-1, 1) for alvo in (y_train, y_test))

In [344]:
# The model sees Rompimento7Dias only, as an (n, 1) feature matrix.
x_train_modelo = x_train["Rompimento7Dias"].to_numpy().reshape(-1, 1)
x_test_modelo = x_test["Rompimento7Dias"].to_numpy().reshape(-1, 1)

In [345]:
# Fit on the single-feature training matrix. The target is flattened to 1-D
# because scikit-learn regressors expect y of shape (n_samples,); passing
# the (n, 1) column vector triggers a DataConversionWarning.
modelo.fit(x_train_modelo, np.ravel(y_train))

# Predictions for the held-out rows, as a 1-D array.
y_pred = modelo.predict(x_test_modelo)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [346]:
# One row per test sample: the observed value, the projection that came with
# the dataset, and the model's prediction.
df = pd.DataFrame(
    {
        "Rompimento28Dias": y_test.reshape(1, -1)[0],
        "ResultadoProjetado": x_test["ResultadoProjetado"].values,
        "ResultadoPredito": y_pred,
    }
)

In [347]:
# Absolute error of each estimate against the observed Rompimento28Dias.
df["ErroProjetado"] = (df["ResultadoProjetado"] - df["Rompimento28Dias"]).abs()
df["ErroPredito"] = (df["ResultadoPredito"] - df["Rompimento28Dias"]).abs()

In [348]:
# Preview the comparison table (observed, projected, predicted, and both absolute errors).
df.head()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito,ErroProjetado,ErroPredito
0,28.41,33.010421,30.875256,4.600421,2.465256
1,30.0,34.044437,35.734285,4.044437,5.734285
2,33.17,35.628759,32.64765,2.458759,0.52235
3,34.09,29.838738,33.364843,4.251262,0.725157
4,26.27,24.704825,24.821577,1.565175,1.448423


In [349]:
# Total absolute error of ResultadoProjetado over the test rows.
df["ErroProjetado"].sum()

372.3348749931759

In [350]:
# Total absolute error of the model's predictions over the test rows.
df["ErroPredito"].sum()

280.04967977847525

In [351]:
# Rows where the model's absolute error is strictly smaller than the
# projection's absolute error.
mask_modelo_predito_melhor = df["ErroPredito"].lt(df["ErroProjetado"])
quantidade_nf_modelo_eh_melhor = int(mask_modelo_predito_melhor.sum())
quantidade_total = df.shape[0]

# Fraction of test rows on which the model beats the projection.
quantidade_nf_modelo_eh_melhor / quantidade_total

0.6493506493506493

In [None]:
# Experiment log: win rate 0.6493506493506493 with GradientBoostingRegressor,
# learning_rate = 0.22, n_estimators = 12.
# Kept as a comment: as bare code this note parses as a tuple expression
# ending in `22 - 12 - GradientBoostingRegressor`, which raises TypeError
# when the notebook is run top to bottom.