In [616]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

In [617]:
dados = pd.read_csv("dados_pre_processados.csv")
dados.drop("Unnamed: 0", axis = "columns", inplace=True)
dados.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
0,672051,8.0,B,2015-01-16 07:35:31.770,6248051,11080,02:41:00,2.720.000,28,N,...,30.0,20.83,31.81,340.0,3.5,33.321754,52.712434,59.970014,0.535294,2015-01-16
1,672051,8.0,B,2015-01-16 08:20:03.153,6248051,87079,02:46:00,2.720.000,28,N,...,30.0,19.62,31.21,340.0,3.5,31.799067,59.072375,62.074756,0.535294,2015-01-16
2,672051,7.0,B,2015-01-16 10:05:20.293,6248051,43080,01:32:00,2.380.000,28,N,...,30.0,21.97,36.68,340.0,3.5,34.733698,66.954939,58.096029,0.535294,2015-01-16
3,672051,8.0,B,2015-01-20 18:52:57.373,12343051,43080,01:10:00,2.720.000,28,N,...,30.0,28.21,40.21,340.0,3.5,42.118539,42.538107,49.303577,0.535294,2015-01-23
4,672051,6.0,B,2015-01-21 07:08:54.847,6248051,42080,00:35:00,2.040.000,28,N,...,30.0,25.8,38.74,340.0,3.5,39.330637,50.155039,52.444329,0.535294,2015-01-23


In [640]:
mask_aglomerante = dados["CodigoAglomerante"] == 109051
dados_aglomerante = dados[mask_aglomerante]
dados_aglomerante.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
50797,43970051,5.0,B,2019-08-07 12:30:15.877,94133998,989051,00:24:00,1.165.000,28,N,...,20.0,14.27,24.19,233.0,6.5,23.285522,69.516468,63.178149,0.798283,2019-08-09
50798,43970051,6.0,B,2019-08-19 10:05:12.940,77776998,104079,00:21:00,1.398.000,28,N,...,20.0,14.32,24.65,233.0,6.5,24.541518,72.136872,71.379314,0.798283,2019-08-23
50799,43970051,6.0,B,2019-08-26 09:21:01.200,11116051,189079,00:19:00,1.398.000,28,N,...,20.0,17.45,26.68,233.0,5.5,28.757142,52.893983,64.797377,0.798283,2019-08-30
50800,43993051,8.5,B,2019-07-16 06:59:26.483,851078998,95079,00:32:00,2.703.000,28,E,...,30.0,19.58,37.2,318.0,4.5,27.388509,89.989785,39.880024,0.600814,2019-07-19
50801,43993051,8.5,B,2019-07-16 07:33:02.217,851078998,104079,00:16:00,2.703.000,28,E,...,30.0,18.61,30.88,318.0,4.5,26.224568,65.932294,40.916539,0.600814,2019-07-19


In [641]:
len(dados_aglomerante)

1296

In [642]:
fig = px.box(dados_aglomerante, x="TipoLancamento", y="Rompimento28Dias", title='Tipo de Lançamento x Rompimento 28 Dias')
fig.show()

In [643]:
fig = px.box(dados_aglomerante, x="TipoTraco", y="Rompimento28Dias", title='Tipo de Traço x Rompimento 28 Dias')
fig.show()

In [644]:
fig = px.box(dados_aglomerante, x="FCK", y="Rompimento28Dias", title='FCK x Rompimento 28 Dias')
fig.show()

In [645]:
fig = px.scatter(dados_aglomerante, x="Rompimento7Dias", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [646]:
fig = px.scatter(dados_aglomerante, x="RelacaoAguaCimento", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [625]:
dados_aglomerante["TipoLancamento"].replace('C', 0, inplace = True)
dados_aglomerante["TipoLancamento"].replace('B', 1, inplace = True)

dados_aglomerante["TipoTraco"].replace('N', 0, inplace = True)
dados_aglomerante["TipoTraco"].replace('E', 1, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [626]:
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])
    
class Debug(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        print(X)

modelo = Pipeline([
  ('features', FeatureUnion(
      transformer_list=[
          ('numericals', Pipeline([
              ('scaler', MinMaxScaler())
          ]))
      ]
  )),
  ('model', GradientBoostingRegressor(learning_rate = 0.3, min_samples_split = 4, random_state = 1))
  #('model', DecisionTreeRegressor())
  #('model', xgb.XGBRegressor())
])

In [627]:
dados_aglomerante.head()

Unnamed: 0,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,TempoPermanenciaObra,ConsumoTotal,Idade,TipoTraco,...,FCK,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
808,156051,6.5,0,2015-01-21 17:12:35.177,90003051,578051,00:07:00,1.586.000,28,0,...,20.0,15.14,27.15,244.0,4.0,25.314546,79.326288,67.203078,0.668348,2015-01-23
809,16051,8.0,0,2015-01-07 07:02:53.043,90203051,13060,01:59:00,2.176.000,28,0,...,25.0,19.13,26.26,272.0,4.0,31.193902,37.271302,63.062737,0.5625,2015-01-09
810,16051,6.5,0,2015-01-12 12:51:52.987,1511051,727051,01:36:00,1.768.000,28,0,...,25.0,22.14,28.92,272.0,4.0,35.529446,30.623306,60.476269,0.570136,2015-01-16
811,16051,8.0,0,2015-01-15 09:11:42.660,90203051,624051,01:40:00,2.176.000,28,0,...,25.0,22.25,27.78,272.0,4.0,35.686452,24.853933,60.388546,0.573529,2015-01-16
812,16051,5.5,0,2015-01-15 11:57:31.167,90203051,828051,01:00:00,1.496.000,28,0,...,25.0,23.59,33.11,272.0,4.0,37.591475,40.356083,59.353435,0.573529,2015-01-16


In [628]:
def RemoveOutliers(dados, coluna):
    q1 = dados[coluna].quantile(0.25)
    q3 = dados[coluna].quantile(0.75)
    iqr = q3 - q1
    
    maskOutlier = (dados[coluna] < (q1 - 1.5 * iqr)) | (dados[coluna] > (q3 + 1.5 * iqr))
    retorno = dados[maskOutlier == False]
    
    return retorno


dados_aglomerante = RemoveOutliers(dados_aglomerante, "Rompimento7Dias")
dados_aglomerante = RemoveOutliers(dados_aglomerante, "Rompimento28Dias")
dados_aglomerante = RemoveOutliers(dados_aglomerante, "RelacaoAguaCimento")

In [629]:
dados_aglomerante_modelo = dados_aglomerante[["Rompimento7Dias", "ConsumoPorM3", "RelacaoAguaCimento", "TipoLancamento", "TipoTraco", "Rompimento28Dias", "ResultadoProjetado"]]

X = dados_aglomerante_modelo[["Rompimento7Dias", "ConsumoPorM3", "RelacaoAguaCimento", "TipoLancamento", "TipoTraco", "ResultadoProjetado"]]
Y = dados_aglomerante_modelo["Rompimento28Dias"].values

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state = 1)

In [630]:
y_train = y_train.reshape(-1, 1)
#x_train = x_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
#x_test = x_test.reshape(-1, 1)

In [631]:
x_train_modelo = x_train[["Rompimento7Dias", "ConsumoPorM3", "RelacaoAguaCimento", "TipoLancamento", "TipoTraco"]]
x_test_modelo = x_test[["Rompimento7Dias", "ConsumoPorM3", "RelacaoAguaCimento", "TipoLancamento", "TipoTraco"]]

In [632]:
modelo.fit(x_train_modelo, y_train)
y_pred = modelo.predict(x_test_modelo)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [633]:
df = pd.DataFrame(columns = ["Rompimento28Dias", "ResultadoProjetado", "ResultadoPredito"])

df["Rompimento28Dias"] = y_test.reshape(1, -1)[0]
df["ResultadoProjetado"] = x_test["ResultadoProjetado"].values
df["ResultadoPredito"] = y_pred

In [634]:
df.head()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito
0,37.18,39.147484,39.421482
1,34.36,47.27074,42.273351
2,39.4,40.310872,40.409612
3,28.4,31.949387,27.486319
4,39.84,40.241639,36.29798


In [635]:
df["ErroProjetado"] = abs(df["ResultadoProjetado"] - df["Rompimento28Dias"])
df["ErroPredito"] = abs(df["ResultadoPredito"] - df["Rompimento28Dias"])

In [636]:
df.head()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito,ErroProjetado,ErroPredito
0,37.18,39.147484,39.421482,1.967484,2.241482
1,34.36,47.27074,42.273351,12.91074,7.913351
2,39.4,40.310872,40.409612,0.910872,1.009612
3,28.4,31.949387,27.486319,3.549387,0.913681
4,39.84,40.241639,36.29798,0.401639,3.54202


In [637]:
df["ErroProjetado"].sum()

559.807710898782

In [638]:
df["ErroPredito"].sum()

466.0479378469298

In [639]:
mask_modelo_predito_melhor = df["ErroPredito"] < df["ErroProjetado"]
quantidade_nf_modelo_eh_melhor = len(df[mask_modelo_predito_melhor])
quantidade_total = len(df)

quantidade_nf_modelo_eh_melhor/quantidade_total

0.5319148936170213

In [566]:
0.7019867549668874 - 0,3 - min_samples_split = 4 - GradientBoostingRegressor | Romp7Dias, ConsumoPorM3, RelacaoAguaCimento, TipoLancamento, TipoTraco | Removendo outliers

SyntaxError: cannot assign to operator (<ipython-input-566-943311438422>, line 1)