In [1722]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

In [1723]:
dados = pd.read_csv("dados_pre_processados.csv")
dados.drop("Unnamed: 0", axis = "columns", inplace=True)
dados.head()

Unnamed: 0,CodigoNotaFiscal,CodigoCentroSaida,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,Status,QuantidadeAgua,...,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoEsperado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
0,346641,80020,672051,8.0,B,2015-01-16 07:35:31.770,6248051,11080,N,145.6,...,20.83,31.81,340.0,3.5,39.430782,33.321754,52.712434,59.970014,0.535294,2015-01-16
1,346661,80020,672051,8.0,B,2015-01-16 08:20:03.153,6248051,87079,N,145.6,...,19.62,31.21,340.0,3.5,39.430782,31.799067,59.072375,62.074756,0.535294,2015-01-16
2,346695,80020,672051,7.0,B,2015-01-16 10:05:20.293,6248051,43080,N,127.4,...,21.97,36.68,340.0,3.5,39.430782,34.733698,66.954939,58.096029,0.535294,2015-01-16
3,347367,80020,672051,8.0,B,2015-01-20 18:52:57.373,12343051,43080,N,145.6,...,28.21,40.21,340.0,3.5,39.430782,42.118539,42.538107,49.303577,0.535294,2015-01-23
4,347388,80020,672051,6.0,B,2015-01-21 07:08:54.847,6248051,42080,N,109.2,...,25.8,38.74,340.0,3.5,39.430782,39.330637,50.155039,52.444329,0.535294,2015-01-23


In [1724]:
mask_aglomerante = dados["CodigoAglomerante"] == 75051
dados_aglomerante = dados[mask_aglomerante]
dados_aglomerante.head()

Unnamed: 0,CodigoNotaFiscal,CodigoCentroSaida,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,Status,QuantidadeAgua,...,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoEsperado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao
38008,445227,51020,33488051,6.0,B,2017-03-08 14:43:53.140,133258051,812051,N,117.0,...,25.73,35.08,362.0,6.0,46.095231,35.882105,36.338904,39.456298,0.538674,2017-03-10
38066,472682,51020,33535051,8.0,B,2017-11-13 08:06:27.303,225643998,85079,N,143.2,...,24.26,33.4,320.0,6.0,44.353206,28.834552,37.675185,18.856358,0.559375,2017-11-17
38067,473524,51020,33535051,8.0,B,2017-11-21 07:35:58.153,225643998,624051,N,173.6,...,24.24,31.72,320.0,6.0,35.558927,28.817159,30.858086,18.882667,0.678125,2017-11-24
38068,475527,51020,33535051,8.0,B,2017-12-11 08:50:55.937,225643998,731051,N,173.6,...,23.45,31.12,320.0,6.0,35.558927,28.125844,32.707889,19.939631,0.678125,2017-12-15
38069,475669,51020,33535051,8.0,B,2017-12-12 08:02:35.123,225643998,340051,N,173.6,...,20.88,28.39,320.0,6.0,35.558927,25.816563,35.967433,23.642543,0.678125,2017-12-15


In [1725]:
dados_aglomerante["TipoTraco"].value_counts()

N    361
E    105
Name: TipoTraco, dtype: int64

In [1726]:
dados_aglomerante["TipoLancamento"].value_counts()

B    366
C    100
Name: TipoLancamento, dtype: int64

In [1727]:
len(dados_aglomerante)

466

In [1728]:
dados_aglomerante["LogRompimento7Dias"] = np.log(dados_aglomerante["Rompimento7Dias"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1120]:
fig = px.box(dados_aglomerante, x="TipoLancamento", y="Rompimento28Dias", title='Tipo de Lançamento x Rompimento 28 Dias')
fig.show()

In [1121]:
fig = px.box(dados_aglomerante, x="TipoTraco", y="Rompimento28Dias", title='Tipo de Traço x Rompimento 28 Dias')
fig.show()

In [1122]:
fig = px.box(dados_aglomerante, x="FCK", y="Rompimento28Dias", title='FCK x Rompimento 28 Dias')
fig.show()

In [1123]:
fig = px.scatter(dados_aglomerante, x="LogRompimento7Dias", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1124]:
fig = px.scatter(dados_aglomerante, x="RelacaoAguaCimento", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1729]:
fig = px.scatter(dados_aglomerante, x="ResultadoEsperado", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1125]:
fig = px.scatter(dados_aglomerante, x="ConsumoPorM3", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [15]:
fig = px.scatter_3d(dados_aglomerante, x="ConsumoPorM3", y="Rompimento28Dias", z="RelacaoAguaCimento", color="TipoTraco", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1730]:
dados_aglomerante["TipoLancamento"].replace('C', 0, inplace = True)
dados_aglomerante["TipoLancamento"].replace('B', 1, inplace = True)

dados_aglomerante["TipoTraco"].replace('N', 0, inplace = True)
dados_aglomerante["TipoTraco"].replace('E', 1, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1731]:
dados_aglomerante["DataProjecao"] = pd.to_datetime(dados_aglomerante["DataProjecao"])
dados_aglomerante.sort_values("DataProjecao", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1732]:
dados_aglomerante.iloc[50:90]

Unnamed: 0,CodigoNotaFiscal,CodigoCentroSaida,CodigoReceita,Volume,TipoLancamento,DataEmissaoNotaFiscal,CodigoCliente,CodigoMotorista,Status,QuantidadeAgua,...,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoEsperado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,DataProjecao,LogRompimento7Dias
39427,458856,51020,34650051,8.0,1,2017-07-05 08:12:15.767,154417998,66079,N,132.0,...,26.77,289.0,6.0,43.409279,28.118389,16.950633,22.841366,0.570934,2017-07-07,3.1307
39426,458792,51020,34646051,8.5,1,2017-07-04 14:33:39.543,133599051,39080,N,130.9,...,27.27,271.0,6.0,43.625395,23.854127,48.126018,29.571576,0.568266,2017-07-07,2.912894
39413,458875,51020,34642051,7.0,1,2017-07-05 09:27:28.143,12770051,828051,N,144.9,...,31.18,400.0,8.0,47.94787,27.666244,39.196429,23.510016,0.5175,2017-07-07,3.109061
40037,461721,51020,34749051,7.0,1,2017-07-28 11:57:28.027,12873051,173051,N,143.5,...,38.51,400.0,8.0,48.396107,35.767575,21.406053,12.760325,0.5125,2017-07-28,3.456947
40038,461745,51020,34749051,7.0,1,2017-07-28 13:29:54.357,12873051,85079,N,143.5,...,39.11,400.0,8.0,48.396107,34.728298,28.397899,14.012797,0.5125,2017-07-28,3.416414
40042,461969,51020,34749051,7.0,1,2017-07-31 13:50:43.240,12873051,731051,N,146.3,...,40.86,400.0,8.0,47.503784,37.145369,22.262118,11.147125,0.5225,2017-08-04,3.509155
40103,462438,51020,34905051,8.0,0,2017-08-03 14:54:29.060,200789998,517051,N,119.2,...,29.67,278.0,6.0,46.327671,27.42499,34.01084,23.870775,0.535971,2017-08-04,3.097386
40047,462388,51020,34749051,7.0,1,2017-08-03 10:33:36.220,12873051,577051,N,137.9,...,43.15,400.0,8.0,50.231352,34.378325,43.641811,14.441828,0.4925,2017-08-04,3.40253
40039,461855,51020,34749051,7.0,1,2017-07-31 06:32:35.680,12873051,340051,N,146.3,...,43.12,400.0,8.0,47.503784,36.200074,33.705426,12.248292,0.5225,2017-08-04,3.473518
40040,461920,51020,34749051,7.0,1,2017-07-31 10:55:31.060,12873051,517051,N,146.3,...,39.79,400.0,8.0,47.503784,35.109549,28.686934,13.54964,0.5225,2017-08-04,3.431403


In [1733]:
def RemoveOutliers(dados, coluna):
    q1 = dados[coluna].quantile(0.25)
    q3 = dados[coluna].quantile(0.75)
    iqr = q3 - q1
    
    maskOutlier = (dados[coluna] < (q1 - 1.5 * iqr)) | (dados[coluna] > (q3 + 1.5 * iqr))
    retorno = dados[maskOutlier == False]
    
    return retorno


dados_aglomerante = RemoveOutliers(dados_aglomerante, "Rompimento7Dias")
dados_aglomerante = RemoveOutliers(dados_aglomerante, "Rompimento28Dias")
dados_aglomerante = RemoveOutliers(dados_aglomerante, "ConsumoPorM3")
dados_aglomerante = RemoveOutliers(dados_aglomerante, "RelacaoAguaCimento")

In [1734]:
dados_aglomerante_modelo = dados_aglomerante[["ResultadoEsperado", "Rompimento7Dias", "Rompimento28Dias", "ResultadoProjetado", "DataProjecao"]]

#beleza até 4/8/2017

data_projecao_atual = dt.datetime(2017, 10, 27)
data_fim_busca_dados = data_projecao_atual - dt.timedelta(weeks = 4)

#mask_data_projecao_treino_inicio = dados_aglomerante_modelo["DataProjecao"] >= dt.datetime(2017, 7, 7)
mask_data_projecao_treino_final = dados_aglomerante_modelo["DataProjecao"] <= data_fim_busca_dados
mask_data_projecao_teste = dados_aglomerante_modelo["DataProjecao"] == data_projecao_atual

#dados_treino = dados_aglomerante_modelo[mask_data_projecao_treino_inicio & mask_data_projecao_treino_final]
dados_treino = dados_aglomerante_modelo[mask_data_projecao_treino_final]
dados_teste = dados_aglomerante_modelo[mask_data_projecao_teste]

In [1735]:
dados_treino

Unnamed: 0,ResultadoEsperado,Rompimento7Dias,Rompimento28Dias,ResultadoProjetado,DataProjecao
38008,46.095231,25.73,35.08,35.882105,2017-03-10
38107,46.998291,22.32,31.9,31.821606,2017-03-24
38106,46.506734,23.26,30.93,32.951628,2017-03-24
38113,46.020318,22.01,30.6,31.447056,2017-03-31
38111,46.506734,19.91,30.3,28.883882,2017-03-31
38110,46.506734,23.24,31.02,32.927673,2017-03-31
38109,46.506734,22.74,30.83,32.32756,2017-03-31
38108,46.506734,23.09,31.3,32.747889,2017-03-31
38112,46.506734,21.95,30.6,31.374452,2017-03-31
38428,50.156848,23.6,32.41,33.358292,2017-04-14


In [1736]:
x_train = dados_treino.drop(["Rompimento28Dias", "DataProjecao"], axis = 1)
y_train = dados_treino["Rompimento28Dias"].values

x_test = dados_teste.drop(["Rompimento28Dias", "DataProjecao"], axis = 1)
y_test = dados_teste["Rompimento28Dias"].values

In [1737]:
y_train = y_train.reshape(-1, 1)
#x_train = x_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
#x_test = x_test.reshape(-1, 1)

In [1738]:
x_train_modelo = x_train.drop("ResultadoProjetado", axis = 1)
x_test_modelo = x_test.drop("ResultadoProjetado", axis = 1)

In [1739]:
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])
    
class Debug(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):        
        print(X)
        
parameters = {'loss':('ls', 'lad'),
              'learning_rate': np.arange(0, 0.15, 0.1),
              'min_samples_leaf': np.arange(1, 5, 1),
              'min_samples_split': np.arange(1, 4, 1),
              'n_estimators': np.arange(15, 50, 5)}

#loss = 'ls', learning_rate = 0.32, min_samples_leaf = 2, min_samples_split = 3, n_estimators = 5

modelo = Pipeline([
  #('features', FeatureUnion(
  #    transformer_list=[
  #        ('numericals', Pipeline([
  #            ('scaler', MinMaxScaler())
  #        ]))
  #    ]
  #)),
  #('model', GridSearchCV(estimator = GradientBoostingRegressor(random_state = 1), 
  #                       param_grid = parameters,
  #                       cv=10))
  #('model', GradientBoostingRegressor(random_state = 1, learning_rate = 0.3, min_samples_leaf = 2, min_samples_split = 4, n_estimators = 6))
  #('model', LocallyLinearEmbedding())
  #('model', DecisionTreeRegressor())
  ('model', xgb.XGBRegressor())
])

#{'ls', 'lad', 'huber', 'quantile'}

In [1740]:
modelo.fit(x_train_modelo, y_train)
y_pred = modelo.predict(x_test_modelo)

In [1741]:
#gs = modelo.named_steps['model']
#gs.feature_importances_
#coef = pd.Series(lasso.coef_, index = X.columns)
#lasso.coef_

#loss = 'ls', learning_rate = 0.32, min_samples_leaf = 2, min_samples_split = 3, n_estimators = 5

In [1742]:
df = pd.DataFrame(columns = ["Rompimento28Dias", "ResultadoProjetado", "ResultadoPredito"])

df["Rompimento28Dias"] = y_test.reshape(1, -1)[0]
df["ResultadoProjetado"] = x_test["ResultadoProjetado"].values
df["ResultadoPredito"] = y_pred

In [1743]:
df.head()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito
0,31.25,31.859329,33.928204
1,30.49,29.576969,31.598003
2,31.01,29.587471,31.598003
3,39.12,35.886319,34.182568
4,30.58,24.397753,26.974163


In [1744]:
df["ErroProjetado"] = abs(df["ResultadoProjetado"] - df["Rompimento28Dias"])
df["ErroPredito"] = abs(df["ResultadoPredito"] - df["Rompimento28Dias"])

In [1745]:
df.head()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito,ErroProjetado,ErroPredito
0,31.25,31.859329,33.928204,0.609329,2.678204
1,30.49,29.576969,31.598003,0.913031,1.108003
2,31.01,29.587471,31.598003,1.422529,0.588003
3,39.12,35.886319,34.182568,3.233681,4.937432
4,30.58,24.397753,26.974163,6.182247,3.605837


In [1746]:
indices_treino = x_train.index
df_treino = dados_aglomerante.loc[indices_treino]
df_treino.describe()

Unnamed: 0,CodigoNotaFiscal,CodigoCentroSaida,CodigoReceita,Volume,TipoLancamento,CodigoCliente,CodigoMotorista,QuantidadeAgua,ConsumoTotal,Idade,...,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoEsperado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,LogRompimento7Dias
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,...,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,454822.116667,51020.0,34199350.0,6.916667,0.883333,65160180.0,396375.616667,122.49,247.675,28.0,...,27.6915,35.933333,359.116667,7.0,49.905753,35.42638,31.507064,29.374894,0.499377,3.295003
std,5572.799419,0.0,459058.8,0.782037,0.323732,58885070.0,298519.988395,19.336404,41.416524,0.0,...,6.196214,6.490681,51.435361,1.008439,5.640552,7.090533,11.080815,12.406185,0.060886,0.23447
min,445227.0,51020.0,33488050.0,5.0,0.0,2013051.0,39080.0,78.5,159.0,28.0,...,16.33,24.99,241.0,6.0,39.749812,22.324239,10.11236,8.027701,0.406091,2.793004
25%,449644.25,51020.0,33840050.0,6.5,1.0,12560050.0,91829.0,104.225,228.3625,28.0,...,22.8525,30.905,354.0,6.0,46.506734,31.356171,21.8787,16.474002,0.440321,3.129056
50%,452901.5,51020.0,33964050.0,7.0,1.0,12873050.0,333051.0,130.25,256.1,28.0,...,26.505,34.73,383.5,7.0,50.118303,34.880885,32.772799,31.886081,0.493711,3.27733
75%,459845.25,51020.0,34670050.0,7.0,1.0,100546400.0,709801.0,137.9,280.0,28.0,...,33.015,41.9525,394.0,8.0,55.354975,37.587015,38.346351,40.314969,0.533898,3.496937
max,462802.0,51020.0,34910050.0,8.5,1.0,200790000.0,911051.0,147.2,315.2,28.0,...,37.49,46.78,400.0,8.0,58.994595,49.191669,58.539529,49.413214,0.618257,3.624074


In [1747]:
fig = px.scatter(df_treino, x="ConsumoPorM3", y="Rompimento7Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1748]:
fig = px.scatter(df_treino, x="RelacaoAguaCimento", y="Rompimento28Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1749]:
CodigoReceita - indefinido
Volume - não
TipoLancamento - não
CodigoCliente - não
CodigoMotorista - não
TempoPermanenciaObra - indefinido
ConsumoTotal - não
TipoTraco - sim
Rompimento7Dias - sim
ConsumoPorM3 - sim
DesvioAplicado - sim
RelacaoAguaCimento - sim


NameError: name 'CodigoReceita' is not defined

In [1750]:
CodigoReceita	Volume	TipoLancamento	DataEmissaoNotaFiscal	CodigoCliente	CodigoMotorista	TempoPermanenciaObra	ConsumoTotal	Idade	TipoTraco	...	Rompimento7Dias	Rompimento28Dias	ConsumoPorM3	DesvioAplicado	ResultadoProjetado	PercentualCrescimentoReal	PercentualCrescimentoProjetado	RelacaoAguaCimento	DataProjecao	LogRompimento7Dias

SyntaxError: invalid syntax (<ipython-input-1750-c3d40928f3c6>, line 1)

In [1751]:
indices_teste = x_test.index
df_teste = dados_aglomerante.loc[indices_teste]
df_teste.describe()

Unnamed: 0,CodigoNotaFiscal,CodigoCentroSaida,CodigoReceita,Volume,TipoLancamento,CodigoCliente,CodigoMotorista,QuantidadeAgua,ConsumoTotal,Idade,...,Rompimento7Dias,Rompimento28Dias,ConsumoPorM3,DesvioAplicado,ResultadoEsperado,ResultadoProjetado,PercentualCrescimentoReal,PercentualCrescimentoProjetado,RelacaoAguaCimento,LogRompimento7Dias
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,470964.166667,51020.0,34614970.0,6.416667,0.583333,57975620.0,269479.416667,100.983333,206.529167,28.0,...,24.344167,31.4425,322.416667,6.666667,50.53218,31.531245,30.528079,29.899877,0.490278,3.185135
std,161.593673,0.0,261540.0,1.183856,0.514929,98186720.0,240155.457131,25.095883,52.329891,0.0,...,2.928287,2.661316,57.750967,0.984732,3.179933,3.048375,16.117146,3.43348,0.034107,0.127639
min,470729.0,51020.0,34097050.0,5.0,0.0,500051.0,12080.0,69.0,136.0,28.0,...,17.63,28.5,241.0,6.0,44.287966,24.397753,13.410267,25.345157,0.430348,2.869602
25%,470845.5,51020.0,34408050.0,5.0,0.0,1629051.0,63079.0,75.5,162.5,28.0,...,22.79,30.3275,272.0,6.0,48.613729,29.964479,19.003263,28.012775,0.464615,3.12628
50%,470947.0,51020.0,34642050.0,6.75,1.0,2013051.0,224051.0,102.8,198.4,28.0,...,24.605,30.795,325.0,6.0,51.17491,31.843951,29.791632,29.420659,0.4825,3.202949
75%,471097.25,51020.0,34907050.0,7.125,1.0,62579540.0,419051.0,118.375,247.9375,28.0,...,25.9275,31.6975,373.75,8.0,52.90684,33.190275,36.155958,31.483069,0.51011,3.255287
max,471189.0,51020.0,34907050.0,8.0,1.0,223022000.0,731051.0,135.1,280.0,28.0,...,28.63,39.12,400.0,8.0,56.390727,35.886319,73.454339,38.387706,0.560166,3.354455


In [1752]:
fig = px.scatter(df_teste, x="ConsumoPorM3", y="Rompimento7Dias", title='Rompimento7Dias x Rompimento 28 Dias')
fig.show()

In [1753]:
df["ErroProjetado"].sum()

25.610770175289048

In [1754]:
df["ErroPredito"].sum()

32.317529525756825

In [1755]:
mask_modelo_predito_melhor = df["ErroPredito"] < df["ErroProjetado"]
quantidade_nf_modelo_eh_melhor = len(df[mask_modelo_predito_melhor])
quantidade_total = len(df)

quantidade_nf_modelo_eh_melhor/quantidade_total

0.25

In [1444]:
df.describe()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito,ErroProjetado,ErroPredito
count,19.0,19.0,19.0,19.0,19.0
mean,33.741053,32.079168,33.45669,1.866107,2.129453
std,6.19101,5.486033,4.037649,1.426281,1.615025
min,25.0,23.804703,27.195072,0.202925,0.023054
25%,29.42,28.115993,30.985889,1.218011,1.047913
50%,31.89,30.59012,31.881715,1.606217,1.925889
75%,36.91,35.556623,36.033054,1.971129,2.756297
max,45.08,43.605213,43.231871,5.307704,6.008174


In [1483]:
fig = px.scatter(df, y=["ResultadoPredito", "Rompimento28Dias", "ResultadoProjetado"], title='FCK x Rompimento 28 Dias')
fig.show()

0.7777777777777778

In [722]:
0.7019867549668874 - 0,3 - min_samples_split = 4 - GradientBoostingRegressor | Romp7Dias, ConsumoPorM3, RelacaoAguaCimento, TipoLancamento, TipoTraco | Removendo outliers

SyntaxError: invalid syntax (<ipython-input-722-d16c45cfe617>, line 1)

In [75]:
df.describe()

Unnamed: 0,Rompimento28Dias,ResultadoProjetado,ResultadoPredito,ErroProjetado,ErroPredito
count,151.0,151.0,151.0,151.0,151.0
mean,32.371722,32.517567,32.342284,2.534797,1.597965
std,5.62111,5.654231,5.316987,1.67175,1.140082
min,20.15,15.878297,20.231588,0.042719,0.015537
25%,28.715,28.71598,28.739299,1.306476,0.733631
50%,32.51,32.263872,31.838203,2.009991,1.288044
75%,35.925,36.005115,36.029348,3.809451,2.122222
max,46.6,48.436012,47.008568,7.980361,5.715299
