# Data modeling

## Library import

In [12]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error as mse

## Data import

In [13]:
# Load the baseline dataset of log-transformed columns from the interim data folder.
df = pd.read_parquet('../data/interim/df_log_data_baseline.parquet')

In [14]:
# Preview the loaded frame (pandas abbreviates the rows and columns it shows).
df

Unnamed: 0,log_MSSubClass,log_LotArea,log_OverallQual,log_OverallCond,log_YearBuilt,log_YearRemodAdd,log_MasVnrArea,log_BsmtFinSF1,log_BsmtFinSF2,log_BsmtUnfSF,...,log_WoodDeckSF,log_OpenPorchSF,log_EnclosedPorch,log_3SsnPorch,log_ScreenPorch,log_PoolArea,log_MiscVal,log_MoSold,log_YrSold,log_SalePrice
0,4.094345,9.041922,1.945910,1.609438,7.602401,7.602401,5.283204,6.561031,0.000000,5.017280,...,0.000000,4.127134,0.000000,0.0,0.0,0.0,0.000000,0.693147,7.604894,12.247694
1,2.995732,9.169518,1.791759,2.079442,7.588830,7.588830,0.000000,6.886532,0.000000,5.652489,...,5.700444,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.609438,7.604396,12.109011
2,4.094345,9.328123,1.945910,1.609438,7.601402,7.601902,5.093750,6.188264,0.000000,6.075346,...,0.000000,3.761200,0.000000,0.0,0.0,0.0,0.000000,2.197225,7.604894,12.317167
3,4.248495,9.164296,1.945910,1.609438,7.557473,7.585789,0.000000,5.379897,0.000000,6.293419,...,0.000000,3.583519,5.609472,0.0,0.0,0.0,0.000000,0.693147,7.603898,11.849398
4,4.094345,9.565214,2.079442,1.609438,7.600902,7.600902,5.860786,6.486161,0.000000,6.196444,...,5.262690,4.442651,0.000000,0.0,0.0,0.0,0.000000,2.484907,7.604894,12.429216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,4.094345,8.976768,1.791759,1.609438,7.600402,7.600902,0.000000,0.000000,0.000000,6.860664,...,0.000000,3.713572,0.000000,0.0,0.0,0.0,0.000000,2.079442,7.604396,12.072541
1456,2.995732,9.486076,1.791759,1.791759,7.589842,7.594884,4.787492,6.673298,5.099866,6.380123,...,5.857933,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.693147,7.605890,12.254863
1457,4.248495,9.109636,1.945910,2.197225,7.570959,7.603898,0.000000,5.620401,0.000000,6.777647,...,0.000000,4.110874,0.000000,0.0,0.0,0.0,7.824446,1.609438,7.605890,12.493130
1458,2.995732,9.181632,1.609438,1.791759,7.575585,7.598900,0.000000,3.912023,6.937314,0.000000,...,5.905362,0.000000,4.727388,0.0,0.0,0.0,0.000000,1.386294,7.605890,11.864462


## Data inspection

In [15]:
# Dtypes and non-null counts per column, used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 36 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   log_MSSubClass     1460 non-null   float64
 1   log_LotArea        1460 non-null   float64
 2   log_OverallQual    1460 non-null   float64
 3   log_OverallCond    1460 non-null   float64
 4   log_YearBuilt      1460 non-null   float64
 5   log_YearRemodAdd   1460 non-null   float64
 6   log_MasVnrArea     1460 non-null   float64
 7   log_BsmtFinSF1     1460 non-null   float64
 8   log_BsmtFinSF2     1460 non-null   float64
 9   log_BsmtUnfSF      1460 non-null   float64
 10  log_TotalBsmtSF    1460 non-null   float64
 11  log_1stFlrSF       1460 non-null   float64
 12  log_2ndFlrSF       1460 non-null   float64
 13  log_LowQualFinSF   1460 non-null   float64
 14  log_GrLivArea      1460 non-null   float64
 15  log_BsmtFullBath   1460 non-null   float64
 16  log_BsmtHalfBath   1460 

There are no null values, so the dataset is ready to use.

## Data split

In [16]:
# Feature matrix: every column except the regression target.
X = df.drop(columns=['log_SalePrice'])

In [17]:
# Target vector: the `log_SalePrice` column.
y = df['log_SalePrice']

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2811,
)

## Training the model

In [19]:
# Add an explicit intercept column: sm.OLS does not include one on its own.
X_train_com_constante = sm.add_constant(X_train)

In [20]:
# Ordinary least squares on the training split; `hasconst=True` tells statsmodels
# that the design matrix already carries the intercept column.
ols_model = sm.OLS(endog=y_train, exog=X_train_com_constante, hasconst=True)
modelo_estatsmodels = ols_model.fit()

In [21]:
# Confirm what `.fit()` returned (a statsmodels regression results wrapper).
type(modelo_estatsmodels)

statsmodels.regression.linear_model.RegressionResultsWrapper

In [40]:
# Pull the coefficient table (the second table of the summary) into a DataFrame
# so the per-feature estimates and p-values can be inspected.
coef_table = modelo_estatsmodels.summary().tables[1]
ms = pd.DataFrame(coef_table)
ms

Unnamed: 0,0,1,2,3,4,5,6
0,,coef,std err,t,P>|t|,[0.025,0.975]
1,const,31.8045,50.866,0.625,0.532,-67.997,131.606
2,log_MSSubClass,-0.0235,0.009,-2.579,0.010,-0.041,-0.006
3,log_LotArea,0.0864,0.011,7.908,0.000,0.065,0.108
4,log_OverallQual,0.3542,0.030,11.623,0.000,0.294,0.414
5,log_OverallCond,0.2245,0.026,8.520,0.000,0.173,0.276
6,log_YearBuilt,5.1855,0.660,7.860,0.000,3.891,6.480
7,log_YearRemodAdd,2.3076,0.626,3.684,0.000,1.078,3.537
8,log_MasVnrArea,-0.0006,0.002,-0.308,0.758,-0.004,0.003
9,log_BsmtFinSF1,0.0089,0.002,4.287,0.000,0.005,0.013


In [None]:
# Predictors removed before refitting the model.
# NOTE(review): presumably selected from the p-values of the first fit — confirm.
columns_to_drop = [
    'log_MasVnrArea',
    'log_BsmtUnfSF',
    'log_1stFlrSF',
    'log_BsmtHalfBath',
    'log_GarageYrBlt',
    'log_OpenPorchSF',
    'log_3SsnPorch',
    'log_MiscVal',
    'log_MoSold',
    'log_EnclosedPorch',
    'log_FullBath',
    'log_HalfBath',
]

# Rebuild X from `df` instead of from the previous X. The result is the same
# feature set, but the cell is now idempotent: re-running it no longer raises
# KeyError because the columns were already removed from X.
X = df.drop(columns=['log_SalePrice', *columns_to_drop])


In [None]:
# Check which feature columns remain after the drop.
X.info()

In [None]:
# Redo the split on the reduced feature set, keeping the same 20% hold-out and seed
# so the rows in each partition match the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2811,
)

In [None]:
# Add the intercept column to the reduced training matrix (sm.OLS does not add one itself).
X_train_com_constante = sm.add_constant(X_train)

In [None]:
# Refit OLS on the reduced design matrix; `hasconst=True` declares that the
# intercept column is already present.
ols_model = sm.OLS(endog=y_train, exog=X_train_com_constante, hasconst=True)
modelo_estatsmodels = ols_model.fit()

In [None]:
# Full regression report for the refitted model.
modelo_estatsmodels.summary()

## Graphical analysis of the results

In [None]:
# In-sample predictions: the model applied to the training design matrix it was fitted on.
y_previsto_train = modelo_estatsmodels.predict(X_train_com_constante)

In [None]:
# Predicted vs. observed target on the training split; points close to the
# diagonal indicate a good fit.
ax = sns.scatterplot(x=y_previsto_train, y=y_train)
ax.figure.set_size_inches(12, 6)
ax.set_title('Previsão X Real', fontsize=18)
ax.set_xlabel('log do Preço - Previsão', fontsize=14)
ax.set_ylabel('log do Preço - Real', fontsize=14)
# Leave the Axes as the last expression so the cell displays it.
ax

## Residual values

In [None]:
# Training residuals: observed target minus in-sample prediction.
residuo = y_train - y_previsto_train

In [None]:
# Distribution of the training residuals.
# `sns.distplot` is deprecated (since seaborn 0.11) and slated for removal;
# `histplot` with a KDE overlay and density normalisation draws the equivalent figure.
ax = sns.histplot(residuo, kde=True, stat='density')
ax.figure.set_size_inches(10, 6)
ax.set_title('Distribuição de Frequências dos Resíduos', fontsize=18)
ax.set_xlabel('log do Preço', fontsize=14)
# Leave the Axes as the last expression so the cell displays it.
ax

In [None]:
# RMSE on the held-out split. The original compared `y_test` with predictions made
# for the training rows, which have a different length and belong to other samples.
# The model was fitted with an intercept column, so the test matrix needs one too;
# `has_constant='add'` forces it even if some test feature happens to be constant
# (the default 'skip' would then omit the intercept and misalign the columns).
X_test_com_constante = sm.add_constant(X_test, has_constant='add')
y_previsto_test = modelo_estatsmodels.predict(X_test_com_constante)
print(f"RMSE: {mse(y_test, y_previsto_test)**.5:.2f}")