In [2]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np

In [3]:
train_processed_path = r'C:\Users\cardo\Projeto-House-Prices\data\processed\train_processed.csv'
test_processed_path = r'C:\Users\cardo\Projeto-House-Prices\data\processed\test_processed.csv'

In [4]:
df_train_processed = pd.read_csv(train_processed_path)
df_test_processed = pd.read_csv(test_processed_path)

In [5]:
X = df_train_processed.drop('SalePrice', axis=1)
y = df_train_processed['SalePrice']

In [6]:
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_normalized = scaler_X.fit_transform(X)
y_normalized = scaler_y.fit_transform(y.values.reshape(-1, 1))
y_normalized = y_normalized.ravel()

# dividing data in test and train

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

### Finding best alpha value

In [7]:
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 0.0001, 0.00001, 0.00001]}

lasso = Lasso()

grid_search = GridSearchCV(lasso, param_grid, cv=5)
grid_search.fit(X_normalized, y_normalized)

print("Best alpha Value: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best alpha Value:  {'alpha': 0.0001}
Best Score:  0.8123867799096368


In [8]:
model = Lasso(alpha=0.0001)

# training the model

model.fit(X_train, y_train)

# predictions

y_pred = model.predict(X_test)

# model performance

mse = mean_squared_error(y_test, y_pred)
print('Mean Square Error:', mse)
rmse = np.sqrt(mse)
print('Root Mean Square Error:', rmse)

Mean Square Error: 0.0016955980683514061
Root Mean Square Error: 0.04117764039319648


In [9]:
X_test_normalized = scaler_X.transform(df_test_processed.drop('Id',axis = 1))

In [11]:
predictions_normalized = model.predict(X_test_normalized)
predictions_denormalized = scaler_y.inverse_transform(predictions_normalized.reshape(-1, 1))

In [12]:
df_results = pd.DataFrame({'Id': df_test_processed['Id'], 'SalePrice': predictions_denormalized.flatten()})

In [13]:
df_results.to_csv(r'C:\\Users\cardo\Projeto-House-Prices\data\final\lasso_model.csv', index = False)

In [14]:
df_results

Unnamed: 0,Id,SalePrice
0,1461,97228.228875
1,1462,158328.012577
2,1463,173925.975044
3,1464,187063.830509
4,1465,196778.358102
...,...,...
1454,2915,64917.342067
1455,2916,69394.818675
1456,2917,151742.626423
1457,2918,126845.245150


In [15]:
# Saving model

import joblib

joblib.dump(lasso, r'C:\\Users\cardo\Projeto-House-Prices\models\lasso_model.pkl')

['C:\\\\Users\\cardo\\Projeto-House-Prices\\models\\lasso_model.pkl']