In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
train_processed_path = r'C:\Users\cardo\Projeto House Prices\data\processed\train_processed.csv'
test_processed_path = r'C:\Users\cardo\Projeto House Prices\data\processed\test_processed.csv'

In [9]:
df_train_processed = pd.read_csv(train_processed_path)
df_test_processed = pd.read_csv(test_processed_path)

In [10]:
X = df_train_processed.drop('SalePrice', axis=1)
y = df_train_processed['SalePrice']

In [23]:
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_normalized = scaler_X.fit_transform(X)
y_normalized = scaler_y.fit_transform(y.values.reshape(-1, 1))
y_normalized = y_normalized.ravel()

# dividing data in test and train

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_normalized, test_size=0.2, random_state=42)

### Finding best alpha value

In [33]:
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 0.0001, 0.00001, 0.00001]}

lasso = Lasso()

grid_search = GridSearchCV(lasso, param_grid, cv=5)
grid_search.fit(X_normalized, y_normalized)

print("Best alpha Value: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best alpha Value:  {'alpha': 0.0001}
Best Score:  0.8119032143416108


In [34]:
lasso = Lasso(alpha=0.0001)

# training the model

lasso.fit(X_train, y_train)

# predictions

y_pred = lasso.predict(X_test)

# model performance

mse = mean_squared_error(y_test, y_pred)
print('Mean Square Error:', mse)
rmse = np.sqrt(mse)
print('Root Mean Square Error:', rmse)

Mean Square Error: 0.0016329492243360442
Root Mean Square Error: 0.040409766447432535


In [39]:
X_test_normalized = scaler_X.transform(df_test_processed.drop('Id',axis = 1))

In [41]:
predictions_normalized = lasso.predict(X_test_normalized)
predictions_denormalized = scaler_y.inverse_transform(predictions_normalized.reshape(-1, 1))

In [42]:
df_results = pd.DataFrame({'Id': df_test_processed['Id'], 'SalePrice': predictions_denormalized.flatten()})

In [45]:
df_results.to_csv(r'C:\\Users\cardo\Projeto House Prices\data\final\lasso_model.csv', index = False)

In [44]:
df_results

Unnamed: 0,Id,SalePrice
0,1461,104864.564607
1,1462,160909.052812
2,1463,173493.179092
3,1464,191085.723794
4,1465,194595.894494
...,...,...
1454,2915,67023.044350
1455,2916,71573.066811
1456,2917,156997.791803
1457,2918,124799.952523


In [38]:
# Saving model

import joblib

joblib.dump(lasso, r'C:\\Users\cardo\Projeto House Prices\models\lasso_model.pkl')

['C:\\\\Users\\cardo\\Projeto House Prices\\models\\lasso_model.pkl']