In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import mean_squared_error

In [None]:
# Load data
# Directory that holds the input CSVs. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/content'

train = pd.read_csv(f'{DATA_DIR}/train_new.csv')
test = pd.read_csv(f'{DATA_DIR}/test_new.csv')

# Preview the first rows of the training frame
train.head()

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,...,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,...,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,...,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,...,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,...,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,...,2,1,3,7,1374,Typ,0,0,2009,WD


In [None]:
# Per-column count of missing values in the training frame
train.isnull().sum()

Unnamed: 0,0
SalePrice,0
PID,0
Lot Frontage,362
Lot Area,0
Street,0
Neighborhood,0
Bldg Type,0
House Style,0
Overall Qual,0
Overall Cond,0


In [None]:
# Fill missing values for Lot Frontage with the median.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `train[col]`: that chained form operates on
# an intermediate object, triggers a pandas FutureWarning, and will no longer
# modify `train` once pandas 3.0 makes such intermediates behave as copies.
lot_frontage_median_train = train['Lot Frontage'].median()

train['Lot Frontage'] = train['Lot Frontage'].fillna(lot_frontage_median_train)

# Fill missing values for Electrical with the mode
# (`mode()` returns a Series; element 0 is taken as the fill value)
electrical_mode = train['Electrical'].mode()[0]

train['Electrical'] = train['Electrical'].fillna(electrical_mode)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Lot Frontage'].fillna(lot_frontage_median_train, inplace=True)


In [None]:
# Target is modelled on the log(1 + SalePrice) scale; the feature frame is
# everything in `train` except the target and the PID column.
y = np.log1p(train["SalePrice"])
X = train.drop(columns=["SalePrice", "PID"])

# Hold out 20% of the rows for validation (seeded, so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column-wise preprocessing, reused by the pipelines built in later cells:
#   - object-dtype columns are one-hot encoded; categories not seen during
#     fit are ignored at transform time
#   - numeric columns are standardized
#   - any column matched by neither selector is passed through unchanged
dummify_step = (
    "dummify",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    make_column_selector(dtype_include=object),
)
standardize_step = (
    "standardize",
    StandardScaler(),
    make_column_selector(dtype_include="number"),
)
ct = ColumnTransformer(
    transformers=[dummify_step, standardize_step],
    remainder="passthrough",
)

# Preprocessing + ordinary least squares, fitted on the training split
linear_pipeline = Pipeline(
    steps=[("preprocessor", ct), ("model", LinearRegression())]
).fit(X_train, y_train)

# RMSE on the held-out rows, measured on the log scale of the target
y_val_pred_linear = linear_pipeline.predict(X_val)
rmse_val_linear = np.sqrt(mean_squared_error(y_val, y_val_pred_linear))

# NOTE(review): the recorded output for this cell is ~3.8e9, which is
# implausible for a log-scale target. Presumably the unregularized fit is
# numerically unstable on the one-hot columns -- confirm before relying on it.
rmse_val_linear

3798518731.2440386

In [None]:
# Ridge
# Candidate regularization strengths; the "model__" prefix routes the
# parameter to the pipeline step named "model"
param_grid = {"model__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}

# Shared preprocessing followed by a Ridge regressor
ridge_pipeline = Pipeline(steps=[("preprocessor", ct), ("model", Ridge())])

# 5-fold cross-validated search over the grid, run on the training split
ridge_grid_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)

# The scorer reports negated RMSE, so the sign is flipped back for display
ridge_best_score = -ridge_grid_search.best_score_
ridge_best_params = ridge_grid_search.best_params_

ridge_best_params, ridge_best_score

({'model__alpha': 1.0}, 0.1507677861968424)

In [None]:
# Lasso
# Candidate regularization strengths; the "model__" prefix routes the
# parameter to the pipeline step named "model"
lasso_param_grid = {"model__alpha": [0.001, 0.01, 0.1, 1.0, 10.0]}

# Shared preprocessing followed by a Lasso regressor
lasso_pipeline = Pipeline(steps=[("preprocessor", ct), ("model", Lasso())])

# 5-fold cross-validated search over the grid, run on the training split
lasso_grid_search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=lasso_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)

# The scorer reports negated RMSE, so the sign is flipped back for display
lasso_best_score = -lasso_grid_search.best_score_
lasso_best_params = lasso_grid_search.best_params_

lasso_best_params, lasso_best_score

({'model__alpha': 0.001}, 0.15450534310126057)

In [None]:
# Elastic Net
# Search space: overall penalty strength (alpha) crossed with the L1/L2
# mixing weight (l1_ratio); both are routed to the "model" pipeline step
elastic_net_param_grid = dict(
    model__alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
    model__l1_ratio=[0.1, 0.5, 0.9],
)

# Shared preprocessing followed by an Elastic Net regressor
elastic_net_pipeline = Pipeline(
    steps=[("preprocessor", ct), ("model", ElasticNet())]
)

# 5-fold cross-validated search over the grid, run on the training split
elastic_net_grid_search = GridSearchCV(
    estimator=elastic_net_pipeline,
    param_grid=elastic_net_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)

# The scorer reports negated RMSE, so the sign is flipped back for display
elastic_net_best_score = -elastic_net_grid_search.best_score_
elastic_net_best_params = elastic_net_grid_search.best_params_

elastic_net_best_params, elastic_net_best_score

({'model__alpha': 0.001, 'model__l1_ratio': 0.1}, 0.1505914009813004)

In [None]:
# Ridge best model based on RMSE

# Create a pipeline with Ridge regression.
# alpha is read from the grid-search result rather than copied by hand, so
# the final model cannot silently drift from what the search selected.
ridge_pipeline = Pipeline([
    ("preprocessor", ct),
    ("model", Ridge(alpha=ridge_best_params["model__alpha"]))
])

# Train the Ridge model on the entire training data
ridge_pipeline.fit(X, y)

# Predict SalePrice for the test set.
# This cell writes a "SalePrice" column onto `test` further down, so on a
# re-run that column is already present; it is dropped here (if it exists)
# to keep the feature frame free of earlier predictions.
test_features = (
    test.drop(columns=["PID"])
        .drop(columns=["SalePrice"], errors="ignore")
)
test_predictions = ridge_pipeline.predict(test_features)

# Transform predictions back from log scale (inverse of the log1p target)
test["SalePrice"] = np.expm1(test_predictions)

# Create the submission file
submission1 = test[["PID", "SalePrice"]]
submission1.to_csv('submission1.csv', index=False)

submission1.head()

Unnamed: 0,PID,SalePrice
0,907135180,129655.297733
1,528181040,220741.862793
2,528175010,220258.530313
3,531379030,185197.734697
4,923275090,129989.50872


In [None]:
# Elastic Net best based on RMSE
# Hyper-parameters are taken from the grid-search result. The previous
# hand-typed l1_ratio=0.01 did not match the selected value and was not even
# one of the candidates in the searched grid.
elastic_net_best_model = ElasticNet(
    alpha=elastic_net_best_params["model__alpha"],
    l1_ratio=elastic_net_best_params["model__l1_ratio"],
)

# Pipeline
elastic_net_pipeline_final = Pipeline([
    ("preprocessor", ct),
    ("model", elastic_net_best_model)
])

# Train the Elastic Net model on the entire training data
elastic_net_pipeline_final.fit(X, y)

# Predict SalePrice for the test set.
# The feature frame is rebuilt here so this cell does not depend on a
# variable left behind by another cell; a "SalePrice" column already written
# onto `test` is dropped (if present) so earlier predictions are not features.
test_features = (
    test.drop(columns=["PID"])
        .drop(columns=["SalePrice"], errors="ignore")
)
test_predictions = elastic_net_pipeline_final.predict(test_features)

# Transform predictions back from the log scale (inverse of the log1p target)
test["SalePrice"] = np.expm1(test_predictions)

# Create the submission file
submission2 = test[["PID", "SalePrice"]]
submission2.to_csv('submission2.csv', index=False)

submission2.head()

Unnamed: 0,PID,SalePrice
0,907135180,128850.526628
1,528181040,220481.190172
2,528175010,219893.020493
3,531379030,185285.123097
4,923275090,129745.333837
