# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# **Data Collection**

* **Read Dataset**

In [2]:
df = pd.read_csv('AmesHousing.csv')
print("Shape:", df.shape)
print("Columns:", list(df.columns))

Shape: (2930, 82)
Columns: ['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garage Cond',

# **Data Analysis**

* **Check Null**

In [3]:
df.isna().sum().sum()

15749

* **Remove columns if 50% is NULL**

In [4]:
threshold = 0.5
cols_to_drop = [col for col in df.columns if df[col].isna().mean() > threshold]
df = df.drop(columns=cols_to_drop)
print("Shape:", df.shape)

Shape: (2930, 77)


* **Fill NULL numarical value with median**

In [5]:
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

* **Fill NULL object value with Unknown**

In [6]:
cat_cols = df.select_dtypes(include=["object"]).columns
df[cat_cols] = df[cat_cols].fillna("Unknown")

* **Check NULL value**

In [7]:
df.isna().sum().sum()

0

* **Check duplicate value**

In [8]:
df.duplicated().sum()

0

# **Feature Engineering**

* **Apply one hot encoder on objects columns**

In [9]:
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df.shape

(2930, 260)

* **Split Data to X, and Y**

In [10]:
target = df['SalePrice']
X = df.drop(columns=["SalePrice"])

* **Split Data to train, and test**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (2344, 259)
Test shape: (586, 259)


* **Apply Scaling to train, and test**

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled  = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# **Modeling**

#### **Multiple Linear Regression**
* **Models the relationship between multiple independent variables and one dependent variable.**

In [13]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("ðŸ“˜ Linear Regression:")
print(f"RÂ² = {r2_lr:.4f}")
print(f"RMSE = {rmse_lr:.2f}\n")

ðŸ“˜ Linear Regression:
RÂ² = 0.8941
RMSE = 29141.76



#### **Ridge Regression (L2)**
* **Adds a penalty proportional to the square of the coefficients. Shrinks large coefficients but doesnâ€™t make them zero.**

In [14]:
ridge = Ridge(alpha=10)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)

r2_ridge = r2_score(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))

print("ðŸ“— Ridge Regression:")
print(f"RÂ² = {r2_ridge:.4f}")
print(f"RMSE = {rmse_ridge:.2f}\n")


ðŸ“— Ridge Regression:
RÂ² = 0.8945
RMSE = 29084.80



#### **Lasso Regression (L1)**
* **Adds a penalty proportional to the absolute value of the coefficients. Can shrink some coefficients to zero.**

In [15]:
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)

r2_lasso = r2_score(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

print("ðŸ“™ Lasso Regression:")
print(f"RÂ² = {r2_lasso:.4f}")
print(f"RMSE = {rmse_lasso:.2f}\n")


ðŸ“™ Lasso Regression:
RÂ² = 0.8948
RMSE = 29039.90



  model = cd_fast.enet_coordinate_descent(


#### **Elastic Net Regression**
* **Combines L1 (Lasso) and L2 (Ridge) penalties for balance.**

In [16]:
elastic = ElasticNet(alpha=0.001, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_test_scaled)

r2_elastic = r2_score(y_test, y_pred_elastic)
rmse_elastic = np.sqrt(mean_squared_error(y_test, y_pred_elastic))

print("ðŸ“• ElasticNet Regression:")
print(f"RÂ² = {r2_elastic:.4f}")
print(f"RMSE = {rmse_elastic:.2f}\n")

ðŸ“• ElasticNet Regression:
RÂ² = 0.8949
RMSE = 29034.49



  model = cd_fast.enet_coordinate_descent(


#### **Result**
* **Performance Comparison**

In [17]:
results = pd.DataFrame({
    "Model": ["Linear", "Ridge", "Lasso", "ElasticNet"],
    "RÂ² Score": [r2_lr, r2_ridge, r2_lasso, r2_elastic],
    "RMSE": [rmse_lr, rmse_ridge, rmse_lasso, rmse_elastic]
})

print("ðŸ“Š Performance Comparison:")
print(results.sort_values(by="RÂ² Score", ascending=False))


ðŸ“Š Performance Comparison:
        Model  RÂ² Score          RMSE
3  ElasticNet  0.894855  29034.485540
2       Lasso  0.894816  29039.898204
1       Ridge  0.894491  29084.800397
0      Linear  0.894077  29141.760392
