HANDS-ON SESSION-II

Maahi Subedi

Ait 664-DL3

REGRESSION MODELS

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np


In [4]:
housingprice=pd.read_csv('/content/ny_housing.csv')
housingprice.dropna(inplace=True)
target = 'PRICE'


In [5]:
# Define categorical and numerical columns
categorical_cols = ['TYPE']
numerical_cols = ['PRICE', 'BEDS', 'BATH',	'PROPERTYSQFT']

# One-hot encode categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = encoder.fit_transform(housingprice[categorical_cols])

# Create a DataFrame from encoded data
encoded_columns = encoder.get_feature_names_out(categorical_cols)
df_encoded = pd.DataFrame(encoded_data, columns=encoded_columns, index=housingprice.index)

# Combine numerical data and encoded categorical data
X = pd.concat([housingprice[numerical_cols], df_encoded], axis=1)
y = housingprice[target]

# Scale the features for better performance of regression models
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
#Split the data into 70% training and 30% test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [7]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_model.predict(X_test)

# Evaluate the Linear Regression model
print("Linear Regression Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("Linear Regression R^2 Score:", r2_score(y_test, y_pred_linear))


Linear Regression Mean Squared Error: 1.0107911815883683e-13
Linear Regression R^2 Score: 1.0


In [8]:
# Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest Regressor
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))


Random Forest Regressor Mean Squared Error: 280915666440.554
Random Forest Regressor R^2 Score: 0.9857835269930872


In [9]:
# Create a RandomForestRegressor with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=100,          # Number of trees
    max_depth=10,              # Limit the depth of each tree
    min_samples_split=5,       # Minimum samples required to split an internal node
    min_samples_leaf=4,        # Minimum samples required to be at a leaf node
    max_features='sqrt',       # Use the square root of the total features at each split
    random_state=42
)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))


Random Forest Regressor Mean Squared Error: 58727360328370.586
Random Forest Regressor R^2 Score: 0.9857835269930872


In [10]:
# Assuming X and y are your features and target variable
# Define the model with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=300,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42
)

# Set up k-fold cross-validation (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and calculate MSE for each fold
scores = cross_val_score(
    regressor, X, y, cv=kf, scoring=make_scorer(mean_squared_error)
)

# Calculate the mean and standard deviation of the MSE scores
mean_mse = np.mean(scores)
std_mse = np.std(scores)

print(f"Mean MSE from cross-validation: {mean_mse:.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")

Mean MSE from cross-validation: 972835892033484.25
R^2 Score: -1.97


In [11]:
#define the gradient boosting model with different parameters
gbm_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the Gradient Boosting Regressor
print("Gradient Boosting Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_gbm))
print("Gradient Boosting Regressor R^2 Score:", r2_score(y_test, y_pred_gbm))

Gradient Boosting Regressor Mean Squared Error: 92259062914.92688
Gradient Boosting Regressor R^2 Score: 0.9953309884984621


In [12]:
# L2 Regularization: Ridge Regression
ridge = Ridge(alpha=1.0)  # alpha controls the regularization strength; higher means more regularization
ridge.fit(X_train, y_train)
ridge_predictions = ridge.predict(X_test)
print("Ridge MSE:", mean_squared_error(y_test, ridge_predictions))
print("Ridge R^2 Score:", r2_score(y_test, ridge_predictions))

# L1 Regularization: Lasso Regression
lasso = Lasso(alpha=0.1)  # alpha is the regularization parameter
lasso.fit(X_train, y_train)
lasso_predictions = lasso.predict(X_test)
print("Lasso MSE:", mean_squared_error(y_test, lasso_predictions))
print("Lasso R^2 Score:", r2_score(y_test, lasso_predictions))

# L1 + L2 Regularization: Elastic Net
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # l1_ratio balances between L1 and L2 (0 = pure L2, 1 = pure L1)
elastic_net.fit(X_train, y_train)
elastic_net_predictions = elastic_net.predict(X_test)
print("Elastic Net MSE:", mean_squared_error(y_test, elastic_net_predictions))
print("Elastic Net R^2 Score:", r2_score(y_test, elastic_net_predictions))


Ridge MSE: 1534052.183138152
Ridge R^2 Score: 0.9999999223652717
Lasso MSE: 0.00010241741407229526
Lasso R^2 Score: 1.0
Elastic Net MSE: 36223361872.125725
Elastic Net R^2 Score: 0.9981668219049518


**Conclusion: **

Linear Regression: 1.0

Random Forest Regressor: 0.99

Random Forest Regressor w/ regularization: 0.99

Gradient Boosting Regressor: 0.99

Ridge: 0.99

Lasso: 1.0

Elastic: 0.99

In Regression model, Linear regression and Lasso regression are the best model among others.