NIJMI BAJRACHARYA     ID:2508912    GROUP:11

In [22]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [32]:
# Task 1: load the housing data, fill missing values, one-hot encode, and split.

# Load dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
)

# Handle missing values: fill NaNs in total_bedrooms with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df["total_bedrooms"]: the chained in-place form
# operates on an intermediate object, emits a FutureWarning, and will no longer
# modify df in pandas 3.0.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split below, so test rows contribute to the imputed value.
df["total_bedrooms"] = df["total_bedrooms"].fillna(df["total_bedrooms"].mean())
assert df["total_bedrooms"].notna().all(), "total_bedrooms still contains missing values"

# One-hot encode categorical column(s); drop_first=True drops the first level
# of each encoded column.
df = pd.get_dummies(df, drop_first=True)

# Split features and target
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"]

# Train-test split (80/20) with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Dataset cleaned, encoded, and split successfully")



Dataset cleaned, encoded, and split successfully


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(df["total_bedrooms"].mean(), inplace=True)


In [24]:
# Task 2, step 1: baseline linear regression (no regularisation).
# fit() returns the estimator itself, so construction and training are chained.
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on both partitions in one pass (train first, then test).
y_train_pred, y_test_pred = (
    lin_reg.predict(features) for features in (X_train, X_test)
)

# Mean squared error on each partition.
train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)

# Report the errors and how many coefficients the model learned.
print("Baseline Linear Regression")
print("Train MSE:", train_mse)
print("Test MSE:", test_mse)
print("Number of Coefficients:", len(lin_reg.coef_))



Baseline Linear Regression
Train MSE: 4683203783.504253
Test MSE: 4904399775.949265
Number of Coefficients: 12


In [25]:
# Task 2, step 2: Ridge regression with alpha chosen by 5-fold grid search.
ridge = Ridge()

# Candidate regularisation strengths.
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Candidates are ranked by negated MSE (higher is better).
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_cv.fit(X_train, y_train)

# Best estimator found by the search, evaluated on the held-out test set.
best_ridge = ridge_cv.best_estimator_
ridge_test_mse = mean_squared_error(y_test, best_ridge.predict(X_test))

print("Best Ridge Alpha:", ridge_cv.best_params_)
print("Ridge Test MSE:", ridge_test_mse)



Best Ridge Alpha: {'alpha': 1}
Ridge Test MSE: 4905952780.84936


In [26]:
# Task 2, step 2 (continued): Lasso regression with alpha chosen by 5-fold grid search.
# max_iter is set to 10000 for the Lasso solver.
lasso = Lasso(max_iter=10000)

# Candidate regularisation strengths (same grid as used for Ridge).
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Candidates are ranked by negated MSE (higher is better).
lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_params,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_cv.fit(X_train, y_train)

# Best estimator found by the search, evaluated on the held-out test set.
best_lasso = lasso_cv.best_estimator_
lasso_test_mse = mean_squared_error(y_test, best_lasso.predict(X_test))

print("Best Lasso Alpha:", lasso_cv.best_params_)
print("Lasso Test MSE:", lasso_test_mse)


Best Lasso Alpha: {'alpha': 0.01}
Lasso Test MSE: 4904401711.447271


In [27]:
# Task 2, step 3: coefficient comparison.
# Count how many coefficients each tuned model keeps at a non-zero value.
print("Ridge non-zero coefficients:", np.count_nonzero(best_ridge.coef_))
print("Lasso non-zero coefficients:", np.count_nonzero(best_lasso.coef_))


Ridge non-zero coefficients: 12
Lasso non-zero coefficients: 12


In [28]:
# Task 3: load the breast cancer dataset and split it 80/20.
# load_breast_cancer is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# NOTE: this rebinds X, y and the train/test variables that previously held the
# housing data; the cells below operate on the breast cancer split.
X, y = load_breast_cancer(return_X_y=True)

# Fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Breast cancer dataset ready")


Breast cancer dataset ready


In [29]:
# Task 4, step 1: baseline logistic regression with default regularisation settings.
# max_iter is raised to 5000; fit() returns the estimator, so the calls are chained.
log_reg = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
train_acc = accuracy_score(y_true=y_train, y_pred=log_reg.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=log_reg.predict(X_test))

print("Baseline Logistic Regression")
print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

Baseline Logistic Regression
Train Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193


In [30]:
# Task 4, step 2: logistic regression tuned by 5-fold grid search.
# The grid covers the regularisation strength C and both the l1 and l2
# penalties, using the liblinear solver for every candidate.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Candidates are ranked by classification accuracy.
log_cv = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
log_cv.fit(X_train, y_train)

# Best estimator found by the search.
best_log = log_cv.best_estimator_

print("Best Parameters:", log_cv.best_params_)


Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [31]:
# Task 4, step 3: final accuracy and coefficient count for the tuned model.
# NOTE: train_acc / test_acc are reassigned here, replacing the baseline values.
train_acc = accuracy_score(y_true=y_train, y_pred=best_log.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_log.predict(X_test))

print("Optimized Logistic Regression")
print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

# Number of coefficients the tuned model keeps at a non-zero value.
print("Non-zero coefficients:", np.count_nonzero(best_log.coef_))


Optimized Logistic Regression
Train Accuracy: 0.989010989010989
Test Accuracy: 0.9824561403508771
Non-zero coefficients: 21
