In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.datasets import fetch_california_housing
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the California Housing dataset
housing = fetch_california_housing()
data = pd.DataFrame(housing.data, columns=housing.feature_names)
data['PRICE'] = housing.target

# Data preprocessing
def preprocess_data(df):
    # Handle missing values
    df = df.fillna(df.mean())
    
    # Feature scaling
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df.drop('PRICE', axis=1))
    
    return pd.DataFrame(features_scaled, columns=df.drop('PRICE', axis=1).columns), df['PRICE']

# Prepare data for both regression types
X, y = preprocess_data(data)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert continuous predictions to binary for comparison
def get_binary_metrics(y_true, y_pred):
    median = np.median(y_true)
    y_true_binary = (y_true > median).astype(int)
    y_pred_binary = (y_pred > median).astype(int)
    
    accuracy = accuracy_score(y_true_binary, y_pred_binary)
    f1 = f1_score(y_true_binary, y_pred_binary)
    
    return accuracy, f1

# Linear Regression
def train_linear_regression(X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate regression metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    # Calculate classification metrics
    accuracy, f1 = get_binary_metrics(y_test, y_pred)
    
    return {
        'R2': round(r2, 3),
        'MSE': round(mse, 2),
        'Accuracy': round(accuracy, 3),
        'F1': round(f1, 3)
    }

# Logistic Regression
def train_logistic_regression(X_train, X_test, y_train, y_test):
    # Convert to binary classification
    median = np.median(y_train)
    y_train_binary = (y_train > median).astype(int)
    y_test_binary = (y_test > median).astype(int)
    
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train_binary)
    
    # Get predicted probabilities for regression metrics
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    
    # Calculate regression metrics using probabilities
    r2 = r2_score(y_test_binary, y_pred_proba)
    mse = mean_squared_error(y_test_binary, y_pred_proba)
    
    # Calculate classification metrics
    accuracy = accuracy_score(y_test_binary, y_pred)
    f1 = f1_score(y_test_binary, y_pred)
    
    return {
        'R2': round(r2, 3),
        'MSE': round(mse, 2),
        'Accuracy': round(accuracy, 3),
        'F1': round(f1, 3)
    }

# Run both models and get results
linear_results = train_linear_regression(X_train, X_test, y_train, y_test)
logistic_results = train_logistic_regression(X_train, X_test, y_train, y_test)

# Print results
print("Linear Regression Results:")
print(f"R² Score: {linear_results['R2']}")
print(f"Mean Squared Error: {linear_results['MSE']}")
print(f"Accuracy: {linear_results['Accuracy']}")
print(f"F1-Score: {linear_results['F1']}")

print("\nLogistic Regression Results:")
print(f"R² Score: {logistic_results['R2']}")
print(f"Mean Squared Error: {logistic_results['MSE']}")
print(f"Accuracy: {logistic_results['Accuracy']}")
print(f"F1-Score: {logistic_results['F1']}")

# Feature importance for both models
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, (y_train > np.median(y_train)).astype(int))

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Linear_Coefficient': lr_model.coef_,
    'Logistic_Coefficient': log_model.coef_[0]
})
print("\nFeature Importance Comparison:")
print(feature_importance.sort_values(by='Linear_Coefficient', key=abs, ascending=False))

Linear Regression Results:
R² Score: 0.576
Mean Squared Error: 0.56
Accuracy: 0.793
F1-Score: 0.814

Logistic Regression Results:
R² Score: 0.513
Mean Squared Error: 0.12
Accuracy: 0.826
F1-Score: 0.825

Feature Importance Comparison:
      Feature  Linear_Coefficient  Logistic_Coefficient
6    Latitude           -0.896635             -3.594444
7   Longitude           -0.868927             -3.393665
0      MedInc            0.852382              2.578740
3   AveBedrms            0.371132              1.053286
2    AveRooms           -0.305116             -0.892185
1    HouseAge            0.122382              0.266601
5    AveOccup           -0.036624             -2.340539
4  Population           -0.002298              0.058806
