In [1]:
# Import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

def _synthetic_house_data(n_samples=5000, seed=42):
    """Build a synthetic stand-in for the USA Housing table.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator.

    Returns:
        A DataFrame with five feature columns and a 'Price' column that is a
        noisy linear combination of those features.
    """
    # A private RandomState yields the same draws as seeding the global
    # generator with ``np.random.seed(seed)``, but leaves the caller's global
    # random state untouched.
    rng = np.random.RandomState(seed)

    # Generate synthetic features
    avg_area_income = rng.normal(68000, 10000, n_samples)
    avg_area_house_age = rng.normal(5.0, 1.0, n_samples)
    avg_area_number_of_rooms = rng.normal(6.5, 1.0, n_samples)
    avg_area_number_of_bedrooms = rng.normal(1.2, 0.3, n_samples)
    area_population = rng.normal(36000, 5000, n_samples)

    # Generate synthetic price based on features plus Gaussian noise
    price = (avg_area_income * 0.5 +
             avg_area_number_of_rooms * 50000 +
             avg_area_number_of_bedrooms * 25000 -
             avg_area_house_age * 1000 +
             area_population * 0.1 +
             rng.normal(0, 50000, n_samples))

    return pd.DataFrame({
        'Avg. Area Income': avg_area_income,
        'Avg. Area House Age': avg_area_house_age,
        'Avg. Area Number of Rooms': avg_area_number_of_rooms,
        'Avg. Area Number of Bedrooms': avg_area_number_of_bedrooms,
        'Area Population': area_population,
        'Price': price,
    })


def load_house_data():
    """Load the housing dataset, falling back to substitutes when unavailable.

    Sources are tried in order:
      1. 'USA_Housing.csv' in the current working directory.
      2. A public housing CSV, whose 'median_house_value' column is renamed
         to 'Price'.
      3. A reproducible synthetic dataset.

    The shape and column names of the chosen table are printed.

    Returns:
        The loaded (or generated) DataFrame.
    """
    try:
        # Try to load from local file first
        df = pd.read_csv('USA_Housing.csv')
    except FileNotFoundError:
        try:
            # Alternative: load from a public URL (may need updating)
            url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
            df = pd.read_csv(url)
            # Rename the target so callers find it under the expected name
            if 'median_house_value' in df.columns:
                df = df.rename(columns={'median_house_value': 'Price'})
            # The callers scale every non-target column and solve least
            # squares on it, so text columns and rows with gaps must not
            # reach them.
            # NOTE(review): this substitute table presumably carries a text
            # column and some missing values - confirm against the URL.
            df = (df.select_dtypes(include=[np.number])
                    .dropna()
                    .reset_index(drop=True))
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit still propagate while any download/parse failure
            # falls through to synthetic data.
            print("Creating synthetic housing data for demonstration...")
            df = _synthetic_house_data()

    print("House Dataset Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    return df

def k_fold_cross_validation(X, y, k=5):
    """Cross-validate an ordinary least-squares fit over k contiguous folds.

    The rows are split, in their given order (no shuffling), into ``k``
    consecutive folds; the last fold absorbs any remainder. For each fold the
    coefficients are fitted on the other folds via the normal equations and
    scored with R² on the held-out fold. Folds whose normal equations are
    singular are reported and skipped.

    Args:
        X: Feature matrix (DataFrame, Series or array-like), one row per sample.
        y: Target values (Series or array-like) with the same number of rows.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.

    Returns:
        A tuple ``(results, best_beta, best_r2)`` where ``results`` is a list
        of ``{'fold', 'beta', 'r2_score'}`` dicts for the successful folds,
        ``best_beta`` is the coefficient vector (intercept first) of the
        best-scoring fold or ``None`` if every fold was skipped, and
        ``best_r2`` is that fold's R² (``-inf`` if none succeeded).

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``k`` is out of range.
    """
    # Work on plain float arrays: positional slicing then needs no index
    # reset, and NumPy inputs are accepted as well as pandas ones.
    features = np.asarray(X, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    target = np.asarray(y, dtype=float)

    n = len(features)
    if len(target) != n:
        raise ValueError(f"X has {n} rows but y has {len(target)}")
    if not 2 <= k <= n:
        raise ValueError(f"k must be between 2 and the number of samples ({n}), got {k}")

    # Add the intercept column once instead of once per fold.
    design = np.column_stack([np.ones(n), features])

    fold_size = n // k
    best_beta = None
    best_r2 = -np.inf
    results = []

    for i in range(k):
        start_idx = i * fold_size
        end_idx = (i + 1) * fold_size if i < k - 1 else n

        held_out = np.zeros(n, dtype=bool)
        held_out[start_idx:end_idx] = True
        X_train, y_train = design[~held_out], target[~held_out]
        X_test, y_test = design[held_out], target[held_out]

        try:
            # Solve (X^T X) β = X^T y directly; this avoids forming the
            # explicit inverse while still raising LinAlgError when the
            # system is singular.
            beta = np.linalg.solve(X_train.T @ X_train, X_train.T @ y_train)
        except np.linalg.LinAlgError:
            print(f"Fold {i+1}: Singular matrix encountered, skipping...")
            continue

        r2 = r2_score(y_test, X_test @ beta)
        results.append({
            'fold': i + 1,
            'beta': beta,
            'r2_score': r2
        })
        print(f"Fold {i+1}: R² = {r2:.4f}")

        if r2 > best_r2:
            best_r2 = r2
            best_beta = beta

    return results, best_beta, best_r2

def gradient_descent(X, y, learning_rate=0.01, iterations=1000, tolerance=1e-6):
    """Fit linear-regression coefficients by batch gradient descent on the MSE.

    An intercept column of ones is prepended to ``X`` and the coefficients
    start at zero. Iteration stops early once the cost changes by less than
    ``tolerance`` between consecutive iterations, or as soon as the cost
    stops being finite (the learning rate is too large).

    Args:
        X: Feature matrix (array-like or DataFrame), one row per sample.
        y: Target values (array-like or Series) with one entry per row.
        learning_rate: Step size applied to the gradient.
        iterations: Maximum number of update steps.
        tolerance: Absolute change in cost below which the run is considered
            converged.

    Returns:
        A 1-D array of coefficients with the intercept first. If the run
        diverged, every entry is NaN.

    Raises:
        ValueError: If ``X`` contains no samples.
    """
    # Convert once so the loop is pure NumPy; pandas operands would otherwise
    # go through index handling on every iteration.
    features = np.asarray(X, dtype=float)
    target = np.asarray(y, dtype=float)

    X_with_intercept = np.column_stack([np.ones(len(features)), features])
    m, n = X_with_intercept.shape
    if m == 0:
        raise ValueError("gradient_descent requires at least one sample")

    beta = np.zeros(n)
    prev_cost = float('inf')

    # Overflow is expected when the step size is too large and is handled
    # explicitly below, so the floating-point warnings are suppressed here.
    with np.errstate(over='ignore', invalid='ignore'):
        for i in range(iterations):
            residual = X_with_intercept @ beta - target
            cost = np.mean(residual ** 2)

            if not np.isfinite(cost):
                # Further updates could only produce inf/NaN; stop now and
                # signal the failure with an all-NaN coefficient vector.
                print(f"  Diverged after {i+1} iterations")
                return np.full(n, np.nan)

            beta -= learning_rate * (2 / m) * (X_with_intercept.T @ residual)

            # Check for convergence
            if abs(prev_cost - cost) < tolerance:
                print(f"  Converged after {i+1} iterations")
                break
            prev_cost = cost

    return beta

def question_1():
    """Question 1: K-Fold Cross Validation for Multiple Linear Regression.

    Loads the housing data, standardises the numeric features, runs the
    k-fold least-squares procedure and then scores the best fold's
    coefficients on the 30% side of a 70/30 split.

    Note that the split is drawn from the same rows that were used for
    cross-validation, so the "Final Test" score is not measured on data
    unseen during fitting.

    Returns:
        ``(results, best_beta)`` from cross-validation, or ``(None, None)``
        when no fold produced coefficients.
    """
    print("=== QUESTION 1: K-FOLD CROSS VALIDATION ===")
    df = load_house_data()

    # Handle different possible column names
    price_col = 'Price' if 'Price' in df.columns else df.columns[-1]
    y = df[price_col]
    # Only numeric columns can be standardised; any text column present in
    # the loaded table would make the scaler raise.
    X = df.drop(price_col, axis=1).select_dtypes(include=[np.number])

    # Scale features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Perform K-fold cross validation
    results, best_beta, best_r2 = k_fold_cross_validation(X_scaled, y)
    print(f"\nBest R² Score from CV: {best_r2:.4f}")

    if best_beta is None:
        print("No valid beta found from cross-validation")
        return None, None

    # Final evaluation on 70-30 split; only the hold-out side is needed
    # because the coefficients come from cross-validation.
    _, X_test, _, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
    X_test_with_intercept = np.column_stack([np.ones(len(X_test)), X_test])

    y_pred_test = X_test_with_intercept @ best_beta
    final_r2 = r2_score(y_test, y_pred_test)
    print(f"Final Test R² Score: {final_r2:.4f}")

    return results, best_beta

def question_2():
    """Question 2: Validation Set Concept with Gradient Descent.

    Loads the housing data, standardises the numeric features, splits the
    rows into train/validation/test parts and fits one gradient-descent
    model per candidate learning rate. The learning rate with the highest
    validation R² is selected; the test R² is printed for each candidate.

    Returns:
        ``(best_coefficients, best_lr)``; both are ``None`` when no learning
        rate produced a usable model.
    """
    print("\n=== QUESTION 2: GRADIENT DESCENT WITH VALIDATION SET ===")
    df = load_house_data()

    # Handle different possible column names
    price_col = 'Price' if 'Price' in df.columns else df.columns[-1]
    y = df[price_col]
    # Only numeric columns can be standardised; any text column present in
    # the loaded table would make the scaler raise.
    X = df.drop(price_col, axis=1).select_dtypes(include=[np.number])

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data: train (56%), validation (14%), test (30%)
    X_train_val, X_test, y_train_val, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

    print(f"Dataset split - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    # These design matrices do not depend on the learning rate, so build
    # them once rather than on every pass through the loop.
    X_val_with_intercept = np.column_stack([np.ones(len(X_val)), X_val])
    X_test_with_intercept = np.column_stack([np.ones(len(X_test)), X_test])

    learning_rates = [0.001, 0.01, 0.1, 1]
    best_lr = None
    best_val_r2 = -np.inf
    best_coefficients = None

    for lr in learning_rates:
        print(f"\nTrying learning rate: {lr}")
        try:
            coefficients = gradient_descent(X_train, y_train, lr, 1000)

            # A step size that is too large leaves NaN/inf coefficients;
            # detect that directly instead of relying on the scorer to raise.
            if not np.all(np.isfinite(coefficients)):
                print(f"Learning rate {lr} diverged, skipping...")
                continue

            # Make predictions
            y_val_pred = X_val_with_intercept @ coefficients
            y_test_pred = X_test_with_intercept @ coefficients

            val_r2 = r2_score(y_val, y_val_pred)
            test_r2 = r2_score(y_test, y_test_pred)

            print(f"Learning Rate: {lr}, Validation R²: {val_r2:.4f}, Test R²: {test_r2:.4f}")

            if val_r2 > best_val_r2:
                best_val_r2 = val_r2
                best_lr = lr
                best_coefficients = coefficients

        except Exception as e:
            # Best effort: one failing candidate must not stop the others.
            print(f"Error with learning rate {lr}: {str(e)}")
            continue

    print(f"\nBest Learning Rate: {best_lr}")
    print(f"Best Validation R²: {best_val_r2:.4f}")

    return best_coefficients, best_lr

def load_car_data():
    """Download the imports-85 automobile table from the UCI repository.

    The file is read with explicit column names, and '?' entries are treated
    as missing values. On success the table's shape is printed.

    Returns:
        The DataFrame, or ``None`` (after printing the error) if it could not
        be loaded.
    """
    source = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
    # Column names, in file order, for the header-less data file.
    header = (
        "symboling normalized_losses make fuel_type aspiration "
        "num_doors body_style drive_wheels engine_location "
        "wheel_base length width height curb_weight "
        "engine_type num_cylinders engine_size fuel_system "
        "bore stroke compression_ratio horsepower peak_rpm "
        "city_mpg highway_mpg price"
    ).split()

    try:
        cars = pd.read_csv(source, names=header, na_values='?')
        print("Car Dataset Shape:", cars.shape)
    except Exception as err:
        print(f"Error loading car dataset: {err}")
        return None
    return cars

def _contains_flag(series, token):
    """Return 1 where the lower-cased text of *series* contains *token*, else 0."""
    text = series.astype(str).str.lower()
    return text.str.contains(token, regex=False, na=False).astype(int)


def preprocess_car_data(df):
    """Clean and numerically encode the car dataset.

    Steps, in order: drop rows without a price; fill numeric gaps with the
    column median and non-numeric gaps with the column mode; turn the word
    counts in 'num_doors' / 'num_cylinders' into numbers; one-hot encode
    'body_style' and 'drive_wheels' (first level dropped); label-encode
    'make', 'aspiration', 'engine_location' and 'fuel_type'; and replace
    'fuel_system' / 'engine_type' by 1/0 flags for containing 'pfi' / 'ohc'.
    Columns that are absent are skipped. The input frame is not modified.

    Args:
        df: Raw car DataFrame containing a 'price' column, or ``None``.

    Returns:
        The processed DataFrame, or ``None`` when ``df`` is ``None``.
    """
    if df is None:
        return None

    # Drop rows with missing price (target variable). The explicit copy
    # keeps every later column assignment away from the caller's frame.
    initial_rows = len(df)
    df = df.dropna(subset=['price']).copy()
    print(f"Dropped {initial_rows - len(df)} rows with missing price")

    # Anything that is not numeric is treated as categorical, so text columns
    # are covered whichever dtype pandas gave them.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns

    # Fill numeric columns with median
    for col in numeric_columns:
        if col != 'price':
            df[col] = df[col].fillna(df[col].median())

    # Fill categorical columns with mode (skipped when a column is all-missing)
    for col in categorical_columns:
        mode_val = df[col].mode()
        if len(mode_val) > 0:
            df[col] = df[col].fillna(mode_val[0])

    # Convert word numbers to figures; words outside the mapping are left
    # unchanged.
    word_to_number = {
        'num_doors': {'two': 2, 'four': 4},
        'num_cylinders': {'two': 2, 'three': 3, 'four': 4, 'five': 5,
                          'six': 6, 'eight': 8, 'twelve': 12},
    }
    for col, mapping in word_to_number.items():
        if col in df.columns:
            df[col] = df[col].map(mapping).fillna(df[col])

    # Dummy encoding for categorical variables in a single call. Integer
    # dummies keep the encoded table numeric regardless of the default dummy
    # dtype of the installed pandas.
    dummy_cols = [col for col in ('body_style', 'drive_wheels') if col in df.columns]
    if dummy_cols:
        df = pd.get_dummies(df, columns=dummy_cols, drop_first=True, dtype=int)

    # Label encoding for specific columns
    for col in ('make', 'aspiration', 'engine_location', 'fuel_type'):
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Binary encoding for fuel_system and engine_type
    for col, token in (('fuel_system', 'pfi'), ('engine_type', 'ohc')):
        if col in df.columns:
            df[col] = _contains_flag(df[col], token)

    return df

def question_3():
    """Question 3: car price regression with and without PCA.

    Loads and preprocesses the car data, standardises the features, and
    compares the hold-out R² of a linear regression on all features against
    one trained on the principal components that retain 95% of the variance.

    Returns:
        ``(r2_original, r2_pca, n_components)``, or ``(None, None, None)``
        when the data could not be loaded or preprocessed.
    """
    print("\n=== QUESTION 3: CAR PRICE PREDICTION WITH PREPROCESSING ===")

    raw_cars = load_car_data()
    if raw_cars is None:
        print("Failed to load car dataset")
        return None, None, None

    cars = preprocess_car_data(raw_cars)
    if cars is None:
        print("Failed to preprocess car dataset")
        return None, None, None

    print(f"Processed Dataset Shape: {cars.shape}")

    # Separate the target from the features, then standardise the features.
    target = cars['price']
    features = cars.drop('price', axis=1)
    features_scaled = StandardScaler().fit_transform(features)

    # 70/30 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(
        features_scaled, target, test_size=0.3, random_state=42
    )

    def holdout_r2(train_matrix, test_matrix):
        """Fit a linear regression on the training part and score the hold-out."""
        regressor = LinearRegression()
        regressor.fit(train_matrix, y_train)
        return r2_score(y_test, regressor.predict(test_matrix))

    # Baseline: every feature
    r2_original = holdout_r2(X_train, X_test)
    print(f"Original Model R² Score: {r2_original:.4f}")

    # PCA keeping 95% of the variance, fitted on the training part only
    pca = PCA(n_components=0.95)
    train_components = pca.fit_transform(X_train)
    test_components = pca.transform(X_test)

    print(f"PCA Components: {pca.n_components_} (from {features.shape[1]} original features)")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

    r2_pca = holdout_r2(train_components, test_components)
    print(f"PCA Model R² Score: {r2_pca:.4f}")

    # Report the change; differences within ±0.01 count as insignificant.
    delta = r2_pca - r2_original
    if delta > 0.01:
        print(f"PCA improved performance by {delta:.4f}!")
    elif delta < -0.01:
        print(f"PCA reduced performance by {abs(delta):.4f}.")
    else:
        print("PCA had no significant impact on performance.")

    return r2_original, r2_pca, pca.n_components_

def main():
    """Run all three questions in order and print a summary.

    Any exception raised by a question is caught here, reported together
    with its traceback, and ends the run.
    """
    print("Machine Learning Lab Assignment 3")
    print("=" * 50)

    try:
        # Only the values used in the summary are kept.
        results_q1, _ = question_1()
        _, best_lr = question_2()
        r2_orig, r2_pca, n_components = question_3()

        # Summary
        print("\n" + "=" * 50)
        print("=== FINAL SUMMARY ===")
        print("=" * 50)

        # Truthiness covers both ``None`` and an empty result list, so
        # ``max`` is never called on an empty sequence.
        q1_ok = bool(results_q1)
        q2_ok = best_lr is not None
        q3_ok = r2_orig is not None and r2_pca is not None

        if q1_ok:
            best_cv_r2 = max(r['r2_score'] for r in results_q1)
            print(f"Q1 - Best K-Fold CV R²: {best_cv_r2:.4f}")
        else:
            print("Q1 - K-Fold CV: Failed")

        if q2_ok:
            print(f"Q2 - Best Learning Rate: {best_lr}")
        else:
            print("Q2 - Gradient Descent: Failed")

        if q3_ok:
            print(f"Q3 - Original R²: {r2_orig:.4f}, PCA R²: {r2_pca:.4f}")
            print(f"Q3 - Dimensionality Reduction: {n_components} components")
        else:
            print("Q3 - Car Price Prediction: Failed")

        print("=" * 50)
        # Claim success only when no question reported a failure above.
        if q1_ok and q2_ok and q3_ok:
            print("Assignment completed successfully!")
        else:
            print("Assignment completed with failures (see messages above).")

    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

# Run the whole assignment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Machine Learning Lab Assignment 3
=== QUESTION 1: K-FOLD CROSS VALIDATION ===
House Dataset Shape: (5000, 6)
Columns: ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price']
Fold 1: R² = 0.9176
Fold 2: R² = 0.9203
Fold 3: R² = 0.9152
Fold 4: R² = 0.9209
Fold 5: R² = 0.9138

Best R² Score from CV: 0.9209
Final Test R² Score: 0.9147

=== QUESTION 2: GRADIENT DESCENT WITH VALIDATION SET ===
House Dataset Shape: (5000, 6)
Columns: ['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms', 'Avg. Area Number of Bedrooms', 'Area Population', 'Price']
Dataset split - Train: 2800, Val: 700, Test: 1500

Trying learning rate: 0.001
Learning Rate: 0.001, Validation R²: 0.6820, Test R²: 0.6490

Trying learning rate: 0.01
Learning Rate: 0.01, Validation R²: 0.9098, Test R²: 0.9148

Trying learning rate: 0.1
  Converged after 150 iterations
Learning Rate: 0.1, Validation R²: 0.9098, Test R²: 0.9148

Trying lear