In [180]:
import numpy as np
import pandas as pd
import csv

# Train test split

In [181]:
# Hàm tự định nghĩa để chia dữ liệu thành train/val/test
def custom_train_test_split(X, y, train_size=0.5, val_size=0.4, test_size=0.1, random_state=42):
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Tổng tỷ lệ train_size, val_size, test_size phải bằng 1"
    
    np.random.seed(random_state)
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    
    train_end = int(train_size * n_samples)
    val_end = train_end + int(val_size * n_samples)
    
    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]
    
    X_train = X[train_indices]
    X_val = X[val_indices]
    X_test = X[test_indices]
    
    y_train = y[train_indices]
    y_val = y[val_indices]
    y_test = y[test_indices]
    
    return X_train, X_val, X_test, y_train, y_val, y_test

# Linear Regression model

In [182]:
# Định nghĩa class Model
class Model():
    def __init__(self, data, regression_type='standard', lr=0.0001, epochs=100, batch_size=32, optimizer='batch', 
                 lambda_reg=1.0, use_log=False, scale_type='normalize'):
        # Chia dữ liệu thành train/val/test
        self.X_train, self.X_val, self.X_test, self.y_train, self.y_val, self.y_test = custom_train_test_split(
            data[:, :-1], data[:, -1], train_size=0.5, val_size=0.4, test_size=0.1, random_state=42)
        
        self.y_train_original = self.y_train.copy()
        self.y_val_original = self.y_val.copy()
        self.y_test_original = self.y_test.copy()
        
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.regression_type = regression_type
        self.lambda_reg = lambda_reg
        self.use_log = use_log
        self.scale_type = scale_type
        
        # Scale dữ liệu
        if self.scale_type == 'normalize':
            self.X_train, self.X_mean, self.X_std = normalize(self.X_train)
            self.X_val = (self.X_val - self.X_mean) / self.X_std
            self.X_test = (self.X_test - self.X_mean) / self.X_std
        elif self.scale_type == 'minmax':
            self.X_train, self.X_min, self.X_max = minmax_scale(self.X_train)
            self.X_val = (self.X_val - self.X_min) / (self.X_max - self.X_min)
            self.X_test = (self.X_test - self.X_min) / (self.X_max - self.X_min)
        
        # Scale y nếu cần
        if self.use_log:
            self.y_train = np.log(self.y_train + 1)
            self.y_val = np.log(self.y_val + 1)
            self.y_test = np.log(self.y_test + 1)
        elif self.scale_type == 'minmax':
            self.y_train, self.y_min, self.y_max = minmax_scale(self.y_train.reshape(-1, 1))
            self.y_val = (self.y_val - self.y_min) / (self.y_max - self.y_min)
            self.y_test = (self.y_test - self.y_min) / (self.y_max - self.y_min)
        
        # Thêm cột bias
        self.X_train = np.hstack((np.ones((self.X_train.shape[0], 1)), self.X_train))
        self.X_val = np.hstack((np.ones((self.X_val.shape[0], 1)), self.X_val))
        self.X_test = np.hstack((np.ones((self.X_test.shape[0], 1)), self.X_test))
        
        self._prepare_features()
        
        print(f"Max X_train: {np.max(self.X_train)}, Min X_train: {np.min(self.X_train)}")
        if np.any(np.isnan(self.X_train)) or np.any(np.isinf(self.X_train)):
            print("Warning: NaN or Inf detected in X_train after _prepare_features")
        
        self.weight = np.random.randn(self.X_train.shape[1], 1) * 0.01
        
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.epsilon = 1e-8
        self.m = np.zeros_like(self.weight)
        self.v = np.zeros_like(self.weight)
        self.t = 0
    
    def _prepare_features(self):
        if self.regression_type == 'standard':
            pass
        elif self.regression_type == 'polynomial':
            self.X_train = np.vstack([self.X_train[:, 0], self.X_train[:, 1]**2, self.X_train[:, 2], 
                                    self.X_train[:, 3]**2, self.X_train[:, 4],
                                    self.X_train[:, 5], self.X_train[:, 6]**2,
                                    self.X_train[:, 7], self.X_train[:, 8]**2,
                                    self.X_train[:, 9]]).T
            self.X_val = np.vstack([self.X_val[:, 0], self.X_val[:, 1]**2, self.X_val[:, 2], 
                                   self.X_val[:, 3]**2, self.X_val[:, 4],
                                   self.X_val[:, 5], self.X_val[:, 6]**2,
                                   self.X_val[:, 7], self.X_val[:, 8]**2,
                                   self.X_val[:, 9]]).T
            self.X_test = np.vstack([self.X_test[:, 0], self.X_test[:, 1]**2, self.X_test[:, 2], 
                                   self.X_test[:, 3]**2, self.X_test[:, 4],
                                   self.X_test[:, 5], self.X_test[:, 6]**2,
                                   self.X_test[:, 7], self.X_test[:, 8]**2,
                                   self.X_test[:, 9]]).T
        elif self.regression_type == 'mixed':
            self.X_train = np.vstack([self.X_train[:, 0], self.X_train[:, 1] + self.X_train[:, 2], 
                                    self.X_train[:, 3]**2, self.X_train[:, 4],
                                    self.X_train[:, 5], self.X_train[:, 6],
                                    self.X_train[:, 7]**2, self.X_train[:, 8],
                                    self.X_train[:, 9]]).T
            self.X_val = np.vstack([self.X_val[:, 0], self.X_val[:, 1] + self.X_val[:, 2], 
                                   self.X_val[:, 3]**2, self.X_val[:, 4],
                                   self.X_val[:, 5], self.X_val[:, 6],
                                   self.X_val[:, 7]**2, self.X_val[:, 8],
                                   self.X_val[:, 9]]).T
            self.X_test = np.vstack([self.X_test[:, 0], self.X_test[:, 1] + self.X_test[:, 2], 
                                   self.X_test[:, 3]**2, self.X_test[:, 4],
                                   self.X_test[:, 5], self.X_test[:, 6],
                                   self.X_test[:, 7]**2, self.X_test[:, 8],
                                   self.X_test[:, 9]]).T
        elif self.regression_type == 'interaction':
            self.X_train = np.vstack([self.X_train[:, 0], self.X_train[:, 1] * self.X_train[:, 2], 
                                    self.X_train[:, 3]**2, self.X_train[:, 4],
                                    self.X_train[:, 5] * self.X_train[:, 6],
                                    self.X_train[:, 7], self.X_train[:, 8]**2,
                                    self.X_train[:, 9]]).T
            self.X_val = np.vstack([self.X_val[:, 0], self.X_val[:, 1] * self.X_val[:, 2], 
                                   self.X_val[:, 3]**2, self.X_val[:, 4],
                                   self.X_val[:, 5] * self.X_val[:, 6],
                                   self.X_val[:, 7], self.X_val[:, 8]**2,
                                   self.X_val[:, 9]]).T
            self.X_test = np.vstack([self.X_test[:, 0], self.X_test[:, 1] * self.X_test[:, 2], 
                                   self.X_test[:, 3]**2, self.X_test[:, 4],
                                   self.X_test[:, 5] * self.X_test[:, 6],
                                   self.X_test[:, 7], self.X_test[:, 8]**2,
                                   self.X_test[:, 9]]).T
    
    def predict(self, X):
        y_pred = np.dot(X, self.weight)
        if self.use_log:
            y_pred = np.clip(y_pred, -100, 100)
            y_pred = np.exp(y_pred) - 1
        elif self.scale_type == 'minmax':
            y_pred = y_pred * (self.y_max - self.y_min) + self.y_min
        return y_pred
    
    def gradient(self, X, y, y_hat):
        grad = 2 * np.dot(X.T, (y_hat - y.reshape(-1, 1))) / len(y)
        if np.any(np.isnan(grad)) or np.any(np.isinf(grad)):
            print("Warning: NaN or Inf detected in gradient")
        return grad
    
    def ridge_gradient(self, X, y, y_hat):
        grad = self.gradient(X, y, y_hat) + 2 * self.lambda_reg * self.weight
        return grad
    
    def batch_gradient_descent(self):
        for epoch in range(self.epochs):
            y_hat = self.predict(self.X_train)
            grad = self.gradient(self.X_train, self.y_train, y_hat)
            self.weight -= self.lr * grad
            if epoch % 10 == 0:
                loss = mse(self.y_train.reshape(-1, 1), y_hat)
                print(f'Epoch {epoch}, Loss: {loss:.4f}')
    
    def stochastic_gradient_descent(self):
        n_samples = len(self.X_train)
        indices = np.arange(n_samples)
        for epoch in range(self.epochs):
            np.random.shuffle(indices)
            total_loss = 0
            for idx in indices:
                X_i = self.X_train[idx:idx+1]
                y_i = self.y_train[idx:idx+1]
                y_hat = self.predict(X_i)
                grad = self.gradient(X_i, y_i, y_hat)
                self.weight -= self.lr * grad
                total_loss += mse(y_i.reshape(-1, 1), y_hat)
            if epoch % 10 == 0:
                print(f'Epoch {epoch}, Average Loss: {total_loss/n_samples:.4f}')
    
    def adam_optimizer(self):
        for epoch in range(self.epochs):
            self.t += 1
            y_hat = self.predict(self.X_train)
            grad = self.gradient(self.X_train, self.y_train, y_hat)
            self.m = self.beta1 * self.m + (1 - self.beta1) * grad
            self.v = self.beta2 * self.v + (1 - self.beta2) * (grad ** 2)
            m_hat = self.m / (1 - self.beta1 ** self.t)
            v_hat = self.v / (1 - self.beta2 ** self.t)
            self.weight -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)
            if epoch % 10 == 0:
                loss = mse(self.y_train.reshape(-1, 1), y_hat)
                print(f'Epoch {epoch}, Loss: {loss:.4f}')
                if np.isnan(loss) or np.isinf(loss):
                    print("Training stopped due to NaN/Inf loss")
                    break
    
    def pseudo_inverse(self):
        X = self.X_train
        y = self.y_train.reshape(-1, 1)
        self.weight = np.linalg.pinv(X.T @ X) @ X.T @ y
        loss = mse(self.y_train.reshape(-1, 1), self.predict(X))
        print(f'Pseudo-inverse Loss: {loss:.4f}')
    
    def ridge_analytical(self):
        X = self.X_train
        y = self.y_train.reshape(-1, 1)
        n_features = X.shape[1]
        I = np.eye(n_features)
        self.weight = np.linalg.inv(X.T @ X + self.lambda_reg * I) @ X.T @ y
        loss = mse(self.y_train.reshape(-1, 1), self.predict(X)) + self.lambda_reg * np.sum(self.weight ** 2)
        print(f'Ridge Analytical Loss: {loss:.4f}')
    
    def ridge_gradient_descent(self):
        for epoch in range(self.epochs):
            y_hat = self.predict(self.X_train)
            grad = self.ridge_gradient(self.X_train, self.y_train, y_hat)
            self.weight -= self.lr * grad
            if epoch % 10 == 0:
                loss = mse(self.y_train.reshape(-1, 1), y_hat) + self.lambda_reg * np.sum(self.weight ** 2)
                print(f'Epoch {epoch}, Ridge Loss: {loss:.4f}')
                if np.isnan(loss) or np.isinf(loss):
                    print("Training stopped due to NaN/Inf loss")
                    break
    
    def least_squares(self):
        X = self.X_train
        y = self.y_train.reshape(-1, 1)
        self.weight, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
        loss = mse(self.y_train.reshape(-1, 1), self.predict(X)) if residuals.size == 0 else residuals[0]
        print(f'Least Squares Loss: {loss:.4f}')
    
    def fit(self):
        if self.optimizer == 'batch':
            self.batch_gradient_descent()
        elif self.optimizer == 'sgd':
            self.stochastic_gradient_descent()
        elif self.optimizer == 'adam':
            self.adam_optimizer()
        elif self.optimizer == 'pseudo':
            self.pseudo_inverse()
        elif self.optimizer == 'ridge_analytical':
            self.ridge_analytical()
        elif self.optimizer == 'ridge_gd':
            self.ridge_gradient_descent()
        elif self.optimizer == 'least_squares':
            self.least_squares()

# Loss function

In [183]:
# Hàm tính MSE
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

# Hàm tính MAE
def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

# Data preprocessing

In [184]:
# Hàm đọc dữ liệu từ file CSV
def load_data_from_csv(file_path):
    required_columns = ['Year', 'Engine', 'Length', 'Width', 'Fuel Tank Capacity', 
                        'Max Power BHP', 'Max Power RPM', 'Max Torque Nm', 'Max Torque RPM', 'Price']
    data = []
    
    try:
        with open(file_path, 'r') as file:
            reader = csv.DictReader(file)
            if not all(col in reader.fieldnames for col in required_columns):
                raise ValueError("File CSV thiếu một hoặc nhiều cột cần thiết!")
            
            for row in reader:
                row_data = []
                for col in required_columns:
                    value = row[col].strip()
                    if value:
                        row_data.append(float(value))
                    else:
                        row_data.append(0.0)
                data.append(row_data)
    except FileNotFoundError:
        print(f"Không tìm thấy file: {file_path}")
        raise
    except Exception as e:
        print(f"Lỗi khi đọc file CSV: {e}")
        raise
    
    data = np.array(data)
    X = data[:, :-1]
    y = data[:, -1]
    return X, y

# Chuẩn hóa dữ liệu (StandardScaler: mean=0, std=1)
def normalize(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    std[std == 0] = 1.0
    X_normalized = (X - mean) / std
    return X_normalized, mean, std

# Scale dữ liệu (MinMaxScaler: [0, 1])
def minmax_scale(X, min_range=0, max_range=1):
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    X_range = X_max - X_min
    mask = X_range == 0
    X_range[mask] = 1.0
    X_scaled = (X - X_min) / X_range * (max_range - min_range) + min_range
    X_scaled[:, mask] = X[:, mask]
    return X_scaled, X_min, X_max# Hàm đọc dữ liệu từ file CSV
def load_data_from_csv(file_path):
    required_columns = ['Year', 'Engine', 'Length', 'Width', 'Fuel Tank Capacity', 
                        'Max Power BHP', 'Max Power RPM', 'Max Torque Nm', 'Max Torque RPM', 'Price']
    data = []
    
    try:
        with open(file_path, 'r') as file:
            reader = csv.DictReader(file)
            if not all(col in reader.fieldnames for col in required_columns):
                raise ValueError("File CSV thiếu một hoặc nhiều cột cần thiết!")
            
            for row in reader:
                row_data = []
                for col in required_columns:
                    value = row[col].strip()
                    if value:
                        row_data.append(float(value))
                    else:
                        row_data.append(0.0)
                data.append(row_data)
    except FileNotFoundError:
        print(f"Không tìm thấy file: {file_path}")
        raise
    except Exception as e:
        print(f"Lỗi khi đọc file CSV: {e}")
        raise
    
    data = np.array(data)
    X = data[:, :-1]
    y = data[:, -1]
    return X, y

# Chuẩn hóa dữ liệu (StandardScaler: mean=0, std=1)
def normalize(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    std[std == 0] = 1.0
    X_normalized = (X - mean) / std
    return X_normalized, mean, std

# Scale dữ liệu (MinMaxScaler: [0, 1])
def minmax_scale(X, min_range=0, max_range=1):
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    X_range = X_max - X_min
    mask = X_range == 0
    X_range[mask] = 1.0
    X_scaled = (X - X_min) / X_range * (max_range - min_range) + min_range
    X_scaled[:, mask] = X[:, mask]
    return X_scaled, X_min, X_max

# Test

In [185]:
# Đọc dữ liệu từ file CSV
file_path = './data/train_mean.csv'
X, y = load_data_from_csv(file_path)
data = np.hstack((X, y.reshape(-1, 1)))

In [186]:
# Train và predict trên tập test
models = [
    ("Standard Linear Regression with Least Squares", Model(data, regression_type='standard', optimizer='least_squares', use_log=False, scale_type='minmax')),
    ("Polynomial Regression with Ridge Gradient Descent", Model(data, regression_type='polynomial', optimizer='ridge_gd', epochs=50, lambda_reg=1.0, use_log=True, scale_type='normalize')),
    ("Mixed Terms Regression with Adam", Model(data, regression_type='mixed', optimizer='adam', epochs=50, use_log=True, scale_type='normalize')),
    ("Interaction Terms Regression with Least Squares", Model(data, regression_type='interaction', optimizer='least_squares', use_log=False, scale_type='minmax'))
]

for name, model in models:
    print(f"\nTraining: {name}")
    model.fit()
    
    print(f"Number of weights: {model.weight.shape[0]}")
    print(f"Weights: {model.weight.flatten()}")
    
    # Dự đoán trên tập validation và test
    y_val_pred = model.predict(model.X_val)
    y_test_pred = model.predict(model.X_test)
    
    # Tính MSE và MAE trên tập validation
    val_mse = mse(model.y_val_original.reshape(-1, 1), y_val_pred)
    val_mae = mae(model.y_val_original.reshape(-1, 1), y_val_pred)
    
    # Tính MSE và MAE trên tập test
    test_mse = mse(model.y_test_original.reshape(-1, 1), y_test_pred)
    test_mae = mae(model.y_test_original.reshape(-1, 1), y_test_pred)
    
    # In kết quả trên tập validation
    print(f"\nPredictions vs True Values (Validation Set):")
    print("Index | Predicted Price | True Price")
    for i in range(min(len(model.y_val), 10)):  # In 10 dòng đầu
        print(f"{i:5d} | {y_val_pred[i, 0]:15.2f} | {model.y_val_original[i]:15.2f}")
    
    print(f"\nValidation MSE: {val_mse:.4f}")
    print(f"Validation MAE: {val_mae:.4f}")
    
    # In kết quả trên tập test
    print(f"\nPredictions vs True Values (Test Set):")
    print("Index | Predicted Price | True Price")
    for i in range(min(len(model.y_test), 10)):  # In 10 dòng đầu
        print(f"{i:5d} | {y_test_pred[i, 0]:15.2f} | {model.y_test_original[i]:15.2f}")
    
    print(f"\nTest MSE: {test_mse:.4f}")
    print(f"Test MAE: {test_mae:.4f}")

Max X_train: 1.0, Min X_train: 0.0
Max X_train: 98.16009310257178, Min X_train: -2.534156404924392
Max X_train: 10.449421621520598, Min X_train: -6.1984996543181525
Max X_train: 1.0, Min X_train: 0.0

Training: Standard Linear Regression with Least Squares
Least Squares Loss: 2.5312
Number of weights: 10
Weights: [-0.24574932  0.2238761  -0.13380438 -0.04374224 -0.0411989   0.09068511
  0.80487626 -0.05653534  0.03497721  0.04400134]

Predictions vs True Values (Validation Set):
Index | Predicted Price | True Price
    0 |       969886.88 |       550000.00
    1 |       649743.61 |       240000.00
    2 |       750002.65 |       295000.00
    3 |      -512444.94 |       380000.00
    4 |      2123165.84 |      1299000.00
    5 |       516907.84 |       550000.00
    6 |       406355.86 |       591000.00
    7 |       771846.25 |       145000.00
    8 |      1957188.97 |      1375000.00
    9 |      3674602.63 |      6000000.00

Validation MSE: 2305515634249.5459
Validation MAE: 795281.