In [2]:
# 23122003 Nguyễn Văn Linh
# 23122022 Trần Hoàng Gia Bảo
# 23122026 Trần Chấn Hiệp
# 23122040 Nguyễn Thị Mỹ Kim

In [8]:
import pandas as pd
import numpy as np
import json

Đây là code block sử dụng các model parameters đã được lưu trong file "model_params.json" để đánh giá trên tập test.

Thầy chỉ cần thay đổi đường dẫn tới file test.csv bên dưới thôi ạ.

In [4]:
def standard_data(data: np.ndarray):
    """Z-score normalise ``data`` along axis 0.

    The mean taken over axis 0 is subtracted and the result is divided by the
    standard deviation taken over axis 0 (NumPy's default, i.e. ddof=0).

    Args:
        data: Numeric array-like; statistics are computed over axis 0.

    Returns:
        The centred and scaled values.
    """
    epsilon = 1e-8  # keeps the divisor non-zero when a column is constant
    centre = np.mean(data, axis=0)
    spread = np.std(data, axis=0) + epsilon
    centred = data - centre
    return centred / spread

def mean_squared_error(y, y_hat):
    """Mean of the squared differences between predictions and targets.

    Args:
        y: Ground-truth values.
        y_hat: Predicted values; ``len(y_hat)`` is used as the divisor.

    Returns:
        Sum of squared residuals divided by ``len(y_hat)``.
    """
    residuals = y_hat - y
    squared_total = np.sum(np.square(residuals))
    return squared_total / len(y_hat)

def mean_absolute_error(y, y_hat):
    """Mean of the absolute differences between predictions and targets.

    Args:
        y: Ground-truth values.
        y_hat: Predicted values; ``len(y_hat)`` is used as the divisor.

    Returns:
        Sum of absolute residuals divided by ``len(y_hat)``.
    """
    residuals = y_hat - y
    absolute_total = np.sum(abs(residuals))
    return absolute_total / len(y_hat)

def r2_score(y, y_hat):
    """Coefficient of determination of ``y_hat`` with respect to ``y``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_tot`` is the sum of squared
    deviations of ``y`` from its mean and ``SS_res`` is the sum of squared
    residuals ``y - y_hat``.

    Args:
        y: Ground-truth values.
        y_hat: Predicted values.

    Returns:
        The R² score.
    """
    deviations = y - np.mean(y)
    total_variation = np.sum(deviations ** 2)
    unexplained = np.sum((y - y_hat) ** 2)
    return 1 - (unexplained / total_variation)

In [5]:
def DATA_BY_MODEL1(df: pd.DataFrame):
    """Build the MODEL1 feature matrix and target.

    The target is ``log1p(Price)``; the features are every other column of
    ``df`` as-is. Both are passed through ``standard_data``. The caller's
    frame is left untouched.

    Args:
        df: Frame containing a ``Price`` column plus the feature columns.

    Returns:
        Tuple ``(X, y)``: standardised features and standardised target.
    """
    log_price = np.log1p(df['Price'])
    predictors = df.drop("Price", axis=1)

    X = standard_data(predictors)
    y = standard_data(log_price)
    return X, y

def DATA_BY_MODEL2(df: pd.DataFrame):
    """Build the MODEL2 feature matrix and target.

    The target is ``sqrt(Price)``. The features are every non-``Price`` column
    plus three derived ones appended at the end: ``Engine^2``, ``Age^2`` and
    ``Engine_Length`` (``Engine * Length``). Both outputs are passed through
    ``standard_data``. The caller's frame is left untouched.

    Args:
        df: Frame containing ``Price``, ``Engine``, ``Age`` and ``Length``.

    Returns:
        Tuple ``(X, y)``: standardised features and standardised target.
    """
    root_price = np.sqrt(df['Price'])

    # Non-identifier column names, hence the dict unpacked into ``assign``.
    engineered = {
        "Engine^2": df["Engine"] ** 2,
        "Age^2": df["Age"] ** 2,
        "Engine_Length": df["Engine"] * df["Length"],
    }
    predictors = df.drop("Price", axis=1).assign(**engineered)

    X = standard_data(predictors)
    y = standard_data(root_price)
    return X, y

def DATA_BY_MODEL3(df: pd.DataFrame):
    """Build the MODEL3 feature matrix and target.

    The target is the untransformed ``Price``. The features are every
    non-``Price`` column plus the interaction term
    ``max_power_bhp*Make_encoded`` appended at the end. Both outputs are
    passed through ``standard_data``. The caller's frame is left untouched.

    Args:
        df: Frame containing ``Price``, ``max_power_bhp`` and ``Make_encoded``.

    Returns:
        Tuple ``(X, y)``: standardised features and standardised target.
    """
    predictors = df.drop("Price", axis=1)
    interaction = predictors['max_power_bhp'] * predictors['Make_encoded']
    predictors = predictors.assign(**{'max_power_bhp*Make_encoded': interaction})

    X = standard_data(predictors)
    y = standard_data(df['Price'])
    return X, y

def DATA_BY_MODEL4(df: pd.DataFrame):
    """Build the MODEL4 feature matrix and target.

    The target is ``Price ** (1 / 3)``. The features are every non-``Price``
    column plus three additive combinations appended at the end:

    * ``LW``       = ``Length + Width``
    * ``mp_ft_fc`` = ``max_power_bhp + Fuel_type_encoded + Fuel Tank Capacity``
    * ``mp_mt``    = ``max_power_rpm + max_torque_rpm``

    Both outputs are passed through ``standard_data``. The caller's frame is
    left untouched.

    Args:
        df: Frame containing ``Price`` and the columns listed above.

    Returns:
        Tuple ``(X, y)``: standardised features and standardised target.
    """
    cube_root_price = df['Price'] ** (1 / 3)

    combined = {
        "LW": df["Length"] + df["Width"],
        "mp_ft_fc": df["max_power_bhp"] + df["Fuel_type_encoded"] + df["Fuel Tank Capacity"],
        "mp_mt": df["max_power_rpm"] + df["max_torque_rpm"],
    }
    predictors = df.drop("Price", axis=1).assign(**combined)

    X = standard_data(predictors)
    y = standard_data(cube_root_price)
    return X, y



In [6]:
def load_data(model_params, name_csv):
    """Read a raw car-listing CSV and turn it into a model-ready frame.

    Steps, in order:
      1. Parse ``Engine`` ("<n> cc"), ``Max Power`` ("<n> bhp @ <n> rpm") and
         ``Max Torque`` ("<n> Nm @ <n> rpm") into float columns, then drop the
         two raw power/torque text columns.
      2. Impute missing values column by column (median, mean or mode of the
         column as found in this file).
      3. Encode ``Owner``, ``Make``, ``Location``, ``Color`` and ``Fuel Type``
         through the lookup tables stored in ``model_params``.
      4. One-hot encode ``Transmission``, ``Drivetrain`` and ``Seller Type``.
      5. Drop every remaining object-dtype column and replace ``Year`` by
         ``Age``.

    Args:
        model_params: Mapping providing ``owner_mapping``, ``make_mapping``,
            ``location_mapping``, ``color_mapping`` and ``fuel_mapping``.
        name_csv: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        pandas.DataFrame with the parsed, imputed and encoded columns.
    """
    df = pd.read_csv(name_csv)

    # Rows whose text does not match the pattern become NaN and are imputed below.
    df["Engine"] = df["Engine"].str.extract(r'(\d+) cc', expand=False).astype(float)
    df[['max_power_bhp', 'max_power_rpm']] = (
        df['Max Power'].str.extract(r'(\d+) bhp @ (\d+) rpm').astype(float)
    )
    df[['max_torque_nm', 'max_torque_rpm']] = (
        df["Max Torque"].str.extract(r'(\d+) Nm @ (\d+) rpm').astype(float)
    )
    df = df.drop(columns=["Max Power", "Max Torque"])

    def most_frequent(series):
        # First mode of the column; raises IndexError if the column is all-NaN.
        return series.mode()[0]

    # Fill NaN values, if any. Each column is filled with a statistic of that
    # same column.
    imputers = {
        'Engine': pd.Series.median,
        'Drivetrain': most_frequent,
        'Seating Capacity': most_frequent,
        'Fuel Tank Capacity': pd.Series.mean,
        'max_power_bhp': pd.Series.mean,
        'max_power_rpm': pd.Series.mean,
        'max_torque_nm': pd.Series.median,
        'max_torque_rpm': pd.Series.mean,
        'Width': pd.Series.mean,
        'Height': pd.Series.mean,
        'Length': pd.Series.mean,
    }
    for column, statistic in imputers.items():
        # Assign the result back instead of ``df[column].fillna(..., inplace=True)``:
        # the chained in-place form acts on a temporary under pandas
        # Copy-on-Write and would leave the NaNs in ``df``.
        df[column] = df[column].fillna(statistic(df[column]))

    # Categories missing from a lookup table map to NaN (``Series.map`` semantics).
    df['Owner'] = df['Owner'].map(model_params['owner_mapping'])
    df['Make_encoded'] = df['Make'].map(model_params['make_mapping'])
    df['Location_encoded'] = df['Location'].map(model_params['location_mapping'])
    # df['Model_encoded'] = df['Model'].map(model_params['model_mapping'])
    df['Color_encoded'] = df['Color'].map(model_params['color_mapping'])
    df['Fuel_type_encoded'] = df['Fuel Type'].map(model_params['fuel_mapping'])

    df = pd.get_dummies(df, columns=['Transmission', 'Drivetrain', 'Seller Type'], dtype=int)

    # Whatever is still object-typed (raw text columns) is discarded.
    text_columns = df.select_dtypes(["object"]).columns
    df = df.drop(columns=text_columns)

    # Age is measured against the newest ``Year`` present in THIS file, so the
    # newest car always gets Age == 1.
    df['Age'] = df["Year"].max() - df["Year"] + 1
    df = df.drop(columns=['Year'])
    return df




def predict_by_csv(name_csv, model_params_json="model_params.json"):
    """Evaluate every stored linear model on a CSV file and print its metrics.

    The parameter file is expected to hold the category lookup tables used by
    ``load_data`` plus a ``"MODEL"`` mapping of model name to a dict with
    ``"W"`` (weights), ``"b"`` (bias) and ``"features"`` (ordered column
    names). For each known model the matching ``DATA_BY_MODEL*`` preprocessing
    is applied, predictions are computed as ``X @ W + b`` and MSE, MAE and R²
    are printed. The metrics are computed on the transformed and standardised
    target returned by that preprocessing, not on raw prices.

    Args:
        name_csv: Path of the CSV file to evaluate on.
        model_params_json: Path of the JSON parameter file.

    Returns:
        None. Results are written to stdout.
    """
    # Open the path that was passed in; previously "model_params.json" was
    # hard-coded here and the argument was silently ignored.
    with open(model_params_json, "r", encoding="utf-8") as f:
        model_params = json.load(f)

    df = load_data(model_params=model_params, name_csv=name_csv)

    preprocessors = {
        'MODEL1': DATA_BY_MODEL1,
        'MODEL2': DATA_BY_MODEL2,
        'MODEL3': DATA_BY_MODEL3,
        'MODEL4': DATA_BY_MODEL4,
    }

    for model_name, model_param in model_params["MODEL"].items():
        prepare = preprocessors.get(model_name)
        if prepare is None:
            # Entries without a matching preprocessing routine are skipped.
            continue

        W = np.array(model_param['W'], dtype=np.float64)
        b = np.array(model_param['b'], dtype=np.float64)
        features = model_param['features']

        # Each model starts from its own copy of the loaded frame.
        X, y = prepare(df.copy())

        # Select and order the columns exactly as the stored weights expect.
        X = X[features]

        y_pred = np.dot(X, W) + b
        mse = mean_squared_error(y, y_pred)
        mae = mean_absolute_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        print(f'{model_name}: Mean Squared Error: {mse}')
        print(f'{model_name}: Mean Absolute Error: {mae}')
        print(f'{model_name}: R² Score: {r2}\n')

LỰA CHỌN MODEL DỰA TRÊN KẾT QUẢ SAU KHI CHẠY FILE 'train.ipynb': MODEL1 > MODEL4 > MODEL2 > MODEL3

In [None]:
# Change this path to point at the train/test CSV file.
csv_file = "train.csv"
# The provided model-parameter file.
model_params = "model_params.json"
predict_by_csv(name_csv=csv_file, model_params_json=model_params)

MODEL1: Mean Squared Error: 0.07375951391638992
MODEL1: Mean Absolute Error: 0.19354026054436477
MODEL1: R² Score: 0.9262404845830609

MODEL2: Mean Squared Error: 0.12922389275432586
MODEL2: Mean Absolute Error: 0.2369105458679009
MODEL2: R² Score: 0.8707761072416537

MODEL3: Mean Squared Error: 0.27910239482725985
MODEL3: Mean Absolute Error: 0.27911423557522325
MODEL3: R² Score: 0.7208976051727378

MODEL4: Mean Squared Error: 0.10480909008891066
MODEL4: Mean Absolute Error: 0.22317782082049467
MODEL4: R² Score: 0.8951909098556261

