## 3. XGBoost Regressor


In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [5]:
dataset_path = './Problem3.csv'
data_df = pd.read_csv(dataset_path)
data_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.0
1,7,4,oct,tue,4.517431,35.4,669.1,2.04122,18.0,33,0.9,False,0.0
2,7,4,oct,sat,4.517431,43.7,686.9,2.04122,14.6,33,1.3,False,0.0
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.0
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.0


In [6]:
# đổi toàn bộ các giá trị này về dạng số
categorical_cols = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()

for col_name in categorical_cols:
    n_categories = data_df[col_name].nunique()
    print(f'Number of categorties in {col_name}: {n_categories}')

orinal_encoder = OrdinalEncoder()
encoded_categorical_cols = orinal_encoder.fit_transform(data_df[categorical_cols])

encoded_categorical_df = pd.DataFrame(encoded_categorical_cols, columns = categorical_cols)

numerical_df = data_df.drop(categorical_cols, axis=1)

encoded_df = pd.concat([numerical_df, encoded_categorical_df], axis=1)
encoded_df.head()

Number of categorties in month: 12
Number of categorties in day: 7
Number of categorties in rain: 2


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,area,month,day,rain
0,7,5,4.468204,26.2,94.3,1.808289,8.2,51,6.7,0.0,7.0,0.0,0.0
1,7,4,4.517431,35.4,669.1,2.04122,18.0,33,0.9,0.0,10.0,5.0,0.0
2,7,4,4.517431,43.7,686.9,2.04122,14.6,33,1.3,0.0,10.0,2.0,0.0
3,8,6,4.529368,33.3,77.5,2.302585,8.3,97,4.0,0.0,7.0,0.0,1.0
4,8,6,4.503137,51.3,102.2,2.360854,11.4,99,1.8,0.0,7.0,3.0,0.0


In [8]:
# Tách dữ liệu X, y
X = encoded_df.drop(columns = 'area')
y = encoded_df['area']

In [9]:

# Chia tập dữ liệu train, val tỷ lệ 7:3
X_train, X_test, y_train, y_test = train_test_split(
                    X, y,
                    test_size=0.3,
                    random_state=7)


In [10]:
# Huấn luyện mô hình
xg_reg = xgb.XGBRegressor(seed=7,
                          learning_rate=0.01,
                          n_estimators=102,
                          max_depth=3)

xg_reg.fit(X_train, y_train)

In [11]:
# Dự đoán trên toàn bộ tập test
preds = xg_reg.predict(X_test)

In [12]:
# Đánh giá mô hình
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)

print('Evaluation results on test set: ')
print(f'Mean Absolute Error - MAE: {mae}')
print(f'Mean Squared Error - MSE: {mse}')

Evaluation results on test set: 
Mean Absolute Error - MAE: 1.1484401341167767
Mean Squared Error - MSE: 1.8845074196256495


## 4. XGBoost Classification

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


In [16]:
dataset_path2 = './Problem4.csv'
data_df2 = pd.read_csv(dataset_path2)
data_df2.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [20]:
# input các đặc trưng
X2, y2 = data_df2.iloc[:, :-1], data_df2.iloc[:, -1]

# chia thành hai tập dữ liệu con
X_train2, X_test2, y_train2, y_test2 = train_test_split(
                    X2, y2,
                    test_size=0.3,
                    random_state=7)

# xây dựng model XGBoost
xg_class2 = xgb.XGBClassifier(seed=7)
xg_class2.fit(X_train2, y_train2)

# dự đoán trên tập test
preds2 = xg_class2.predict(X_test2)

# đánh giá mô hình
train_acc = accuracy_score(y_train2, xg_class2.predict(X_train2))
test_acc = accuracy_score(y_test2, preds2)

print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 1.0
Test accuracy: 0.9629629629629629


##2.

In [23]:
import numpy as np

# Dữ liệu đầu vào
X = np.array([23, 24, 26, 27])
Y = np.array([False, False, True, True], dtype=int)  # Chuyển đổi False/True thành 0/1

# Các tham số
f0 = 0.5
lambda_reg = 0
lr = 0.3

# Bước 2: Tính toán Similarity Score của root
def compute_similarity_score(Y, f0, lambda_reg):
    sum_of_residuals = np.sum(Y - f0)
    previous_probability = f0
    similarity_score = (sum_of_residuals ** 2) / (len(Y) * previous_probability * (1 - previous_probability) + lambda_reg)
    return similarity_score

root_similarity_score = compute_similarity_score(Y, f0, lambda_reg)

# Bước 3: Tính toán Similarity Score cho các node
def compute_node_scores(X, Y, condition, f0, lambda_reg):
    left_indices = X < condition
    right_indices = X >= condition
    Y_left = Y[left_indices]
    Y_right = Y[right_indices]

    left_similarity_score = compute_similarity_score(Y_left, f0, lambda_reg)
    right_similarity_score = compute_similarity_score(Y_right, f0, lambda_reg)

    return left_similarity_score, right_similarity_score

# Các điều kiện
conditions = [23.5, 25, 26.5]
scores = {}

for cond in conditions:
    left_score, right_score = compute_node_scores(X, Y, cond, f0, lambda_reg)
    gain = left_score + right_score - root_similarity_score
    scores[cond] = (left_score, right_score, gain)

# Bước 4: Tính Gain và chọn điều kiện có Gain lớn nhất
best_condition = max(scores, key=lambda k: scores[k][2])
best_left_score, best_right_score, best_gain = scores[best_condition]

# Bước 5: Tính Output cho từng node trong nhánh trái và phải
def compute_output(Y, f0):
    sum_of_residuals = np.sum(Y - f0)
    previous_probability = f0
    output = sum_of_residuals * (1 - previous_probability) / len(Y)
    return output

Y_left = Y[X < best_condition]
Y_right = Y[X >= best_condition]
left_output = compute_output(Y_left, f0)
right_output = compute_output(Y_right, f0)

# Bước 6: Dự đoán kết quả khi x = 23
def predict_probability(x, best_condition, left_output, right_output, f0, lr):
    if x < best_condition:
        output = left_output
    else:
        output = right_output

    log_prediction = np.log(f0 / (1 - f0)) + lr * output
    probability = np.exp(log_prediction) / (1 + np.exp(log_prediction))

    return probability

x_to_predict = 23
predicted_probability = predict_probability(x_to_predict, best_condition, left_output, right_output, f0, lr)

# In kết quả
print(f"Root Similarity Score: {root_similarity_score}")
print(f"Best Condition: X < {best_condition}")
print(f"Left Similarity Score: {best_left_score}")
print(f"Right Similarity Score: {best_right_score}")
print(f"Best Gain: {best_gain}")
print(f"Left Output: {left_output}")
print(f"Right Output: {right_output}")
print(f"Predicted Probability for x = {x_to_predict}: {predicted_probability}")


Root Similarity Score: 0.0
Best Condition: X < 25
Left Similarity Score: 2.0
Right Similarity Score: 2.0
Best Gain: 4.0
Left Output: -0.25
Right Output: 0.25
Predicted Probability for x = 23: 0.4812587841214647


In [24]:
import numpy as np

# Dữ liệu đầu vào
X = np.array([23, 24, 26, 27])
Y = np.array([False, False, True, True], dtype=int)  # Chuyển đổi False/True thành 0/1

# Các tham số
f0 = 0.5
lambda_reg = 0

# Bước 1: Xác định các chỉ số của nhánh bên trái khi điều kiện phân tách là X < 26.5
condition = 26.5
left_indices = X < condition

# Bước 2: Lấy dữ liệu tương ứng với nhánh bên trái
Y_left = Y[left_indices]

# Bước 3: Tính toán Similarity Score cho nhánh bên trái
def compute_similarity_score(Y, f0, lambda_reg):
    sum_of_residuals = np.sum(Y - f0)
    previous_probability = f0
    similarity_score = (sum_of_residuals ** 2) / (len(Y) * previous_probability * (1 - previous_probability) + lambda_reg)
    return similarity_score

left_similarity_score = compute_similarity_score(Y_left, f0, lambda_reg)

# In kết quả
print(f"Left Similarity Score for X < {condition}: {left_similarity_score}")


Left Similarity Score for X < 26.5: 0.3333333333333333


In [25]:
import numpy as np

# Dữ liệu đầu vào
X = np.array([23, 24, 26, 27])
Y = np.array([False, False, True, True], dtype=int)  # Chuyển đổi False/True thành 0/1

# Các tham số
f0 = 0.5
lambda_reg = 0

# Bước 1: Tính toán Similarity Score của root
def compute_similarity_score(Y, f0, lambda_reg):
    sum_of_residuals = np.sum(Y - f0)
    previous_probability = f0
    similarity_score = (sum_of_residuals ** 2) / (len(Y) * previous_probability * (1 - previous_probability) + lambda_reg)
    return similarity_score

root_similarity_score = compute_similarity_score(Y, f0, lambda_reg)

# Bước 2: Tính toán Similarity Score cho các nhánh bên trái và bên phải theo điều kiện X < 25
condition = 25
left_indices = X < condition
right_indices = X >= condition

Y_left = Y[left_indices]
Y_right = Y[right_indices]

left_similarity_score = compute_similarity_score(Y_left, f0, lambda_reg)
right_similarity_score = compute_similarity_score(Y_right, f0, lambda_reg)

# Bước 3: Tính toán Gain
gain = left_similarity_score + right_similarity_score - root_similarity_score

# In kết quả
print(f"Root Similarity Score: {root_similarity_score}")
print(f"Left Similarity Score for X < {condition}: {left_similarity_score}")
print(f"Right Similarity Score for X >= {condition}: {right_similarity_score}")
print(f"Gain for X < {condition}: {gain}")


Root Similarity Score: 0.0
Left Similarity Score for X < 25: 2.0
Right Similarity Score for X >= 25: 2.0
Gain for X < 25: 4.0


## 1.

In [26]:
import numpy as np

# Dữ liệu đầu vào
X = np.array([23, 24, 26, 27])
Y = np.array([50, 70, 80 ,85])

# Các tham số
lambda_reg = 0
lr = 0.3

# Bước 1: Khởi tạo giá trị f0 dự đoán của model
f0 = np.mean(Y)
print(f"Initial f0: {f0}")

def compute_similarity_score(Y, f0, lambda_reg):
    sum_of_residuals = np.sum(Y - f0)
    number_of_residuals = len(Y)
    similarity_score = (sum_of_residuals ** 2) / (number_of_residuals + lambda_reg)
    return similarity_score

# Bước 2: Tính toán Similarity Score của root
root_similarity_score = compute_similarity_score(Y, f0, lambda_reg)
print(f"Root Similarity Score: {root_similarity_score}")

def compute_node_scores(X, Y, condition, f0, lambda_reg):
    left_indices = X < condition
    right_indices = X >= condition
    Y_left = Y[left_indices]
    Y_right = Y[right_indices]

    left_similarity_score = compute_similarity_score(Y_left, f0, lambda_reg)
    right_similarity_score = compute_similarity_score(Y_right, f0, lambda_reg)

    return left_similarity_score, right_similarity_score

# Các điều kiện
conditions = [23.5, 25, 26.5]
scores = {}

for cond in conditions:
    left_score, right_score = compute_node_scores(X, Y, cond, f0, lambda_reg)
    gain = left_score + right_score - root_similarity_score
    scores[cond] = (left_score, right_score, gain)

best_condition = max(scores, key=lambda k: scores[k][2])
best_left_score, best_right_score, best_gain = scores[best_condition]

print(f"Best Condition: X < {best_condition}")
print(f"Left Similarity Score: {best_left_score}")
print(f"Right Similarity Score: {best_right_score}")
print(f"Best Gain: {best_gain}")

def compute_output(Y, f0):
    sum_of_residuals = np.sum(Y - f0)
    number_of_residuals = len(Y)
    output = sum_of_residuals / number_of_residuals
    return output

Y_left = Y[X < best_condition]
Y_right = Y[X >= best_condition]
left_output = compute_output(Y_left, f0)
right_output = compute_output(Y_right, f0)

print(f"Left Output: {left_output}")
print(f"Right Output: {right_output}")

def predict_value(x, best_condition, left_output, right_output, f0, lr):
    if x < best_condition:
        output = left_output
    else:
        output = right_output

    prediction = f0 + lr * output
    return prediction

x_to_predict = 25
predicted_value = predict_value(x_to_predict, best_condition, left_output, right_output, f0, lr)

print(f"Predicted Value for x = {x_to_predict}: {predicted_value}")


Initial f0: 71.25
Root Similarity Score: 0.0
Best Condition: X < 23.5
Left Similarity Score: 451.5625
Right Similarity Score: 150.52083333333334
Best Gain: 602.0833333333334
Left Output: -21.25
Right Output: 7.083333333333333
Predicted Value for x = 25: 73.375
