**Câu 1: XGBoost for Regression** 

In [1]:
import numpy as np

# Toy training set: one feature X and a numeric target Y.
X = np.array([23, 24, 26, 27])
Y = np.array([50, 70, 80, 85])

# Hyperparameters fixed by the exercise.
lmbda = 0      # L2 regularisation term (lambda)
lr = 0.3       # learning rate applied to each leaf output
max_depth = 1  # depth limit of the tree (a single split)

# Step 1: the initial prediction f0 is the mean of the targets.
f0 = Y.mean()
print(f"Initial prediction (f0): {f0}")

# Step 2: residuals against f0, then the root's similarity score:
# (sum of residuals)^2 / (number of residuals + lambda).
residuals = Y - f0
sum_residuals = residuals.sum()
number_of_residuals = residuals.shape[0]
root_similarity_score = sum_residuals ** 2 / (number_of_residuals + lmbda)
print(f"Similarity score of the root: {root_similarity_score}")

# Step 3: candidate thresholds, the midpoints between consecutive X values.
splits = [23.5, 25, 26.5]

def compute_similarity(residuals, lmbda):
    """Return the similarity score of a regression-tree node.

    The score is ``(sum of residuals) ** 2 / (number of residuals + lmbda)``.

    Args:
        residuals: Sequence or array of the residuals that fall in the node.
        lmbda: Regularisation term added to the denominator.

    Returns:
        The similarity score. An empty node scores ``0.0``; without this
        guard an empty node with ``lmbda == 0`` would evaluate ``0 / 0``
        and return ``nan``.
    """
    number_of_residuals = len(residuals)
    if number_of_residuals == 0:
        # No residuals means a zero numerator; avoid the 0 / 0 case entirely.
        return 0.0
    sum_residuals = np.sum(residuals)
    return (sum_residuals ** 2) / (number_of_residuals + lmbda)

# Search the candidate thresholds for the single split of the depth-1 tree.
best_split, best_gain = None, -np.inf
best_left_indices = best_right_indices = None

for split in splits:
    # Positions of the samples sent to each child by the rule "X < split".
    left_indices, right_indices = np.nonzero(X < split), np.nonzero(X >= split)
    left_residuals, right_residuals = residuals[left_indices], residuals[right_indices]

    left_similarity = compute_similarity(left_residuals, lmbda)
    right_similarity = compute_similarity(right_residuals, lmbda)

    # Gain = similarity of both children minus the similarity of the root.
    gain = left_similarity + right_similarity - root_similarity_score
    print(f"Split at {split}: Gain = {gain}")

    # Strict ">" means the first threshold wins when gains tie.
    if gain > best_gain:
        best_split, best_gain = split, gain
        best_left_indices, best_right_indices = left_indices, right_indices

print(f"Best split at X < {best_split} with gain: {best_gain}")

# Step 4: Output value of each child of the best split (both are leaves since
# max_depth = 1): sum of residuals / (number of residuals + lambda).
left_residuals = residuals[best_left_indices]
right_residuals = residuals[best_right_indices]

# lmbda goes in the denominator here just as it does in the similarity score;
# leaving it out would silently ignore the hyperparameter once lmbda != 0.
left_output = np.sum(left_residuals) / (len(left_residuals) + lmbda)
right_output = np.sum(right_residuals) / (len(right_residuals) + lmbda)

# Step 5: New prediction = initial prediction + learning rate * leaf output.
left_prediction = f0 + lr * left_output
right_prediction = f0 + lr * right_output

print(f"Prediction for left node (X < {best_split}): {left_prediction}")
print(f"Prediction for right node (X >= {best_split}): {right_prediction}")


Initial prediction (f0): 71.25
Similarity score of the root: 0.0
Split at 23.5: Gain = 602.0833333333334
Split at 25: Gain = 506.25
Split at 26.5: Gain = 252.08333333333334
Best split at X < 23.5 with gain: 602.0833333333334
Prediction for left node (X < 23.5): 64.875
Prediction for right node (X >= 23.5): 73.375


**Câu 2: XGBoost for Classification** 

In [2]:
import numpy as np

# Toy training set: one feature X and a boolean label Y, stored as 0/1 integers.
X = np.array([23, 24, 26, 27])
Y = np.array([False, False, True, True]).astype(int)

# Hyperparameters fixed by the exercise.
lmbda = 0      # L2 regularisation term (lambda)
lr = 0.3       # learning rate applied to each leaf output
max_depth = 1  # depth limit of the tree (a single split)

# Step 1: every sample starts with a predicted probability of 0.5.
f0 = 0.5
print(f"Initial prediction (f0): {f0}")

# Step 2: residuals against the previous probability, then the root's
# similarity score: (sum of residuals)^2 / (sum of p * (1 - p) + lambda).
previous_probability = np.full(Y.shape, f0)
residuals = Y - previous_probability
sum_residuals = residuals.sum()

similarity_score = sum_residuals ** 2 / (
    np.sum(previous_probability * (1 - previous_probability)) + lmbda
)
print(f"Similarity score of the root: {similarity_score}")

# Step 3: candidate thresholds, the midpoints between consecutive X values.
splits = [23.5, 25, 26.5]

def compute_similarity(residuals, previous_probability, lmbda):
    """Return the similarity score of a classification-tree node.

    The score is ``(sum of residuals) ** 2 / (sum of p * (1 - p) + lmbda)``,
    where ``p`` are the previously predicted probabilities of the node's samples.

    Note: this three-argument definition replaces the two-argument
    ``compute_similarity`` defined earlier in the notebook.

    Args:
        residuals: Sequence or array of the residuals that fall in the node.
        previous_probability: Previously predicted probabilities of the same
            samples (array or list).
        lmbda: Regularisation term added to the denominator.

    Returns:
        The similarity score. An empty node scores ``0.0``; without this
        guard an empty node with ``lmbda == 0`` would evaluate ``0 / 0``
        and return ``nan``.
    """
    if len(residuals) == 0:
        # No residuals means a zero numerator; avoid the 0 / 0 case entirely.
        return 0.0
    # asarray lets plain lists be used in the element-wise p * (1 - p) below.
    probabilities = np.asarray(previous_probability)
    sum_residuals = np.sum(residuals)
    return (sum_residuals ** 2) / (np.sum(probabilities * (1 - probabilities)) + lmbda)

# Search the candidate thresholds for the single split of the depth-1 tree.
best_split, best_gain = None, -np.inf
best_left_indices = best_right_indices = None

for split in splits:
    # Positions of the samples sent to each child by the rule "X < split".
    left_indices, right_indices = np.nonzero(X < split), np.nonzero(X >= split)

    left_residuals, right_residuals = residuals[left_indices], residuals[right_indices]
    left_probabilities = previous_probability[left_indices]
    right_probabilities = previous_probability[right_indices]

    left_similarity = compute_similarity(left_residuals, left_probabilities, lmbda)
    right_similarity = compute_similarity(right_residuals, right_probabilities, lmbda)

    # Gain = similarity of both children minus the similarity of the root.
    gain = left_similarity + right_similarity - similarity_score
    print(f"Split at {split}: Gain = {gain}")

    # Strict ">" means the first threshold wins when gains tie.
    if gain > best_gain:
        best_split, best_gain = split, gain
        best_left_indices, best_right_indices = left_indices, right_indices

print(f"Best split at X < {best_split} with gain: {best_gain}")

# Step 4: Output value of each child of the best split (both are leaves since
# max_depth = 1): sum of residuals / (sum of p * (1 - p) + lambda).
left_residuals = residuals[best_left_indices]
right_residuals = residuals[best_right_indices]

# Previous probabilities of the samples that landed in each leaf.
left_probabilities = previous_probability[best_left_indices]
right_probabilities = previous_probability[best_right_indices]

# lmbda goes in the denominator here just as it does in the similarity score;
# leaving it out would silently ignore the hyperparameter once lmbda != 0.
left_output = np.sum(left_residuals) / (np.sum(left_probabilities * (1 - left_probabilities)) + lmbda)
right_output = np.sum(right_residuals) / (np.sum(right_probabilities * (1 - right_probabilities)) + lmbda)

# Step 5: New log-odds = log(p / (1 - p)) of the previous probability
# + learning rate * leaf output (one value per sample in the leaf).
log_left_prediction = np.log(left_probabilities / (1 - left_probabilities)) + lr * left_output
log_right_prediction = np.log(right_probabilities / (1 - right_probabilities)) + lr * right_output

# Convert LogPrediction back to probabilities using the sigmoid function
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` may be a scalar or a NumPy array; arrays are handled element-wise.
    """
    exp_neg_x = np.exp(-x)
    return 1 / (1 + exp_neg_x)

# Turn each leaf's log-odds back into probabilities.
left_prediction, right_prediction = map(
    sigmoid, (log_left_prediction, log_right_prediction)
)

print(f"Prediction for left node (X < {best_split}): {left_prediction}")
print(f"Prediction for right node (X >= {best_split}): {right_prediction}")


Initial prediction (f0): 0.5
Similarity score of the root: 0.0
Split at 23.5: Gain = 1.3333333333333333
Split at 25: Gain = 4.0
Split at 26.5: Gain = 1.3333333333333333
Best split at X < 25 with gain: 4.0
Prediction for left node (X < 25): [0.35434369 0.35434369]
Prediction for right node (X >= 25): [0.64565631 0.64565631]


**Câu 3: XGBoost Regressor**

In [3]:
import numpy as np
import pandas as pd
import matplotlib . pyplot as plt
import xgboost as xgb
from sklearn . metrics import mean_squared_error , mean_absolute_error 
from sklearn . preprocessing import OrdinalEncoder
from sklearn . model_selection import train_test_split


In [4]:
# Read the regression dataset from a CSV file next to the notebook.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved row index read back as data; consider index_col=0 — confirm.
dataset_path = 'Problem3.csv'
data_df = pd.read_csv(dataset_path)
data_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.000000
1,7,4,oct,tue,4.517431,35.4,669.1,2.041220,18.0,33,0.9,False,0.000000
2,7,4,oct,sat,4.517431,43.7,686.9,2.041220,14.6,33,1.3,False,0.000000
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.000000
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,4,3,aug,sun,4.414010,56.7,665.6,1.064711,27.8,32,2.7,False,2.006871
506,2,4,aug,sun,4.414010,56.7,665.6,1.064711,21.9,71,5.8,False,4.012592
507,7,4,aug,sun,4.414010,56.7,665.6,1.064711,21.2,70,6.7,False,2.498152
508,1,4,aug,sat,4.558079,146.0,614.7,2.509599,25.6,42,4.0,False,0.000000


In [5]:
# Columns holding strings or booleans are treated as categorical.
categorical_cols = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()

for col_name in categorical_cols:
    n_categories = data_df[col_name].nunique()
    print(f'Number of categories in {col_name}: {n_categories}')

# Replace every category with a numeric code, one code per distinct value.
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoder.fit_transform(data_df[categorical_cols])

# Re-attach data_df's index to the encoded values. Without it the new frame
# gets a fresh 0..n-1 index, and pd.concat(axis=1), which aligns on index
# labels, would misalign rows whenever data_df's index is not 0..n-1.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_cols,
    columns=categorical_cols,
    index=data_df.index,
)

# Numeric columns first, encoded categorical columns appended after them.
numerical_df = data_df.drop(categorical_cols, axis=1)
encoded_df = pd.concat([numerical_df, encoded_categorical_df], axis=1)

Number of categories in month: 12
Number of categories in day: 7
Number of categories in rain: 2


In [6]:
# Features are every column except the target 'area'.
# NOTE(review): the 'Unnamed: 0' column shown in the loaded frame is not
# dropped, so it is used as a feature — confirm this is intended.
X = encoded_df.drop(columns=['area'])
y = encoded_df['area']

In [7]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [8]:
# Gradient-boosted regressor with the hyperparameters given by the exercise.
xg_reg = xgb.XGBRegressor(
    seed=7,
    learning_rate=0.01,
    n_estimators=102,
    max_depth=3,
)
xg_reg.fit(X_train, y_train)

In [9]:
# Predictions of the fitted regressor on the held-out rows.
preds = xg_reg.predict(X_test)

In [10]:
# Compare the held-out targets with the predictions.
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)

print('Evaluation results on test set:')
print(f'Mean Absolute Error : { mae}')
print(f'Mean Squared Error : { mse}')

Evaluation results on test set:
Mean Absolute Error : 1.1484401341167767
Mean Squared Error : 1.8845074196256495


**Câu 4: XGBoost Classifier**

In [11]:
import numpy as np
import pandas as pd
import matplotlib . pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [12]:
# Read the classification dataset from a CSV file next to the notebook.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks
# like a saved row index read back as data; consider index_col=0 — confirm.
dataset_path = 'Problem4.csv'
data_df = pd.read_csv(dataset_path)
data_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [13]:
# The last column is the label; every column before it is a feature.
# NOTE(review): that includes the 'Unnamed: 0' column. The rows displayed
# after loading appear ordered by Target (0 at the top, 2 at the bottom), so a
# row-number feature could leak the label — confirm and consider dropping it.
X = data_df.iloc[:, :-1]
y = data_df.iloc[:, -1]

In [14]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [15]:
# Gradient-boosted classifier; only the seed is set, the rest are defaults.
xg_class = xgb.XGBClassifier(seed=7)
xg_class.fit(X_train, y_train)

In [16]:
# Predicted class labels for the held-out rows.
preds = xg_class.predict(X_test)

In [17]:
# Accuracy on the training rows (re-predicted here) and on the held-out rows.
train_preds = xg_class.predict(X_train)
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, preds)

print(f'Train ACC: { train_acc }')
print(f'Test ACC: { test_acc }')

Train ACC: 1.0
Test ACC: 0.9629629629629629
