# Tải thư viện

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold  
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
pd.set_option('display.float_format', '{:.12f}'.format)

Tải dữ liệu

In [None]:
df = pd.read_csv('online_gaming_behavior_dataset.csv')

In [4]:
display(df)

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271118760553,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961380571,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755243500,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351277318,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531944521134,0,Medium,2,131,95,37,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,49029,32,Male,USA,Strategy,20.619662421375,0,Easy,4,75,85,14,Medium
40030,49030,44,Female,Other,Simulation,13.539280463946,0,Hard,19,114,71,27,High
40031,49031,15,Female,USA,RPG,0.240056881178,1,Easy,10,176,29,1,High
40032,49032,34,Male,USA,Sports,14.017817977715,1,Medium,3,128,70,10,Medium


Xóa cột không cần thiết 

In [5]:
df = df.drop(columns=["PlayerID","PlayTimeHours"])

Chuẩn hóa lại các cột và đánh giá mức độ ảnh hưởng của cột độ khó 

Trong đó Easy là 1, Medium là 4, Hard là 8, dựa trên trải nghiệm cá nhân : Easy = Trải Nghiệm trẻ con, Medium : Tạm tạm, Hard : Khó nhưng vẫn có thể vượt qua 

Sau này sẽ điều chỉnh dựa trên thể loại game của 2 mục AchievementsUnlocked và PlayerLevel (Dấn thân càng nhiều thì càng khó dứt)

In [6]:
df["InGamePurchases"] = df["InGamePurchases"].map({0:"False",1:"True"})

In [7]:
df["IsStressed"] = np.where(df["GameDifficulty"]=="Hard","True","False")
df["GameDifficultyQuantified"] = df["GameDifficulty"].map({"Easy":1,"Medium":4,"Hard":8})
df = df.drop(columns=["GameDifficulty"])

Đánh giá độ nghiện dựa trên tiêu chí nếu chơi quá 180 phút/ ngày <=> 1280 phút / tuần thì khả năng cao là nghiện 



Sử dụng framework optuna để tìm các bộ hyperparameter tốt nhất

In [36]:
ADDICTION_CUTOFF=1280
NUMBER_OF_FOLDS=10
OPTUNA_TRIALS = 20
RANDOM_STATE = 42
TARGET_VARIABLE = "EngagementLevel"
TEST_SIZE = 0.15

In [9]:
label_encoder = LabelEncoder()
df[TARGET_VARIABLE] = label_encoder.fit_transform(df[TARGET_VARIABLE])

Data :

In [10]:
display(df)
print(df.info())

Unnamed: 0,Age,Gender,Location,GameGenre,InGamePurchases,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,IsStressed,GameDifficultyQuantified
0,43,Male,Other,Strategy,False,6,108,79,25,2,False,4
1,29,Female,USA,Strategy,False,5,144,11,10,2,False,4
2,22,Female,USA,Sports,False,16,142,35,41,0,False,1
3,35,Male,USA,Action,True,9,85,57,47,2,False,1
4,33,Male,Europe,Action,False,2,131,95,37,2,False,4
...,...,...,...,...,...,...,...,...,...,...,...,...
40029,32,Male,USA,Strategy,False,4,75,85,14,2,False,1
40030,44,Female,Other,Simulation,False,19,114,71,27,0,True,8
40031,15,Female,USA,RPG,True,10,176,29,1,0,False,1
40032,34,Male,USA,Sports,True,3,128,70,10,2,False,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        40034 non-null  int64 
 1   Gender                     40034 non-null  object
 2   Location                   40034 non-null  object
 3   GameGenre                  40034 non-null  object
 4   InGamePurchases            40034 non-null  object
 5   SessionsPerWeek            40034 non-null  int64 
 6   AvgSessionDurationMinutes  40034 non-null  int64 
 7   PlayerLevel                40034 non-null  int64 
 8   AchievementsUnlocked       40034 non-null  int64 
 9   EngagementLevel            40034 non-null  int64 
 10  IsStressed                 40034 non-null  object
 11  GameDifficultyQuantified   40034 non-null  int64 
dtypes: int64(7), object(5)
memory usage: 3.7+ MB
None


In [11]:
y = df.pop(TARGET_VARIABLE)
X = df.copy()

Tạo 2 cột mới là số phút chơi/tuần và cột khả năng nghiện

In [20]:
X["AvgSessionDurationMinutesPerWeek"] = X["SessionsPerWeek"] * X["AvgSessionDurationMinutes"]
X["IsAddicted"] = np.where(X["AvgSessionDurationMinutesPerWeek"]>=ADDICTION_CUTOFF,"True","False")

Lấy danh sách header các cột số và tag

In [19]:
numerical_columns = [x for x in X.select_dtypes(include=['float64','int64']).columns]
categorical_columns = [x for x in X.columns if x not in numerical_columns]

In [18]:
def scale_numerical_columns(training_data, test_data):
    scaler = MinMaxScaler() 
    for column in numerical_columns:
        training_data[numerical_columns] = scaler.fit_transform(training_data[numerical_columns])
        test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])

    return training_data, test_data

In [15]:
display(numerical_columns)
display(categorical_columns)

['Age',
 'SessionsPerWeek',
 'AvgSessionDurationMinutes',
 'PlayerLevel',
 'AchievementsUnlocked',
 'GameDifficultyQuantified',
 'AvgSessionDurationMinutesPerWeek']

['Gender',
 'Location',
 'GameGenre',
 'InGamePurchases',
 'IsStressed',
 'IsAddicted']

Chuẩn hóa các cột tag về số

In [21]:
def encode_ordinal_fields(training_data, test_data):
    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    
    training_data[categorical_columns] = ordinal_encoder.fit_transform(training_data[categorical_columns].astype(str))
    test_data[categorical_columns] = ordinal_encoder.transform(test_data[categorical_columns].astype(str))
    
    return training_data, test_data

# Mô hình 

Sử dụng Xgboost (Top 1 server)

## 1. 10-Fold Cross Validation

In [22]:
skf = StratifiedKFold(n_splits=NUMBER_OF_FOLDS, shuffle=True, random_state=RANDOM_STATE)
k_fold_accuracies = []
k_fold_f1_scores = []

for fold_index, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"========================================================")
    print(f"Starting with fold: {fold_index+1}/{NUMBER_OF_FOLDS}")
    train_X, test_X = X.iloc[train_index].copy(), X.iloc[val_index] .copy()
    train_y, test_y = y[train_index].copy(), y[val_index].copy()

    train_X, test_X = scale_numerical_columns(train_X.copy(), test_X.copy())
    train_X, test_X = encode_ordinal_fields(train_X.copy(), test_X.copy())
    
    model = XGBClassifier(random_state=RANDOM_STATE)
    trained_model= model.fit(train_X, train_y)    
    
    predictions = trained_model.predict(test_X)
    score = accuracy_score(predictions, test_y)
    f1value = f1_score(predictions, test_y, average="weighted")
    
    print(f"Accuracy: {score}| F1 score: {f1value}")
    k_fold_accuracies.append(score)
    k_fold_f1_scores.append(f1value)

Starting with fold: 1/10
Accuracy: 0.9268231768231768| F1 score: 0.9271102572094231
Starting with fold: 2/10
Accuracy: 0.9328171828171828| F1 score: 0.9330653048869169
Starting with fold: 3/10
Accuracy: 0.9250749250749251| F1 score: 0.9253247901675551
Starting with fold: 4/10
Accuracy: 0.9280719280719281| F1 score: 0.9284045454831811
Starting with fold: 5/10
Accuracy: 0.9313015238571072| F1 score: 0.9314915785784934
Starting with fold: 6/10
Accuracy: 0.927554334249313| F1 score: 0.9277988653214907
Starting with fold: 7/10
Accuracy: 0.9280539595303522| F1 score: 0.928271948506242
Starting with fold: 8/10
Accuracy: 0.9210592055958031| F1 score: 0.9213260519924571
Starting with fold: 9/10
Accuracy: 0.9233075193604796| F1 score: 0.9236265419488666
Starting with fold: 10/10
Accuracy: 0.926305271046715| F1 score: 0.9264641071214493


In [27]:
print(f"Mean Accuracy: {np.mean(k_fold_accuracies):.16f}")
print(f"Mean F1-Score: {np.mean(k_fold_f1_scores):.16f}")
print(f"Accuracy Standard Deviation : {np.std(k_fold_accuracies):.16f}")
print(f"F1-Score Standard Deviation : {np.std(k_fold_f1_scores):.16f}")

Mean Accuracy: 0.9270369026426983
Mean F1-Score: 0.9272883991216074
Accuracy Standard Deviation : 0.0032869975960152
F1-Score Standard Deviation : 0.0032716787557861


## 2. Optuna

In [32]:
train_X, test_X, train_y, test_y = train_test_split(X, y,test_size = TEST_SIZE, random_state =RANDOM_STATE, stratify=y)
train_X, test_X = scale_numerical_columns(train_X.copy(), test_X.copy())
train_X, test_X = encode_ordinal_fields(train_X.copy(), test_X.copy())

model = XGBClassifier(random_state=RANDOM_STATE)
trained_model= model.fit(train_X, train_y,eval_set=[(test_X,test_y)],verbose=False)

predictions = trained_model.predict(test_X)
score = accuracy_score(predictions, test_y)
f1value = f1_score(predictions, test_y, average="weighted")
print(f"Accuracy: {score}| F1 score: {f1value}")

Accuracy: 0.9277389277389277| F1 score: 0.9279345989808685


In [37]:
def objective(trial):
    param = {
        'device':'cuda',
        'lambda': trial.suggest_float('lambda', 0.001, 10.0,log=True),
        'alpha': trial.suggest_float('alpha', 0.001, 10.0,log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.001, 1.0,log=True),
        'subsample': trial.suggest_float('subsample',0.001,1.0,log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0,log=True),
        'n_estimators': trial.suggest_categorical('n_estimators', [32,64,128,256,512,768,1024,1536,2048,3096,4096,5120]),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]),
        "enable_categorical": trial.suggest_categorical("enable_categorical", [True]),
        "max_depth" : trial.suggest_int("max_depth", 4, 32),
        "eta" : trial.suggest_float("eta", 1e-8, 1.0,log=True),
        "gamma" : trial.suggest_float("gamma", 1e-8, 1.0,log=True)        
    }    

    model = XGBClassifier(**param,early_stopping_rounds=50)  
    
    model.fit(train_X,train_y,eval_set=[(test_X,test_y)],verbose=False)
    
    preds = model.predict(test_X)
    
    acc = accuracy_score(test_y, preds)
    
    return acc

OPTUNAL_TRIALS để tạm 20 do lap ghẻ yếu quá, bao giờ test cuối thì tăng

In [38]:
study = optuna.create_study(direction='maximize', study_name="first")
study.optimize(objective, n_trials=OPTUNA_TRIALS)

  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.num_boosted_rounds()
  self.starting_round = model.n

In [37]:
print(f'Best trial score: {study.best_trial.value}| params: {study.best_trial.params}')

Best trial score: 0.9305694305694305| params: {'lambda': 1.201519861350796, 'alpha': 0.016553066292075185, 'colsample_bytree': 0.9916352063838033, 'subsample': 0.4721651857418294, 'learning_rate': 0.06651202049988039, 'n_estimators': 5120, 'random_state': 42, 'min_child_weight': 5, 'enable_categorical': True, 'max_depth': 10, 'eta': 0.006333152904101868, 'gamma': 0.0002676778449051496}


In [38]:
study.trials_dataframe().nlargest(20,"value").reset_index(drop=True).head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_enable_categorical,params_eta,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_random_state,params_subsample,state
0,307,0.930569430569,2025-11-15 14:42:25.330124,2025-11-15 14:42:27.113777,0 days 00:00:01.783653,0.016553066292,0.991635206384,True,0.006333152904,0.000267677845,1.201519861351,0.0665120205,10,5,5120,42,0.472165185742,COMPLETE
1,306,0.930236430236,2025-11-15 14:42:22.485296,2025-11-15 14:42:25.330059,0 days 00:00:02.844763,0.01608677245,0.880507614152,True,0.004370037148,3.0832961e-05,1.423523476759,0.044849222899,11,5,5120,42,0.474328017887,COMPLETE
2,326,0.930236430236,2025-11-15 14:43:14.843806,2025-11-15 14:43:16.682668,0 days 00:00:01.838862,0.023236274295,0.6997665415,True,0.008200901406,0.000581150703,1.801823623988,0.061923558645,8,5,5120,42,0.496576937123,COMPLETE
3,152,0.93006993007,2025-11-15 14:21:18.796133,2025-11-15 14:21:27.310240,0 days 00:00:08.514107,0.106667546626,0.996218006513,True,0.000522862048,8.0656e-08,0.321862404729,0.010071661205,12,8,4096,42,0.505778549507,COMPLETE
4,291,0.93006993007,2025-11-15 14:41:14.880677,2025-11-15 14:41:17.105736,0 days 00:00:02.225059,0.006063722195,0.888319595211,True,0.000882325153,6.3986185e-05,0.938675983273,0.047416128664,10,6,4096,42,0.35602739958,COMPLETE


In [39]:
params = study.best_trial.params

model = XGBClassifier(**study.best_trial.params,early_stopping_rounds=50)
trained_model= model.fit(train_X, train_y,eval_set=[(test_X,test_y)],verbose=False)    

predictions = trained_model.predict(test_X)
score = accuracy_score(predictions, test_y)
f1value = f1_score(predictions, test_y, average="weighted")
print(f"Accuracy: {score}| F1 score: {f1value}")

Accuracy: 0.9269064269064269| F1 score: 0.9270818767236308
