In [127]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metric
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
import random

In [128]:
clean_data_file_name = r'..\preprocessing\clean_data\status_per_min_01.csv'

In [129]:
df = pd.read_csv(clean_data_file_name)
new_df = df.copy()

In [130]:
win_rate_columns = df.columns[:20]
gold_columns = df.columns[471:512]
new_df[win_rate_columns] = StandardScaler().fit_transform(df[win_rate_columns])
new_df[gold_columns] = StandardScaler().fit_transform(df[gold_columns])
new_df = pd.DataFrame(new_df, columns=df.columns)

In [131]:
level_columns_drop = df.columns[20:50]
gold_columns_drop = df.columns[471:477]
turrets_columns_drop = df.columns[225:273]
monstermonster_columns_drop = df.columns[512:562]
columns_to_drop = list(level_columns_drop) + list(gold_columns_drop) + list(turrets_columns_drop) + list(monstermonster_columns_drop)
new_df = new_df.drop(columns=columns_to_drop)
new_df['label'] = new_df['label'].replace(-1, 0)

In [132]:
columns_to_drop_01 = [col for col in df.columns if col[-2:] in {'36', '37', '38', '39', '40'}]
new_df = new_df.drop(columns=columns_to_drop_01)

In [133]:
old_df, new_df = train_test_split(new_df, test_size=0.3, random_state=1)

In [134]:
old_X = old_df.drop(columns=['label'])
old_Y = old_df['label']
new_X = new_df.drop(columns=['label'])
new_Y = new_df['label']

In [135]:
old_X_training, old_X_test, old_Y_training, old_Y_test = train_test_split(old_X, old_Y, test_size=0.2, random_state=0)

In [136]:
XG_boost = xgb.XGBClassifier(n_estimators = 50, max_depth = 11, learning_rate = 0.3).fit(old_X_training, old_Y_training)

In [137]:
# param_grid = {
#     'n_estimators': [30, 40, 50],
#     'learning_rate': [0.2, 0.3, 0.4],
#     'max_depth': [7, 9, 11],
# }

# grid_search = GridSearchCV(
#     estimator=xgb.XGBClassifier(random_state=42),
#     param_grid=param_grid,
#     cv=3,
#     scoring='accuracy',
#     verbose=1
# ).fit(old_X_training, old_Y_training)
# XG_boost = grid_search.best_estimator_

In [138]:
# grid_search.best_params_

### Performance trên old_df

In [139]:
# Accuracy
print('Train')
print(metric.accuracy_score(old_Y_training, XG_boost.predict(old_X_training)))
print('Test')
print(metric.accuracy_score(old_Y_test, XG_boost.predict(old_X_test)))

Train
1.0
Test
0.9537898936170213


In [140]:
# F1 score
print('Train')
print(metric.f1_score(old_Y_training, XG_boost.predict(old_X_training), pos_label=1))
print('Test')
print(metric.f1_score(old_Y_test, XG_boost.predict(old_X_test), pos_label=1))

Train
1.0
Test
0.9535893155258764


In [141]:
# Precision Score
print('Train')
print(metric.precision_score(old_Y_training, XG_boost.predict(old_X_training), pos_label=1))
print('Test')
print(metric.precision_score(old_Y_test, XG_boost.predict(old_X_test), pos_label=1))

Train
1.0
Test
0.9583892617449664


In [142]:
random_matches = new_df.copy()
num_matches = new_df.shape[0]

random_minutes = [random.randint(0, 36) for _ in range(num_matches)]
random_matches['minute'] = random_minutes

def get_data_until_minute(row):
    minute = row['minute']
    selected_columns = list(random_matches.columns[:20])
    for col in new_df.columns:
        last_part = col.split('_')[-1]  # Lấy phần cuối của tên cột
        if last_part.isdigit() and int(last_part) <= minute:  # Kiểm tra và so sánh
            selected_columns.append(col)
    selected_columns.append('label')
    return row[selected_columns]

filtered_data = random_matches.apply(get_data_until_minute, axis=1)
filtered_data = filtered_data.reindex(columns=random_matches.columns)

X_real_time = filtered_data.drop(columns=["label", "minute"])
Y_real_time = filtered_data["label"]

X_real_time = X_real_time.fillna(0)  
Y_real_time = Y_real_time.fillna(0) 

In [143]:
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

# Thực hiện cross-validation
results = cross_validate(
    estimator=XG_boost, 
    X=X_real_time, 
    y=Y_real_time, 
    scoring=scoring, 
    cv=10  # Số lượng folds, có thể thay đổi
)

In [144]:
print('Test Accuracy:', results['test_accuracy'].mean())

print('Test Precision:', results['test_precision'].mean())

print('Test Recall:', results['test_recall'].mean())

print('Test F1 Score:', results['test_f1'].mean())

Test Accuracy: 0.7337466897780345
Test Precision: 0.7348483525537135
Test Recall: 0.7192696863281948
Test F1 Score: 0.7267934888146247


In [145]:
Y_predict = XG_boost.predict(X_real_time)
predicted_proba = XG_boost.predict_proba(X_real_time)
random_matches['per_red_win'] = predicted_proba[:, 0].round(2) 
random_matches['per_blue_win'] = predicted_proba[:, 1].round(2) 
random_matches['predicted_label'] = Y_predict

In [146]:
result_predict = random_matches[['per_blue_win', 'per_red_win', 'minute', 'predicted_label', 'label']]

In [148]:
result_predict[(result_predict['minute'] < 10) & (result_predict['predicted_label'] != result_predict['label'])].shape[0]

833

In [149]:
result_predict[(result_predict['minute'] < 20) & (result_predict['predicted_label'] != result_predict['label'])].shape[0]

1717

In [150]:
result_predict[(result_predict['minute'] < 30) & (result_predict['predicted_label'] != result_predict['label'])].shape[0]

2160

In [151]:
result_predict[(result_predict['minute'] < 36) & (result_predict['predicted_label'] != result_predict['label'])].shape[0]

2223

In [152]:
result_predict.to_csv(r"D:\Learning\ML\BTL\analyzing\result.csv")