In [55]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metric

In [56]:
clean_data_file_name = r'..\preprocessing\clean_data\status_per_min_01.csv'

In [57]:
df = pd.read_csv(clean_data_file_name)

In [58]:
X = df.drop(columns=["label"])
Y = df["label"]
X_training, X_test, Y_training, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

In [59]:
lg_regression = LogisticRegression(solver='saga').fit(X_training, Y_training)



In [60]:
# Accuracy
print('Train')
print(metric.accuracy_score(Y_training, lg_regression.predict(X_training)))
print('Test')
print(metric.accuracy_score(Y_test, lg_regression.predict(X_test)))

Train
0.9448729884293124
Test
0.9335919317300233


In [61]:
# F1 score
print('Train')
print(metric.f1_score(Y_training, lg_regression.predict(X_training), pos_label=1))
print('Test')
print(metric.f1_score(Y_test, lg_regression.predict(X_test), pos_label=1))

Train
0.9451900826446281
Test
0.9330413016270338


In [62]:
# Precision Score
print('Train')
print(metric.precision_score(Y_training, lg_regression.predict(X_training), pos_label=1))
print('Test')
print(metric.precision_score(Y_test, lg_regression.predict(X_test), pos_label=1))

Train
0.9442536327608982
Test
0.9266625233064015


In [66]:
import random

num_matches = random.randint(0, 1000)
random_matches = df.sample(n=num_matches, random_state=1)

random_minutes = [random.randint(0, 41) for _ in range(num_matches)]
random_matches['minute'] = random_minutes

def get_data_until_minute(row):
    minute = row['minute']
    selected_columns = list(random_matches.columns[:20])
    for col in df.columns:
        last_part = col.split('_')[-1]  # Lấy phần cuối của tên cột
        if last_part.isdigit() and int(last_part) <= minute:  # Kiểm tra và so sánh
            selected_columns.append(col)
    selected_columns.append('label')
    return row[selected_columns]

filtered_data = random_matches.apply(get_data_until_minute, axis=1)
filtered_data = filtered_data.reindex(columns=random_matches.columns)

X_real_time = filtered_data.drop(columns=["label", "minute"])
Y_real_time = filtered_data["label"]

X_real_time = X_real_time.fillna(0)  
Y_real_time = Y_real_time.fillna(0) 

print('Test Accuracy:', metric.accuracy_score(Y_real_time, lg_regression.predict(X_real_time)))

print('Test F1 Score:', metric.f1_score(Y_real_time, lg_regression.predict(X_real_time), pos_label=1))

print('Test Precision:', metric.precision_score(Y_real_time, lg_regression.predict(X_real_time), pos_label=1))

Test Accuracy: 0.7162393162393162
Test F1 Score: 0.6770428015564203
Test Precision: 0.7131147540983607
