In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the training data from the CSV file.
data = pd.read_csv(filepath_or_buffer="trainingdata02.csv")

# Show the first five rows.
data.head(5)

Unnamed: 0,battle_id,turn,total_turn,rank,weather,field,condition,p1_side,p1a_form,p1a_hp,...,p2c_status,p2c_tera,p2d_form,p2d_hp,p2d_ability,p2d_item,p2d_move,p2d_status,p2d_tera,win
0,2099996083,0,5,1643.5,,Psychic Terrain:5,,,Smeargle,100,...,,unknown,unknown,100,unknown,unknown,unknown,,unknown,-1
1,2099996083,1,5,1643.5,,Psychic Terrain:4,Trick Room:4,,Smeargle,1,...,,unknown,unknown,100,unknown,unknown,unknown,,unknown,-1
2,2099996083,2,5,1643.5,SunnyDay:5,Psychic Terrain:3,Trick Room:3,,Torkoal,100,...,,unknown,unknown,100,unknown,unknown,unknown,,unknown,-1
3,2099996083,3,5,1643.5,SunnyDay:4,Psychic Terrain:2,Trick Room:2,,Torkoal,100,...,fnt,unknown,unknown,100,unknown,unknown,unknown,,unknown,-1
4,2099996083,4,5,1643.5,SunnyDay:3,Psychic Terrain:1,Trick Room:1,,Torkoal,55,...,fnt,unknown,Urshifu,0,unknown,unknown,"Wicked Blow:3,Detect:4",fnt,unknown,-1


In [15]:
# Columns needed for the team-composition model: the battle id, the four
# form slots of each player, and the outcome label.
columns_to_keep = [
    'battle_id', 'p1a_form', 'p1b_form', 'p1c_form', 'p1d_form',
    'p2a_form', 'p2b_form', 'p2c_form', 'p2d_form', 'win'
]

# Drop rows where 'win' is 0 and keep only the columns listed above.
# The explicit .copy() makes filtered_data an independent frame, so the
# column assignments in the encoding loop below do not operate on a slice
# of `data` (this is what triggers pandas' SettingWithCopyWarning).
filtered_data = data.loc[data['win'] != 0, columns_to_keep].copy()

# Collect every distinct name that appears in any of the form columns.
columns_to_encode = [
    'p1a_form', 'p1b_form', 'p1c_form', 'p1d_form',
    'p2a_form', 'p2b_form', 'p2c_form', 'p2d_form'
]
unique_names = pd.unique(filtered_data[columns_to_encode].values.ravel('K'))

# Assign an integer code to each name, starting at 1
# (0 is used further down as the padding value).
name_to_number = {name: idx for idx, name in enumerate(unique_names, start=1)}

# Encode the form columns using the dictionary.
for col in columns_to_encode:
    filtered_data[col] = filtered_data[col].map(name_to_number)

# Per-battle accumulators.
battle_ids = []
p1_forms = []
p2_forms = []
wins = []

# Aggregate by battle: concatenate the four form codes of every row of the
# battle (row by row) for each player, and take the label from the first row.
for battle_id, group in filtered_data.groupby('battle_id'):
    battle_ids.append(battle_id)
    p1_forms.append(group[['p1a_form', 'p1b_form', 'p1c_form', 'p1d_form']].values.flatten().tolist())
    p2_forms.append(group[['p2a_form', 'p2b_form', 'p2c_form', 'p2d_form']].values.flatten().tolist())
    wins.append(group['win'].iloc[0])

# One row per battle: id, the two code lists, and the label.
aggregated_data = pd.DataFrame({
    'battle_id': battle_ids,
    'p1_forms': p1_forms,
    'p2_forms': p2_forms,
    'win': wins
})

def remove_duplicates(forms_list):
    """Return the items of `forms_list` without repeats, in first-seen order."""
    seen = set()
    ordered = []
    for code in forms_list:
        if code not in seen:
            seen.add(code)
            ordered.append(code)
    return ordered

# Collapse repeated form codes within each battle, keeping first-seen order.
for side_col in ('p1_forms', 'p2_forms'):
    aggregated_data[side_col] = aggregated_data[side_col].apply(remove_duplicates)

# Length of the longest de-duplicated list on either side; the lists are
# zero-padded to this common length below.
p1_longest = aggregated_data['p1_forms'].apply(len).max()
p2_longest = aggregated_data['p2_forms'].apply(len).max()
max_length = max(p1_longest, p2_longest)

def pad_list(forms_list, length):
    """Return a new list: `forms_list` followed by zeros up to `length` items.

    A list that already has `length` or more items comes back unchanged
    (as a new list object).
    """
    missing = length - len(forms_list)
    return forms_list + [0 for _ in range(missing)]

# Zero-pad both sides' code lists to the common length.
for side_col in ('p1_forms', 'p2_forms'):
    aggregated_data[side_col] = aggregated_data[side_col].apply(pad_list, args=(max_length,))

# Feature matrix: player 1's padded codes followed by player 2's padded codes.
p1_matrix = pd.DataFrame(aggregated_data['p1_forms'].tolist())
p2_matrix = pd.DataFrame(aggregated_data['p2_forms'].tolist())
X = pd.concat([p1_matrix, p2_matrix], axis=1)

# Target: the per-battle win label.
y = aggregated_data['win']


In [16]:
# Preview the per-battle table: battle_id, padded p1/p2 form-code lists, and the win label.
aggregated_data

Unnamed: 0,battle_id,p1_forms,p2_forms,win
0,2099996083,"[1, 13, 483, 2, 0, 0, 0, 0, 0]","[10, 18, 483, 2, 13, 0, 0, 0, 0]",-1
1,2099997604,"[3, 41, 483, 2, 13, 0, 0, 0, 0]","[10, 18, 483, 5, 19, 0, 0, 0, 0]",-1
2,2100002072,"[4, 18, 483, 5, 19, 0, 0, 0, 0]","[47, 52, 483, 2, 13, 0, 0, 0, 0]",1
3,2100002639,"[6, 17, 483, 140, 7, 0, 0, 0, 0]","[19, 5, 483, 10, 22, 0, 0, 0, 0]",1
4,2100002744,"[8, 13, 483, 9, 18, 0, 0, 0, 0]","[13, 41, 483, 18, 0, 0, 0, 0, 0]",-1
...,...,...,...,...
14468,2127997699,"[122, 2, 483, 0, 0, 0, 0, 0, 0]","[70, 79, 483, 28, 0, 0, 0, 0, 0]",1
14469,2127997839,"[37, 61, 483, 141, 10, 0, 0, 0, 0]","[65, 13, 483, 26, 59, 0, 0, 0, 0]",-1
14470,2127998319,"[62, 17, 483, 28, 4, 0, 0, 0, 0]","[17, 75, 483, 231, 121, 0, 0, 0, 0]",1
14471,2127998356,"[27, 75, 483, 0, 0, 0, 0, 0, 0]","[94, 10, 483, 146, 102, 0, 0, 0, 0]",1


In [17]:
# Preview the feature matrix: padded p1 form codes followed by padded p2 form codes.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1
0,1,13,483,2,0,0,0,0,0,10,18,483,2,13,0,0,0,0
1,3,41,483,2,13,0,0,0,0,10,18,483,5,19,0,0,0,0
2,4,18,483,5,19,0,0,0,0,47,52,483,2,13,0,0,0,0
3,6,17,483,140,7,0,0,0,0,19,5,483,10,22,0,0,0,0
4,8,13,483,9,18,0,0,0,0,13,41,483,18,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,122,2,483,0,0,0,0,0,0,70,79,483,28,0,0,0,0,0
14469,37,61,483,141,10,0,0,0,0,65,13,483,26,59,0,0,0,0
14470,62,17,483,28,4,0,0,0,0,17,75,483,231,121,0,0,0,0
14471,27,75,483,0,0,0,0,0,0,94,10,483,146,102,0,0,0,0


In [18]:
# Hold out 20% of the battles for evaluation, stratified on the label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [19]:
# Preview the training portion of the feature matrix.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1
11535,4,75,483,62,77,0,0,0,0,123,50,483,65,40,0,0,0,0
13443,8,55,483,33,96,0,0,0,0,37,4,483,27,10,0,0,0,0
6065,55,12,483,13,46,0,0,0,0,278,148,483,2,76,0,0,0,0
13041,105,171,483,42,84,0,0,0,0,33,27,483,55,28,0,0,0,0
8215,27,4,483,33,0,0,0,0,0,47,20,483,203,46,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12314,37,52,483,28,349,0,0,0,0,79,55,483,32,0,0,0,0,0
1329,13,154,483,120,102,0,0,0,0,75,100,483,27,0,0,0,0,0
11109,409,47,483,8,17,0,0,0,0,26,28,483,33,61,0,0,0,0
685,136,16,483,28,12,0,0,0,0,11,28,483,26,102,0,0,0,0


In [20]:
# Random forest with 500 trees, depth capped at 100, and a fixed seed.
rf_classifier = RandomForestClassifier(
    n_estimators=500,
    max_depth=100,
    random_state=100,
)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Emit the accuracy line, the heading, and the per-class report.
print(
    f"Accuracy: {accuracy}",
    "Classification Report for RandomForestClassifier:",
    report,
    sep="\n",
)

Accuracy: 0.6117443868739205
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

          -1       0.61      0.59      0.60      1430
           1       0.61      0.63      0.62      1465

    accuracy                           0.61      2895
   macro avg       0.61      0.61      0.61      2895
weighted avg       0.61      0.61      0.61      2895



In [21]:
# Extra-Trees model with the same size settings as the random forest.
# random_state is fixed (same seed as the RandomForestClassifier cell) so the
# reported metrics are reproducible across runs; without it every execution
# draws different random splits.
et = ExtraTreesClassifier(n_estimators=500, max_depth=100, random_state=100)

et.fit(X_train, y_train)
y_pred = et.predict(X_test)

# Evaluate the classifier on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Classification Report for ExtraTreesClassifier:\n{report}')

Accuracy: 0.6107081174438688
Classification Report for ExtraTreesClassifier:
              precision    recall  f1-score   support

          -1       0.61      0.60      0.60      1430
           1       0.61      0.62      0.62      1465

    accuracy                           0.61      2895
   macro avg       0.61      0.61      0.61      2895
weighted avg       0.61      0.61      0.61      2895



# Total turn effect

In [22]:
# Columns needed for this experiment: the battle id, the battle's total turn
# count, the four form slots of each player, and the outcome label.
columns_to_keep = [
    'battle_id', 'total_turn', 'p1a_form', 'p1b_form', 'p1c_form', 'p1d_form',
    'p2a_form', 'p2b_form', 'p2c_form', 'p2d_form', 'win'
]

# Drop rows where 'win' is 0 and keep only the columns listed above.
# The explicit .copy() makes filtered_data an independent frame, so the
# column assignments in the encoding loop below do not operate on a slice
# of `data` (this is what triggers pandas' SettingWithCopyWarning).
filtered_data = data.loc[data['win'] != 0, columns_to_keep].copy()

# Collect every distinct name that appears in any of the form columns.
columns_to_encode = [
    'p1a_form', 'p1b_form', 'p1c_form', 'p1d_form',
    'p2a_form', 'p2b_form', 'p2c_form', 'p2d_form'
]
unique_names = pd.unique(filtered_data[columns_to_encode].values.ravel('K'))

# Assign an integer code to each name, starting at 1
# (0 is used further down as the padding value).
name_to_number = {name: idx for idx, name in enumerate(unique_names, start=1)}

# Encode the form columns using the dictionary.
for col in columns_to_encode:
    filtered_data[col] = filtered_data[col].map(name_to_number)

# Per-battle accumulators.
battle_ids = []
p1_forms = []
p2_forms = []
wins = []
total_turns = []

# Aggregate by battle: collect the total_turn value of every row, concatenate
# the four form codes of every row (row by row) for each player, and take the
# label from the first row.
for battle_id, group in filtered_data.groupby('battle_id'):
    battle_ids.append(battle_id)
    total_turns.append(group[['total_turn']].values.flatten().tolist())
    p1_forms.append(group[['p1a_form', 'p1b_form', 'p1c_form', 'p1d_form']].values.flatten().tolist())
    p2_forms.append(group[['p2a_form', 'p2b_form', 'p2c_form', 'p2d_form']].values.flatten().tolist())
    wins.append(group['win'].iloc[0])

# One row per battle: id, the two code lists, the label, and the turn list.
aggregated_data = pd.DataFrame({
    'battle_id': battle_ids,
    'p1_forms': p1_forms,
    'p2_forms': p2_forms,
    'win': wins,
    'total_turn': total_turns
})

# Note: this repeats the remove_duplicates definition from the earlier
# preprocessing cell.
def remove_duplicates(forms_list):
    """Return the items of `forms_list` without repeats, in first-seen order."""
    seen = set()
    ordered = []
    for code in forms_list:
        if code not in seen:
            seen.add(code)
            ordered.append(code)
    return ordered

# Collapse repeated form codes within each battle, keeping first-seen order.
for side_col in ('p1_forms', 'p2_forms'):
    aggregated_data[side_col] = aggregated_data[side_col].apply(remove_duplicates)

# Length of the longest de-duplicated list on either side; the lists are
# zero-padded to this common length below.
p1_longest = aggregated_data['p1_forms'].apply(len).max()
p2_longest = aggregated_data['p2_forms'].apply(len).max()
max_length = max(p1_longest, p2_longest)

def pad_list(forms_list, length):
    """Return a new list: `forms_list` followed by zeros up to `length` items.

    A list that already has `length` or more items comes back unchanged
    (as a new list object).
    """
    missing = length - len(forms_list)
    return forms_list + [0 for _ in range(missing)]

# Zero-pad both sides' code lists to the common length.
for side_col in ('p1_forms', 'p2_forms'):
    aggregated_data[side_col] = aggregated_data[side_col].apply(pad_list, args=(max_length,))

# Feature matrix: player 1's padded codes followed by player 2's padded codes.
p1_matrix = pd.DataFrame(aggregated_data['p1_forms'].tolist())
p2_matrix = pd.DataFrame(aggregated_data['p2_forms'].tolist())
X = pd.concat([p1_matrix, p2_matrix], axis=1)

# Target: the per-battle win label.
y = aggregated_data['win']

In [23]:
# Each entry is the list of total_turn values collected from every row of that battle.
aggregated_data['total_turn']

0                       [5, 5, 5, 5, 5]
1                 [6, 6, 6, 6, 6, 6, 6]
2                          [3, 3, 3, 3]
3        [9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
4                       [4, 4, 4, 4, 4]
                      ...              
14468                         [3, 3, 3]
14469             [6, 6, 6, 6, 6, 6, 6]
14470                [5, 5, 5, 5, 5, 5]
14471                      [3, 3, 3, 3]
14472                         [3, 3, 3]
Name: total_turn, Length: 14473, dtype: object

In [24]:
import numpy as np

In [26]:
def get_unique(arr):
    """Return the sorted distinct values of `arr` as a NumPy array."""
    distinct_values = np.unique(arr)
    return distinct_values

# Reduce each battle's list of total_turn values to its distinct values.
turn_lists = aggregated_data['total_turn']
aggregated_data['total_turn'] = turn_lists.apply(get_unique)

In [27]:
# Preview the per-battle table after total_turn was reduced to its unique values.
aggregated_data

Unnamed: 0,battle_id,p1_forms,p2_forms,win,total_turn
0,2099996083,"[1, 13, 483, 2, 0, 0, 0, 0, 0]","[10, 18, 483, 2, 13, 0, 0, 0, 0]",-1,[5]
1,2099997604,"[3, 41, 483, 2, 13, 0, 0, 0, 0]","[10, 18, 483, 5, 19, 0, 0, 0, 0]",-1,[6]
2,2100002072,"[4, 18, 483, 5, 19, 0, 0, 0, 0]","[47, 52, 483, 2, 13, 0, 0, 0, 0]",1,[3]
3,2100002639,"[6, 17, 483, 140, 7, 0, 0, 0, 0]","[19, 5, 483, 10, 22, 0, 0, 0, 0]",1,[9]
4,2100002744,"[8, 13, 483, 9, 18, 0, 0, 0, 0]","[13, 41, 483, 18, 0, 0, 0, 0, 0]",-1,[4]
...,...,...,...,...,...
14468,2127997699,"[122, 2, 483, 0, 0, 0, 0, 0, 0]","[70, 79, 483, 28, 0, 0, 0, 0, 0]",1,[3]
14469,2127997839,"[37, 61, 483, 141, 10, 0, 0, 0, 0]","[65, 13, 483, 26, 59, 0, 0, 0, 0]",-1,[6]
14470,2127998319,"[62, 17, 483, 28, 4, 0, 0, 0, 0]","[17, 75, 483, 231, 121, 0, 0, 0, 0]",1,[5]
14471,2127998356,"[27, 75, 483, 0, 0, 0, 0, 0, 0]","[94, 10, 483, 146, 102, 0, 0, 0, 0]",1,[3]


In [28]:
# Per-side matrices of padded form codes, plus the per-battle turn values.
p1_matrix = pd.DataFrame(aggregated_data['p1_forms'].tolist())
p2_matrix = pd.DataFrame(aggregated_data['p2_forms'].tolist())
turn_matrix = pd.DataFrame(aggregated_data['total_turn'].tolist())

# X shifts the turn values by len(unique_names) + 1, which places turn values
# of 0 or more above the largest form code; Z keeps the raw turn values.
turn_offset = len(unique_names) + 1
X = pd.concat([p1_matrix, p2_matrix, turn_matrix + turn_offset], axis=1)
Z = pd.concat([p1_matrix, p2_matrix, turn_matrix], axis=1)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,0.2
0,1,13,483,2,0,0,0,0,0,10,18,483,2,13,0,0,0,0,520
1,3,41,483,2,13,0,0,0,0,10,18,483,5,19,0,0,0,0,521
2,4,18,483,5,19,0,0,0,0,47,52,483,2,13,0,0,0,0,518
3,6,17,483,140,7,0,0,0,0,19,5,483,10,22,0,0,0,0,524
4,8,13,483,9,18,0,0,0,0,13,41,483,18,0,0,0,0,0,519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14468,122,2,483,0,0,0,0,0,0,70,79,483,28,0,0,0,0,0,518
14469,37,61,483,141,10,0,0,0,0,65,13,483,26,59,0,0,0,0,521
14470,62,17,483,28,4,0,0,0,0,17,75,483,231,121,0,0,0,0,520
14471,27,75,483,0,0,0,0,0,0,94,10,483,146,102,0,0,0,0,518


In [29]:
# Hold out 20% of the battles for evaluation, stratified on the label,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [30]:
# Random forest with 500 trees, depth capped at 200, and a fixed seed.
rf_classifier = RandomForestClassifier(
    n_estimators=500,
    max_depth=200,
    random_state=100,
)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Emit the accuracy line, the heading, and the per-class report.
print(
    f"Accuracy: {accuracy}",
    "Classification Report:",
    report,
    sep="\n",
)


Accuracy: 0.616580310880829
Classification Report:
              precision    recall  f1-score   support

          -1       0.62      0.60      0.61      1430
           1       0.62      0.64      0.63      1465

    accuracy                           0.62      2895
   macro avg       0.62      0.62      0.62      2895
weighted avg       0.62      0.62      0.62      2895



In [31]:
# Extra-Trees model with the same size settings as the random forest.
# random_state is fixed (same seed as the RandomForestClassifier cell) so the
# reported metrics are reproducible across runs; without it every execution
# draws different random splits.
et = ExtraTreesClassifier(n_estimators=500, max_depth=200, random_state=100)

et.fit(X_train, y_train)
y_pred = et.predict(X_test)

# Evaluate the classifier on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.6172711571675302
Classification Report:
              precision    recall  f1-score   support

          -1       0.62      0.59      0.60      1430
           1       0.62      0.65      0.63      1465

    accuracy                           0.62      2895
   macro avg       0.62      0.62      0.62      2895
weighted avg       0.62      0.62      0.62      2895

