# Modelling 1

### Library and data imports

In [26]:
from importlib import reload
import pandas as pd
import numpy as np
from spektral.data import Dataset, Graph, BatchLoader
import dataset
reload(dataset)
import pickle
import os

from spektral.layers import GATConv, GCNConv, GlobalAvgPool, GlobalMaxPool, GlobalSumPool, GlobalAttentionPool
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import binary_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import keras
import random

In [10]:
# Combined match table; `df` is a working copy so `df_raw` is left untouched
df_raw = pd.read_csv('../data/combined.csv')
df = df_raw.copy()

# Hero feature table, indexed by hero_id
df_features = pd.read_csv('../data/features.csv').set_index('hero_id')

# Filter columns (filt_std, filt_mmr_*, filt_duration_*) used to subset the matches
df_filters = pd.read_csv('../models/filters.csv')

In [None]:
# Load the pickled graph dataset 50000 matches at a time and concatenate the chunks
graph_dir = '../data/graphs_v1_scaled/'  # not named `dir`, which would shadow the builtin
total = len(df)
step = 50000

graphs = None
for start in range(0, total, step):
    # Chunk files are named by their inclusive match range; the last chunk may be shorter
    end = min(start + step, total) - 1
    path = graph_dir + f'graphs_v1_scaled_{start}-{end}.pkl'
    print(path)
    # NOTE: pickle.load can execute arbitrary code - only load chunk files you produced yourself
    with open(path, 'rb') as chunk_file:  # closes the handle as soon as the chunk is read
        chunk = pickle.load(chunk_file)
    graphs = chunk if graphs is None else graphs + chunk

In [None]:
# DO NOT RUN IF DOING MODEL 1.7 FEATURE SELECTION (it will remove all desired features itself)
# NOTE(review): the Model 1.7 cell says its column numbers assume attack_backswing was already
# removed - confirm which of the two instructions is current.
# Drop attack_backswing (column 14) from every graph's feature matrix, keeping columns 0-13 and
# 15-19. The graphs are modified in place, so this cell must not be run twice on the same data.
keep_cols = list(range(14)) + list(range(15, 20))
for graph_idx in range(len(graphs)):
    graph = graphs[graph_idx]
    graph.x = graph.x[:, keep_cols]
print('Attack backswing feature removed')

In [14]:
def get_filt_idx(filt, step=50000):
    '''Map a per-match boolean filter onto indices of the graph dataset.

    DotaV1 data handling: the graph dataset holds two graphs for every match, stored in
    blocks of `step` matches (0-49999 radiant, 0-49999 dire, 50000-99999 radiant, etc.).
    Each block of the filter is therefore repeated twice before the True positions are read.

    Args:
        filt: 1-D sequence or array of truthy/falsy values, one per match.
        step: number of matches per block (default 50000, the chunk size of the dataset).

    Returns:
        List of int indices into the graph dataset, in ascending order.
        E.g. with a single block, [True, False, True] returns [0, 2, 3, 5].
    '''
    filt = np.asarray(filt)

    # Add the filter for each match range twice, as matches repeat within every block
    blocks = []
    for start in range(0, len(filt), step):
        block = filt[start:start + step]
        blocks.extend((block, block))

    if not blocks:  # empty filter -> nothing selected
        return []

    # Positions of the True (non-zero) values, as plain Python ints
    return np.flatnonzero(np.concatenate(blocks)).tolist()

In [None]:
# DO NOT RUN IF DOING MODEL 1.9 MMR RANGES OR 1.10 DURATION RANGES (they handle filtering themselves)
# Keep only the graphs whose match passes the standard filter column
filt = df_filters['filt_std'].to_numpy()
filt_idx = get_filt_idx(filt)  # per-match flags -> graph dataset indices
graphs_filt = graphs[filt_idx]
print('Standard filtering complete')

### Standard Data Prep

In [16]:
# Train/valid/test split of the filtered graphs
d = graphs_filt  # Graph data

np.random.seed(10)  # fixed seed so the shuffled split is repeatable
idxs = np.random.permutation(len(d))
n_graphs = len(d)
split_va = int(0.64 * n_graphs)  # first 64% -> training
split_te = int(0.8 * n_graphs)   # next 16% -> validation, remaining 20% -> test
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
data_tr, data_va, data_te = d[idx_tr], d[idx_va], d[idx_te]

# Report the share of the filtered dataset in each partition
for label, subset in (('Training', data_tr), ('Validation', data_va), ('Test', data_te)):
    print(f'{label} data: {np.round(len(subset)/len(graphs_filt),2)*100}%')

Training data: 64.0%
Validation data: 16.0%
Test data: 20.0%


In [1]:
# Confirm GPUs are being identified (requires tf environment, not the pipenv dotaprediction)
# print("Num GPUs Available: ", list_physical_devices('GPU'))
# print(device_lib.list_local_devices())

### Model 1.0 - Baseline

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_0(Model):
    '''Baseline: GCNConv (19 channels, ReLU) -> Flatten -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(19, activation='relu')
        self.flatten = Flatten()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.flatten(x)
        return self.dense(x)

# Train model
model_1_0 = Net_1_0()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_0.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_0 = model_1_0.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_0.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_0_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.0 to disk in TensorFlow ('tf') format
model_1_0.save('../models/model_1_0.tf', save_format='tf')

### Model 1.1 - GATConv

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_1(Model):
    '''Baseline with the conv layer swapped: GATConv (19 channels, ReLU) -> Flatten -> sigmoid Dense.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(19, activation='relu')
        self.flatten = Flatten()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.flatten(x)
        return self.dense(x)

# Train model
model_1_1 = Net_1_1()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_1.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_1 = model_1_1.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_1.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_1_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.1 to disk in TensorFlow ('tf') format
model_1_1.save('../models/model_1_1.tf', save_format='tf')

### Model 1.2 - Average Pooling

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_2(Model):
    '''GCNConv (19 channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(19, activation='relu')
        self.pool1 = GlobalAvgPool()
        # No Flatten layer here: the pooling layer takes its place and call() never used one
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_2 = Net_1_2()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_2.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_2 = model_1_2.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_2.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_2_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.2 to disk in TensorFlow ('tf') format
model_1_2.save('../models/model_1_2.tf', save_format='tf')

### Model 1.3 - Max Pooling

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_3(Model):
    '''GCNConv (19 channels, ReLU) -> GlobalMaxPool -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(19, activation='relu')
        self.pool1 = GlobalMaxPool()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_3 = Net_1_3()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_3.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_3 = model_1_3.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_3.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_3_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.3 to disk in TensorFlow ('tf') format
model_1_3.save('../models/model_1_3.tf', save_format='tf')

### Model 1.4 - Sum Pooling

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_4(Model):
    '''GCNConv (19 channels, ReLU) -> GlobalSumPool -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(19, activation='relu')
        self.pool1 = GlobalSumPool()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_4 = Net_1_4()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_4.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_4 = model_1_4.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_4.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_4_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.4 to disk in TensorFlow ('tf') format
model_1_4.save('../models/model_1_4.tf', save_format='tf')

### Model 1.5 - Attention Pooling - not presented

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_5(Model):
    '''GCNConv (19 channels, ReLU) -> GlobalAttentionPool(19) -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(19, activation='relu')
        self.pool1 = GlobalAttentionPool(19)
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_5 = Net_1_5()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_5.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_5 = model_1_5.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_5.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_5_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.5 to disk in TensorFlow ('tf') format
model_1_5.save('../models/model_1_5.tf', save_format='tf')

### Model 1.6 - GATConv + Average Pooling

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_6(Model):
    '''GATConv (19 channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(19, activation='relu')
        self.pool1 = GlobalAvgPool()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_6 = Net_1_6()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_6.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_6 = model_1_6.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_6.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_6_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.6 to disk in TensorFlow ('tf') format
model_1_6.save('../models/model_1_6.tf', save_format='tf')

### Model 1.7 - Feature selection

In [None]:
# Keep only the selected features in every filtered graph's feature matrix (modified in place).
# Column numbers refer to the layout after attack_backswing (14) was removed earlier.
# Dropped columns:
#   base_attack_time (12)
#   attack_point (13)
#   vision_day (14)
#   vision_night (15)
#   turn_rate (16)
#   collision_size (17)
# NOTE(review): the previous comments mentioned scaling and skipping match 0 ("only 15
# features, reason not known"), but this loop does no scaling and starts at index 0 - confirm
# whether either is still needed.
selected_cols = list(range(12)) + [18]

print('Selecting specific columns:')
for graph_idx in range(len(graphs_filt)):
    if graph_idx % 100000 == 0:
        print(graph_idx)  # progress marker every 100000 graphs

    graph = graphs_filt[graph_idx]
    graph.x = graph.x[:, selected_cols]

In [None]:
# Train/valid/test split of the feature-selected graphs, reusing the index arrays
# (idx_tr, idx_va, idx_te) from the earlier standard split
d_fs = graphs_filt  # Graph data

# The values below are recomputed but not used for the partitioning. The permutation call is
# kept because it still advances NumPy's global random state (no re-seeding here).
idxs = np.random.permutation(len(d_fs))
split_va = int(0.64 * len(d_fs))  # 64% training
split_te = int(0.8 * len(d_fs))   # 16% validation, 20% test

data_tr_fs, data_va_fs, data_te_fs = d_fs[idx_tr], d_fs[idx_va], d_fs[idx_te]

# Report the share of the filtered dataset in each partition
for label, subset in (('Training', data_tr_fs), ('Validation', data_va_fs), ('Test', data_te_fs)):
    print(f'{label} data: {np.round(len(subset)/len(graphs_filt),2)*100}%')

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 50  # Number of training epochs
batch_size = 256  # Batch size

# Data loaders over the feature-selected partitions
loader_tr = BatchLoader(data_tr_fs, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va_fs, batch_size=batch_size)
loader_te = BatchLoader(data_te_fs, batch_size=batch_size)

# Build model
class Net_1_7(Model):
    '''GATConv (13 channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head, for the reduced feature set.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(13, activation='relu')
        self.pool1 = GlobalAvgPool()
        # Label count is read from the feature-selected dataset this model is trained on
        self.dense = Dense(d_fs.n_labels, activation="sigmoid")

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_7 = Net_1_7()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_7.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_7 = model_1_7.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_7.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv('../models/fit_records/model_1_7_accuracy.csv', index=False)

In [None]:
# Optional: write the trained Model 1.7 to disk in TensorFlow ('tf') format
model_1_7.save('../models/model_1_7.tf', save_format='tf')

### Model 1.8 - Hyperparameter Sweep

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
# Set explicitly so this cell does not depend on whatever an earlier cell left in `epochs`.
# NOTE(review): 50 matches Models 1.0-1.7 - confirm it is the value intended for the sweep.
epochs = 50  # Number of training epochs
# NOTE(review): es_patience is not passed to any callback, so no early stopping takes place.
es_patience = 10  # Patience for early stopping
batch_size = 256  # Batch size

# Loop through range of num channels hyperparameter (manual loop, auto was giving errors)
channel = 15 # change manually through [5,10,15,25,30]
channel_lookup = {5:1, 10:2, 15:3, 25:4, 30:5, 35:6}
i = channel_lookup[channel]

print(f'Channels: {channel}')

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va, batch_size=batch_size)
loader_te = BatchLoader(data_te, batch_size=batch_size)

# Build model
class Net_1_8(Model):
    '''GATConv (`channels` channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head.'''

    def __init__(self, channels):
        '''`channels` is the swept hyperparameter, passed straight to GATConv.'''
        super().__init__()
        self.conv1 = GATConv(channels, activation='relu')
        self.pool1 = GlobalAvgPool()
        self.dense = Dense(d.n_labels, activation="sigmoid")  # one output per dataset label

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_8 = Net_1_8(channel)
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_8.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_8 = model_1_8.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name so `epochs` keeps meaning "number of epochs".
history = fit_log_1_8.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
# NOTE(review): `i` from channel_lookup is already 1-based, so `i+1` labels channel 15 as
# run 4 - confirm this is the intended file numbering.
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv(f'../models/fit_records/model_1_8_{i+1}_accuracy.csv', index=False)

In [None]:
# Optional: write the trained sweep model to disk in TensorFlow ('tf') format.
# NOTE(review): `i` comes from channel_lookup and is already 1-based, so the suffix is the
# lookup value plus one - confirm this matches the file the final evaluation cell loads.
model_1_8.save('../models/model_1_8_{}.tf'.format(i+1), save_format='tf')

### Model 1.9 - MMR ranges

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 30  # Number of training epochs
batch_size = 256  # Batch size

group = 1 # choose 1-6

# Filter data for current mmr group
filt = df_filters['filt_std'].values & df_filters[f'filt_mmr_{group}'].values
filt_idx = get_filt_idx(filt)
graphs_filt_mmr = graphs[filt_idx]
print('Filtered (standard + MMR group)')

# Remove attack_backswing feature (column 14), keeping columns 0-13 and 15-19.
# The graph objects are modified in place, so re-running this for the same group fails on the
# already-reduced matrices.
keep_cols = list(range(14)) + list(range(15, 20))
for graph_idx in range(len(graphs_filt_mmr)):
    graph = graphs_filt_mmr[graph_idx]
    graph.x = graph.x[:, keep_cols]
print('Attack backswing feature removed')

# Train/valid split (70% training, 30% validation).
# Group-specific names are used so the standard split (data_tr, data_va, idx_tr, idx_va,
# idx_te) that other model cells rely on is not overwritten.
np.random.seed(10)
idxs_mmr = np.random.permutation(len(graphs_filt_mmr))
split_va_mmr = int(0.7 * len(graphs_filt_mmr))
idx_tr_mmr, idx_va_mmr = np.split(idxs_mmr, [split_va_mmr])
data_tr_mmr = graphs_filt_mmr[idx_tr_mmr]
data_va_mmr = graphs_filt_mmr[idx_va_mmr]

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr_mmr, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va_mmr, batch_size=batch_size)

# Build model
class Net_1_9(Model):
    '''GATConv (15 channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head, trained per MMR group.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(15, activation='relu')
        self.pool1 = GlobalAvgPool()
        self.dense = Dense(graphs_filt_mmr.n_labels, activation="sigmoid")

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_9 = Net_1_9()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_9.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_9 = model_1_9.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_9.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv(f'../models/fit_records/model_1_9_{group}_accuracy.csv', index=False)

In [None]:
# Optional: write the model trained for the current MMR group to disk in TensorFlow ('tf') format
model_1_9.save('../models/model_1_9_{}.tf'.format(group), save_format='tf')

In [3]:
# Print how many matches pass both the standard filter and each MMR group filter
std_mask = df_filters['filt_std'].values
for group_id in range(1, 7):
    group_mask = std_mask & df_filters[f'filt_mmr_{group_id}'].values
    print(f'Matches MMR group {group_id}: {int(group_mask.sum())}')

Matches MMR group 1: 117735
Matches MMR group 2: 450537
Matches MMR group 3: 1158591
Matches MMR group 4: 1332106
Matches MMR group 5: 385337
Matches MMR group 6: 44372


### Model 1.10 - Duration ranges

In [None]:
# Configuration
learning_rate = 0.001  # Learning rate
epochs = 30  # Number of training epochs
batch_size = 256  # Batch size

group = 1 # choose 1-6

# Filter data for current duration group
filt = df_filters['filt_std'].values & df_filters[f'filt_duration_{group}'].values
filt_idx = get_filt_idx(filt)
graphs_filt_duration = graphs[filt_idx]
print('Filtered (standard + duration group)')
# NOTE(review): unlike the Model 1.9 cell, nothing here drops the attack_backswing column -
# confirm the graphs already carry the intended feature set when this cell runs.

# Train/valid split (70% training, 30% validation).
# Group-specific names are used so the standard split (data_tr, data_va, idx_tr, idx_va,
# idx_te) that other model cells rely on is not overwritten.
np.random.seed(10)
idxs_duration = np.random.permutation(len(graphs_filt_duration))
split_va_duration = int(0.7 * len(graphs_filt_duration))
idx_tr_duration, idx_va_duration = np.split(idxs_duration, [split_va_duration])
data_tr_duration = graphs_filt_duration[idx_tr_duration]
data_va_duration = graphs_filt_duration[idx_va_duration]

# Data loaders (only the training loader is given an epoch count)
loader_tr = BatchLoader(data_tr_duration, batch_size=batch_size, epochs=epochs)
loader_va = BatchLoader(data_va_duration, batch_size=batch_size)

# Build model
class Net_1_10(Model):
    '''GATConv (15 channels, ReLU) -> GlobalAvgPool -> sigmoid Dense head, trained per duration group.'''

    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(15, activation='relu')
        self.pool1 = GlobalAvgPool()
        self.dense = Dense(graphs_filt_duration.n_labels, activation="sigmoid")

    def call(self, inputs):
        '''Forward pass; `inputs` unpacks to (x, a), which are fed together to the conv layer.'''
        x, a = inputs
        x = self.conv1([x, a])
        x = self.pool1(x)
        return self.dense(x)

# Train model
model_1_10 = Net_1_10()
optimizer = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()
model_1_10.compile(optimizer=optimizer, loss=loss_fn, metrics=['binary_accuracy'])
fit_log_1_10 = model_1_10.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
    validation_data=loader_va.load(),
    validation_steps=loader_va.steps_per_epoch,
)

# Save training record
# The 1..N epoch index gets its own name: rebinding `epochs` to a list would leak into any
# later cell that reuses `epochs` as the epoch count.
history = fit_log_1_10.history
epoch_numbers = list(range(1, len(history['binary_accuracy']) + 1))
training_accuracy = history['binary_accuracy']
validation_accuracy = history['val_binary_accuracy']
pd.DataFrame({'epoch': epoch_numbers, 'training_accuracy': training_accuracy, 'validation_accuracy': validation_accuracy}).to_csv(f'../models/fit_records/model_1_10_{group}_accuracy.csv', index=False)

# Pickle model and training+validation log; `with` closes (and flushes) each file
with open(f'../models/fit_records/fit_log_1_10_{group}.pkl', 'wb') as log_file:
    pickle.dump(fit_log_1_10, log_file)
with open(f'../models/fit_records/model_1_10_{group}.pkl', 'wb') as model_file:
    pickle.dump(model_1_10, model_file)

In [None]:
# Optional: write the model trained for the current duration group to disk in TensorFlow ('tf') format
model_1_10.save('../models/model_1_10_{}.tf'.format(group), save_format='tf')

In [4]:
# Print how many matches pass both the standard filter and each duration group filter
std_mask = df_filters['filt_std'].values
for group_id in range(1, 7):
    duration_mask = std_mask & df_filters[f'filt_duration_{group_id}'].values
    matches_duration_group = int(duration_mask.sum())  # number of True entries
    # Label names the duration groups (the earlier text said "MMR group" for these counts)
    print(f'Matches duration group {group_id}: {matches_duration_group}')

Matches MMR group 1: 654790
Matches MMR group 2: 1254998
Matches MMR group 3: 1231033
Matches MMR group 4: 1036095
Matches MMR group 5: 573267
Matches MMR group 6: 327147


### Final Model (1.8.3) - Test data evaluation

In [37]:
# Load the saved sweep model and evaluate it on the test loader.
# NOTE: `loader_te` is whatever the most recently run model cell created - make sure it wraps
# the standard test split with the feature set this model was trained on.
final_model = keras.models.load_model('../models/model_1_8_3.tf')
eval_results = final_model.evaluate(loader_te.load(), steps=loader_te.steps_per_epoch)
# Index 1 is read as the accuracy; assumes the saved model was compiled with binary_accuracy
# as its only metric, as in the training cells - TODO confirm.
test_accuracy = eval_results[1]
# Percent formatting avoids float artifacts such as 53.459999999999994%
print(f"Test accuracy: {test_accuracy:.2%}")

Test accuracy: 53.459999999999994%
