In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 150
import seaborn as sns

from netam import framework, models
from epam.torch_common import print_parameter_count

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df, val_df = framework.load_shmoof_dataframes("/Users/matsen/data/shmoof_edges_11-Jan-2023_NoNode0_iqtree_K80+R_masked.csv", val_nickname="51") #, sample_count=5000)

In [3]:
kmer_length = 5
max_length = 410

train_dataset = framework.SHMoofDataset(train_df, kmer_length=kmer_length, max_length=max_length)
val_dataset = framework.SHMoofDataset(val_df, kmer_length=kmer_length, max_length=max_length)

print(f"we have {len(train_dataset)} training examples and {len(val_dataset)} validation examples")

we have 26592 training examples and 22424 validation examples


In [4]:
model = models.FivemerModel(train_dataset)
burrito = framework.Burrito(train_dataset, val_dataset, model, batch_size=1024, learning_rate=0.1, l2_regularization_coeff=1e-6)
print_parameter_count(model)
print("starting training...")
losses = burrito.train(epochs=100)
losses.tail()

kmer_embedding: 1025 parameters
-----
total: 1025 parameters
starting training...


Epoch:  32%|███▏      | 32/100 [00:26<00:55,  1.22it/s, loss_diff=-2.233e-08, lr=3.2e-5] 


Unnamed: 0,train_loss,val_loss
28,0.0608,0.061525
29,0.060802,0.061525
30,0.0608,0.061525
31,0.060802,0.061525
32,0.060801,0.061525


In [5]:
regularization_coeffs = [0, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
results = []

for coeff in regularization_coeffs:
    print(f"Training with regularization coefficient {coeff}")
    model = models.FivemerModel(train_dataset)
    burrito = framework.Burrito(train_dataset, val_dataset, model, batch_size=1024, learning_rate=0.1, l2_regularization_coeff=coeff)
    loss_history = burrito.train(epochs=100)
    final_training_loss = loss_history['training_losses'].iloc[-1]
    final_validation_loss = loss_history['validation_losses'].iloc[-1]

    results.append({
        'Regularization': coeff,
        'Final_Training_Loss': final_training_loss,
        'Final_Validation_Loss': final_validation_loss
    })

regularization_results_df = pd.DataFrame(results)
plt.figure(figsize=(10, 6))
plt.plot(regularization_results_df['Regularization'], regularization_results_df['Final_Training_Loss'], label='Training Loss', marker='o')
plt.plot(regularization_results_df['Regularization'], regularization_results_df['Final_Validation_Loss'], label='Validation Loss', marker='x')

plt.xscale('log')
plt.xlabel('Regularization Coefficient')
plt.ylabel('Loss')
plt.title('Effect of L2 Regularization on Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

Training with regularization coefficient 0


Epoch:  32%|███▏      | 32/100 [00:25<00:55,  1.23it/s, loss_diff=-1.067e-08, lr=3.2e-5] 


KeyError: 'training_losses'

In [None]:
regularization_results_df

Unnamed: 0,Regularization,Final_Training_Loss,Final_Validation_Loss
0,0.0,0.05883,0.067044
1,1e-08,0.058831,0.067043
2,1e-07,0.058832,0.067037
3,1e-06,0.058857,0.067038
4,1e-05,0.059283,0.067516
5,0.0001,0.061188,0.069719
