# 8. Switching to PyTorch
> Comparing the performance of the PyTorch-based and NumPy-based implementations of Pagel's lambda

In [2]:
# All the imports
%load_ext autoreload
%autoreload 2

import ete3
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt

# Stuff for timing
import time
from tqdm import tqdm

# My stuff
from src.ihmp import get_diffs
from src.phylogenetic_signal_torch import PagelsLambda as PagelsLambdaTorch
from src.phylogenetic_signal import PagelsLambda as PagelsLambdaNumpy

# Tree file that the data-loading cell parses with ete3. The path is relative,
# so it is resolved against the kernel's current working directory.
treepath = "greengenes/data/gg_13_5_otus_99_annotated.tree"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the IBD difference table and restrict the reference tree to the taxa
# that actually appear as columns of that table.

ibd = get_diffs("ibd")

tree = ete3.Tree(treepath, format=1, quoted_node_names=True)

# The table's column labels are the leaf names to keep.
leaves = ibd.columns

# preserve_branch_length=True folds the length of every removed internal node
# into the surviving branch. With ete3's default (False) those lengths are
# simply dropped, which shortens the distances between the remaining leaves
# and therefore changes anything computed from the tree's branch lengths.
# NOTE(review): downstream code presumably expects the trait columns to be in
# the same order as the pruned tree's leaves -- confirm the alignment.
tree.prune(leaves, preserve_branch_length=True)

ibd.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,1000269,1008348,1009894,1012376,1017181,1017413,1019823,1019878,102222,1023075,...,964363,968675,968954,971907,975306,976470,979707,988375,988932,999046
site,patient,visit,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
feces,3002,6,-10.298141,-10.991272,0.0,0.0,-11.68441,11.70451,0.0,0.307779,0.0,0.0,...,0.0,-0.02869,-2.331253,-12.244023,-1.15262,-11.396731,0.0,-14.059309,1.224061,-10.298141
feces,3002,8,0.0,0.0,0.0,0.0,0.0,-11.70451,0.0,-11.70451,0.0,0.0,...,0.0,-13.313941,-10.605914,0.0,-14.557134,0.0,0.0,0.0,-12.215332,0.0
feces,3002,9,0.0,0.0,0.0,0.0,0.0,10.83337,0.0,0.0,0.0,0.0,...,0.0,12.219649,11.526507,10.83337,14.704551,0.0,0.0,0.0,11.526507,0.0
feces,3003,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
feces,3003,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.768101,0.0,0.0,0.0,0.0,0.0,15.02619,11.768101,0.0


In [4]:
# Time initialization - this is probably not the slowest part of the code
# At least you only have to do it once
#
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, so the measured interval cannot be skewed
# by wall-clock adjustments while the (long) constructors run.

t1 = time.perf_counter()
pl_torch = PagelsLambdaTorch(tree)
t2 = time.perf_counter()
print(f"Time to initialize PagelsLambdaTorch: {t2-t1:.2f} seconds")

# The NumPy interval starts at t2, i.e. right after the torch constructor
# returned; the print above is included but is negligible at this scale.
pl_numpy = PagelsLambdaNumpy(tree)
t3 = time.perf_counter()
print(f"Time to initialize PagelsLambdaNumpy: {t3-t2:.2f} seconds")

Time to initialize PagelsLambdaTorch: 67.21 seconds
Time to initialize PagelsLambdaNumpy: 55.77 seconds


In [5]:
# Free preprocessing: turn df into Tensor (done outside the timed region)
# NOTE(review): the torch path is fed float32 while row.values is presumably
# float64 -- confirm this is the intended like-for-like comparison.

ibd_torch = torch.tensor(ibd.values, dtype=torch.float32)

# perf_counter() is monotonic, so the elapsed times below are not affected by
# wall-clock adjustments during the run.
t1 = time.perf_counter()
for i in tqdm(range(ibd_torch.shape[0])):
    pl_torch.fit(ibd_torch[i])
t2 = time.perf_counter()
print(f"Time to fit PagelsLambdaTorch: {t2-t1:.2f} seconds")

# iterrows() is a generator with no len(), so tqdm cannot infer the number of
# iterations; passing total= restores the percentage bar and ETA.
for idx, row in tqdm(ibd.iterrows(), total=len(ibd)):
    pl_numpy.fit(row.values)
t3 = time.perf_counter()
print(f"Time to fit PagelsLambdaNumpy: {t3-t2:.2f} seconds")

100%|██████████| 69/69 [00:13<00:00,  5.14it/s]


Time to fit PagelsLambdaTorch: 13.44 seconds


12it [02:52, 17.24s/it]

In [None]:
# Sanity check: shape of a single row of the tensor, i.e. what each
# pl_torch.fit() call receives in the timing loop.
ibd_torch[0, :].shape

torch.Size([1370])