In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# load dataset
# The path is relative to the working directory of the notebook.
# Later cells read the columns 'participant_id', 'key', 'AF', 'wave' and 'Driver' from this frame.
df = pd.read_excel (r'LBC_ARCHER/LBC Archer variants.xlsx')

In [2]:
# create classes of objects storing the longitudinal trajectories of mutations across different individuals

# each participant has several attributes storing information
# together with a list of trajectories of which each element is a trajectory class object

class participant :
    """One individual of the cohort.

    Attributes
    ----------
    id            : the participant's id
    mutation_list : list of all mutations present in the participant
    trajectories  : list of trajectory objects of the participant
    data          : slice of the full dataset corresponding to the participant
    """

    def __init__ (self, id =None,mutation_list=None,trajectories=None,data=None):
        # the four arguments are stored unchanged, every one defaults to None
        self.id, self.mutation_list = id, mutation_list
        self.trajectories, self.data = trajectories, data

    def plot_trajectories (self,germline=True):
        """Build the figure of this participant's trajectories and show it.

        `germline` is handed over to plot_individual unchanged.
        """
        plot_individual(self,germline).show()

# each trajectory object stores the information of the trajectory of a participant's mutation
class trajectory :
    """Trajectory of a single mutation in a single participant.

    Attributes
    ----------
    mutation : label of the mutation
    germline : Boolean, False unless the trajectory is flagged as germline
    data     : contains the wave and AF columns of the trajectory
    gradient : overall gradient of the mutation from first to last wave
    driver   : Boolean denoting whether the mutation is a driver or not
    """

    def __init__ (self, mutation=None, data = None, germline=False,gradient=None,driver=False) :
        # plain storage of the constructor arguments
        self.mutation, self.data = mutation, data
        self.germline, self.driver = germline, driver
        self.gradient = gradient

In [3]:
# create a list of all participants: one participant object per individual,
# each filled with the rows of the dataset carrying its participant_id
lbc = [participant(id=part_id, data=df[df['participant_id'] == part_id])
       for part_id in df.participant_id.unique()]

# we now rescan each participant to create the list of trajectories
for part in lbc:
    # one candidate trajectory for each mutation (key) present in the participant
    candidates = []
    for key in part.data.key.unique():
        key_rows = part.data[part.data['key'] == key]

        # a trajectory is flagged as driver only when none of its rows has a
        # missing value in the 'Driver' column
        driver = bool(key_rows['Driver'].notnull().all())

        # .copy() gives every trajectory its own frame, so the 'gradient' column
        # added below is never written into a slice of part.data
        candidates.append(trajectory(mutation=key,
                                     data=key_rows[['AF', 'wave']].copy(),
                                     driver=driver))

    # Filter mutations with more than one time data point
    part.trajectories = [item for item in candidates if len(item.data.wave) > 1]

    # Annotate for each trajectory if it is germline based on condition
    mutation_list = []
    for traj in part.trajectories:
        traj.data['gradient'] = None
        mutation_list.append(traj.mutation)   # keep track of all mutations present in an individual
        if np.mean(traj.data.AF) > 0.45:      # germline condition
            traj.germline = True

    # add the list of all mutations to each individual
    part.mutation_list = mutation_list

# compute the local gradients and the average gradient for each mutation in each individual
for part in lbc:
    for traj in part.trajectories:
        gradient_col = traj.data.columns.get_loc('gradient')
        n_steps = len(traj.data) - 1          # >= 1 thanks to the filter above
        total = 0
        for pos in range(n_steps):
            # difference between two consecutive samples; the empty 'gradient'
            # entries of the rows do not take part in the computation
            trace = traj.data.iloc[pos + 1] - traj.data.iloc[pos]
            step = trace.AF / trace.wave
            # single positional write on the frame itself (no chained assignment),
            # so the value is stored whatever the copy/view semantics of pandas
            traj.data.iloc[pos, gradient_col] = step
            total += step

        traj.gradient = total / n_steps


# Create an array containing all trajectories' gradients.
total_grad = []
for part in lbc:
    for traj in part.trajectories:
        total_grad.append(traj.gradient)

        # replace the local gradient of each sample by the average of the
        # gradients of the two intervals around it; the first sample keeps the
        # gradient of the first interval, the last one takes that of the last interval
        traj.data = traj.data.reset_index(drop=True)
        raw = traj.data['gradient']
        last = len(raw) - 1
        smoothed = []
        for i, grad in enumerate(raw):
            if i == 0:
                smoothed.append(grad)
            elif i == last:
                smoothed.append(raw[i - 1])
            else:
                smoothed.append((raw[i - 1] + grad) / 2)
        traj.data['gradient'] = smoothed

We also define a few useful functions to plot data. These will be explained in the next section.

In [4]:
# tool to list all participants carrying a given mutation
def mutation_to_samples (mutation, cohort=None):
    """Return the ids of the participants whose mutation_list contains `mutation`.

    Parameters
    ----------
    mutation : mutation label, compared with the entries of each participant's
               mutation_list using `in`
    cohort   : iterable of participant objects; when None the module-level
               list `lbc` is used

    Returns
    -------
    list of participant ids, in cohort order
    """
    if cohort is None:
        # the original body referred to an undefined name `participants`;
        # the cohort built in this notebook is called `lbc`
        cohort = lbc
    return [part.id for part in cohort if mutation in part.mutation_list]

# tool to plot all mutations AF across waves
def plot_individual(part,germline):
    """Return a figure with one AF-versus-wave line per trajectory of `part`.

    With germline==True every trajectory is drawn, otherwise only the
    trajectories whose `germline` attribute equals False.
    """
    fig = go.Figure()
    for traj in part.trajectories:
        # germline-flagged trajectories are drawn only when they were asked for
        if germline==True or traj.germline==False:
            fig.add_trace(go.Scatter(x=traj.data.wave, y=traj.data.AF,
                                     mode='lines+markers',
                                     name=traj.mutation))
    fig.update_layout(title = f'Trajectories of participant {part.id}' ,
                      xaxis_title='Wave',
                      yaxis_title='AF')
    return fig

def plot_id (cohort,participant_id, germline=False):
    """Return the trajectory figure of the participant with id `participant_id`.

    Parameters
    ----------
    cohort         : iterable of participant objects
    participant_id : id compared with `part.id` using ==
    germline       : passed on to plot_individual

    Raises
    ------
    ValueError when no participant of the cohort has the requested id
    (the previous version failed with an unbound local variable instead).
    """
    matches = [part for part in cohort if part.id == participant_id]
    if not matches:
        raise ValueError(f"no participant with id {participant_id!r} in the cohort")
    # if several participants share the id, the last one is plotted,
    # as the original loop did
    return plot_individual(matches[-1], germline)
    
    
def mutation_plot (cohort, mutation):
    """Return a figure with the trajectories whose label contains `mutation`.

    A trajectory is selected when `mutation` is one of the whitespace-separated
    words of the matching entry of part.mutation_list; entry i of the list is
    paired with part.trajectories[i].
    """
    fig = go.Figure()

    for part in cohort:
        for position, label in enumerate(part.mutation_list):
            if mutation not in label.split():
                continue
            selected = part.trajectories[position]
            fig.add_trace(go.Scatter(x=selected.data.wave, y=selected.data.AF,
                                     mode='lines+markers',
                                     name=selected.mutation,
                                     hovertemplate = f"{part.id}"
                                     ))

    # Edit the layout
    fig.update_layout(title = f'Trajectories containing mutation {mutation}' ,
                      xaxis_title='Wave',
                      yaxis_title='AF')
    return fig
    
    
def mutation_plot2 (cohort, mutation,threshold):
    """Return a figure with the steep trajectories of a mutation.

    A trajectory is drawn when `mutation` is contained in its label
    (substring test on traj.mutation) and its overall gradient is strictly
    greater than `threshold`.
    """
    fig = go.Figure()

    for part in cohort:
        for traj in part.trajectories:
            # label test first, gradient test second (same order as before)
            if not (mutation in traj.mutation and traj.gradient > threshold):
                continue
            fig.add_trace(go.Scatter(x=traj.data.wave, y=traj.data.AF,
                                     mode='lines+markers',
                                     name=traj.mutation,
                                     hovertemplate =f"{part.id}"
                                     ))

    # Edit the layout
    fig.update_layout(title = f'Trajectories containing mutation {mutation}' ,
                      xaxis_title='Wave',
                      yaxis_title='AF')
    return fig
    
def gradients (cohort,mutations):
    """Return a violin plot of the overall gradients, one violin per mutation.

    Parameters
    ----------
    cohort    : iterable of participant objects
    mutations : iterable of names; a trajectory is counted for a name when the
                name is one of the whitespace-separated words of its entry in
                part.mutation_list (entry i is paired with part.trajectories[i])

    Returns
    -------
    plotly figure
    """
    # collect plain records and build the frame once, instead of enlarging a
    # DataFrame one row at a time
    records = []
    for part in cohort:
        for i, word in enumerate(part.mutation_list):
            words = word.split()
            for mut in mutations:
                if mut in words:
                    records.append({'gradient': part.trajectories[i].gradient,
                                    'participant': part.id,
                                    'mutation': mut})

    table = pd.DataFrame(records, columns=['gradient', 'participant','mutation'])

    fig = px.violin(table, y="gradient",x='mutation', color='mutation',
                    points='all',
                    hover_name="participant", hover_data=["mutation"])

    fig.update_layout(title='Overall gradients of trajectories sorted by mutation')
    return fig
    
def relative_gradients (cohort,mutations):
    """Return a violin plot of the overall relative gradients per mutation.

    The relative gradient of a trajectory is taken as the mean over its samples
    of gradient / AF, so that every trajectory contributes one number.
    The previous version stored the whole gradient / AF Series in a single
    cell of the table, which cannot be drawn as a violin.
    NOTE(review): the mean is the summary used elsewhere in this notebook for
    a single trajectory; confirm it is the intended one here.

    Parameters
    ----------
    cohort    : iterable of participant objects
    mutations : iterable of names; a trajectory is counted for a name when the
                name is one of the whitespace-separated words of its entry in
                part.mutation_list (entry i is paired with part.trajectories[i])

    Returns
    -------
    plotly figure
    """
    # collect plain records and build the frame once
    records = []
    for part in cohort:
        for i, word in enumerate(part.mutation_list):
            words = word.split()
            for mut in mutations:
                if mut in words:
                    samples = part.trajectories[i].data
                    ratio = samples['gradient'] / samples['AF']
                    records.append({'relative gradient': float(np.mean(ratio)),
                                    'participant': part.id,
                                    'mutation': mut})

    table = pd.DataFrame(records, columns=['relative gradient', 'participant','mutation'])

    fig = px.violin(table, y="relative gradient",x='mutation', color='mutation',
                    points='all',
                    hover_name="participant", hover_data=["mutation"])

    fig.update_layout(title='Overall relative gradients of trajectories sorted by mutation')
    return fig


def local_gradients (cohort,mutations):
    """Return violin plots of the local gradients, grouped by wave and mutation.

    Every sample of every selected trajectory contributes one point (its entry
    in the 'gradient' column of the trajectory data).

    Parameters
    ----------
    cohort    : iterable of participant objects (the previous version ignored
                this argument and always read the global `lbc`)
    mutations : iterable of names; a trajectory is counted for a name when the
                name is one of the whitespace-separated words of its entry in
                part.mutation_list (entry i is paired with part.trajectories[i])

    Returns
    -------
    plotly figure
    """
    # collect plain records and build the frame once
    records = []
    for part in cohort:
        for i, word in enumerate(part.mutation_list):
            words = word.split()
            for mut in mutations:
                if mut in words:
                    samples = part.trajectories[i].data
                    for grad, wave in zip(samples['gradient'], samples['wave']):
                        records.append({'gradient': grad, 'wave': wave,
                                        'participant': part.id, 'mutation': mut})

    table = pd.DataFrame(records, columns=['gradient', 'wave' ,'participant','mutation'])

    fig = px.violin(table, y="gradient", x="wave", facet_col="mutation",color='mutation',
                    points='all',
                    hover_name="participant", hover_data=["mutation"])
    # Edit the layout
    fig.update_layout(title = f'local gradient computed betweeen any 2 blood samples' ,
                      xaxis_title='Wave',
                      yaxis_title='local gradient')
    return fig

def clonal_expansion(cohort, gradient_threshold):
    """Return a figure with every trajectory steeper than `gradient_threshold`.

    A trajectory is drawn when its overall gradient is strictly greater than
    the threshold; the hover text of a line is the id of its participant.
    """
    fig = go.Figure()

    for part in cohort:
        steep = [traj for traj in part.trajectories
                 if traj.gradient > gradient_threshold]
        for traj in steep:
            fig.add_trace(go.Scatter(x=traj.data.wave, y=traj.data.AF,
                                     mode='lines+markers',
                                     name=traj.mutation,
                                     hovertemplate = f"{part.id}"
                                     ))

    # Edit the layout
    fig.update_layout(title = f'clonal expansion mutations' ,
                      xaxis_title='Wave',
                      yaxis_title='AF')
    return fig

# Plotting data

We have created several plot functions to explore the data in the cohort.

## Plotting a participant's trajectory

There are two ways of plotting a participant's trajectories: either through the participant's object inside the cohort list, or through the participant's id.

In [5]:
# Plotting a participant through its position in the cohort list
# germline=True draws every trajectory; germline=False leaves out the ones flagged as germline (AF mean above 0.45)
lbc[0].plot_trajectories(germline=True)

In [6]:
# Plotting a participant using a participant's id
# plot_id returns the figure; being the last expression of the cell it is displayed as the cell output
plot_id(lbc,'LBC360021', germline=True)

## Plotting trajectory gradients


The function *gradients* allows us to create violin plots of the overall gradient of all trajectories containing a mutation in a list of genes.

In [7]:
# one violin per gene of the list, one point per trajectory (its overall gradient)
gradients(lbc, ['DNMT3A','JAK2','TET2','RUNX1'])

We can produce a similar plot showing, instead of the overall gradient, the local gradients grouped by wave:

In [8]:
# one panel per gene of the list, one violin per wave, one point per sample (its local gradient)
local_gradients(lbc,['JAK2','TET2','DNMT3A'])

## Estimating mutations conferring fitness advantage 

All global gradients of trajectories have been stored in a list called 'total_grad'. We can plot this list, and use it to extract information about which mutations are present in any quantile. 

In [9]:
# Plotting total_grad: one point per trajectory of the cohort (its overall gradient)
# x0 is the label shown under the single violin; the former value 'Total Bill'
# had nothing to do with the plotted quantity
fig = go.Figure(data=go.Violin(y=total_grad, box_visible=True, line_color='black', points='all',
                               meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                               x0='All trajectories'))

fig.update_layout(title = 'Overall gradient of each trajectory present in each patient of the cohort',
                  yaxis_zeroline=False)
fig.show()

We can therefore plot all trajectories in the top 5% as ranked by overall gradient using the *clonal_expansion* function.

In [10]:
# threshold = 0.95 quantile of total_grad; only trajectories with a strictly larger overall gradient are drawn
clonal_expansion(lbc, np.quantile(total_grad, 0.95))

Further, we can plot trajectories following a mutation in any particular gene, present in any percentile as ranked by overall gradient.

In [11]:
# trajectories whose label contains 'DNMT3A' and whose overall gradient exceeds the 0.9 quantile of total_grad
mutation_plot2(lbc,'DNMT3A',np.quantile(total_grad, 0.9))

# Modeling clonal expansion *à la Blundell*

We should be able to check that for each mutation, at least within a sample, the local ratio
$$\frac{G_c}{AF_c}\sim \lambda r s$$
should be constant.

Let's have a look at the TET2 mutation of sample 'LBC360181'.

In [12]:
# germline is left at its default (False), so trajectories flagged as germline are not drawn
plot_id (lbc,'LBC360181')

In [13]:
# relative gradient (gradient / AF) along the TET2 trajectory of participant 'LBC360181'
# `data` is reset first so that a value left over from an earlier run is never reused
data = None
for part in lbc:
    if part.id =='LBC360181':
        # read the trajectories of the matched participant itself
        # (the former code indexed the cohort with a hard-coded position, lbc[58])
        for traj in part.trajectories:
            if 'TET2' in traj.mutation.split():
                data=traj.data

if data is None:
    raise LookupError("no TET2 trajectory found for participant 'LBC360181'")

print(data.gradient/data.AF)
TET=np.mean(data.gradient/data.AF)
f'We can estimate lrs as the mean {TET}'

0    0.317021
1    0.299138
2    0.314161
3    0.267364
dtype: float64


'We can estimate lrs as the mean 0.29942121434807245'

We can also create an estimate for all mutations of a given gene (set by `genename` in the next cell) having a differentially high global gradient:

In [14]:
# gene and quantile of the overall gradient used for the selection;
# `genename` is read again by the following cells
genename = 'DNMT3A'
quantile = 0.9
# trajectories whose label contains genename and whose overall gradient exceeds the chosen quantile of total_grad
fig = mutation_plot2(lbc,genename,np.quantile(total_grad, quantile))
# export the figure as pdf; the file name is built from the gene name and the quantile in percent
# NOTE(review): presumably needs the LBC_ARCHER/plots directory to exist and plotly's static image export engine to be installed - verify
fig.write_image("LBC_ARCHER/plots/trajectories" + genename + "p" + str(int(100*quantile)) + ".pdf", width=500, height=350, scale=1)
fig.show()

Then we can compute the mean $\lambda r s$ over all the selected mutations of this gene. 

This process could be improved by selecting only the non-synonymous mutations of the gene.

In [15]:
quantile = 0.9
outlier_filter = [0.10,0.90]

# cut-off on the overall gradient, computed once instead of once per trajectory
# (with an empty total_grad there is no trajectory to compare against it)
gradient_cutoff = np.quantile(total_grad, quantile) if len(total_grad) else np.inf

# relative gradient (gradient / AF) and AF of every sample of the selected
# trajectories; extend keeps the two lists flat and aligned element by element
all_mut_rgrad=[]
all_mut_VAF=[]
for part in lbc:
    for traj in part.trajectories:
        if genename in traj.mutation.split() and traj.gradient > gradient_cutoff:
            all_mut_rgrad.extend((traj.data.gradient/traj.data.AF).tolist())
            all_mut_VAF.extend((traj.data.AF).tolist())

#Filter outliers from the lists: keep the samples whose relative gradient lies
#strictly between the two quantiles of outlier_filter (NaN values are dropped);
#the quantiles are computed once and one mask is applied to both lists
if all_mut_rgrad:
    lower, upper = np.nanquantile(all_mut_rgrad, outlier_filter)
    keep = [lower < item < upper for item in all_mut_rgrad]
    all_mut_VAF = [item for item, kept in zip(all_mut_VAF, keep) if kept]
    all_mut_rgrad = [item for item, kept in zip(all_mut_rgrad, keep) if kept]

# compute the mean
mut_mean=np.mean(all_mut_rgrad)
mut_mean

0.22879248593556634

Calculate the relative gradient for all mutations labelled as DRIVER

In [16]:
outlier_filter = [0.10,0.90]

# relative gradient (gradient / AF) and AF of every sample of the trajectories
# of `genename` flagged as driver; extend keeps the two lists flat and aligned
all_mut_driver_rgrad=[]
all_mut_driver_VAF=[]
for part in lbc:
    for traj in part.trajectories:
        if genename in traj.mutation.split() and traj.driver==True:
            all_mut_driver_rgrad.extend((traj.data.gradient/traj.data.AF).tolist())
            all_mut_driver_VAF.extend((traj.data.AF).tolist())

#Filter outliers from the lists: keep the samples whose relative gradient lies
#strictly between the two quantiles of outlier_filter (NaN values are dropped);
#the quantiles are computed once and one mask is applied to both lists
if all_mut_driver_rgrad:
    lower, upper = np.nanquantile(all_mut_driver_rgrad, outlier_filter)
    keep = [lower < item < upper for item in all_mut_driver_rgrad]
    all_mut_driver_VAF = [item for item, kept in zip(all_mut_driver_VAF, keep) if kept]
    all_mut_driver_rgrad = [item for item, kept in zip(all_mut_driver_rgrad, keep) if kept]

# compute the mean
mut_driver_mean=np.mean(all_mut_driver_rgrad)
mut_driver_mean

0.29319101724900337

We can similarly compute the relative gradients for all mutations NOT labelled as driver.

In [17]:
outlier_filter = [0.10,0.90]

# relative gradient (gradient / AF) and AF of every sample of the trajectories
# of `genename` NOT flagged as driver; extend keeps the two lists flat and aligned
all_mut_nondriver_rgrad=[]
all_mut_nondriver_VAF=[]
for part in lbc:
    for traj in part.trajectories:
        if genename in traj.mutation.split() and traj.driver==False:
            all_mut_nondriver_rgrad.extend((traj.data.gradient/traj.data.AF).tolist())
            all_mut_nondriver_VAF.extend((traj.data.AF).tolist())

#Filter outliers from the lists: keep the samples whose relative gradient lies
#strictly between the two quantiles of outlier_filter (NaN values are dropped);
#the quantiles are computed once and one mask is applied to both lists
if all_mut_nondriver_rgrad:
    lower, upper = np.nanquantile(all_mut_nondriver_rgrad, outlier_filter)
    keep = [lower < item < upper for item in all_mut_nondriver_rgrad]
    all_mut_nondriver_VAF = [item for item, kept in zip(all_mut_nondriver_VAF, keep) if kept]
    all_mut_nondriver_rgrad = [item for item, kept in zip(all_mut_nondriver_rgrad, keep) if kept]

# compute the mean
mut_nondriver_mean=np.mean(all_mut_nondriver_rgrad)
mut_nondriver_mean

0.09018134783366247

We can see that this mean is not too far from the one evaluated in the previous example.

In [18]:
# make a scatter plot of relative gradient vs VAF to show saturation (=slowing of growth) near VAF=0.5
# all_mut_VAF and all_mut_rgrad are the aligned, outlier-filtered lists computed above for `genename`
fig = go.Figure(go.Scatter(x=all_mut_VAF, y=all_mut_rgrad, mode='markers'))
fig.update_layout(title = f'Saturation of fitness advantage for '+genename,  xaxis_title='VAF',
                        yaxis_title='λrs')
# export as pdf; the file name is built from the gene name and the current value of `quantile` in percent
fig.write_image("LBC_ARCHER/plots/saturation" + genename + "p" + str(int(100*quantile)) + ".pdf", width=400, height=350, scale=1)

fig.show()

In [19]:
# as above but for all genes: make a scatter plot of relative gradient vs VAF to show saturation (=slowing of growth) near VAF=0.5
quantile = 0.9
outlier_filter = [0.05,0.95]

# cut-off on the overall gradient, computed once instead of once per trajectory
# (with an empty total_grad there is no trajectory to compare against it)
gradient_cutoff = np.quantile(total_grad, quantile) if len(total_grad) else np.inf

# relative gradient (gradient / AF) and AF of every sample of the selected
# trajectories; extend keeps the two lists flat and aligned element by element
all_rgrad=[]
all_VAF=[]
for part in lbc:
    for traj in part.trajectories:
        if traj.gradient > gradient_cutoff:
            all_rgrad.extend((traj.data.gradient/traj.data.AF).tolist())
            all_VAF.extend((traj.data.AF).tolist())

#Filter outliers from the lists: keep the samples whose relative gradient lies
#strictly between the two quantiles of outlier_filter (NaN values are dropped);
#the quantiles are computed once and one mask is applied to both lists
if all_rgrad:
    lower, upper = np.nanquantile(all_rgrad, outlier_filter)
    keep = [lower < item < upper for item in all_rgrad]
    all_VAF = [item for item, kept in zip(all_VAF, keep) if kept]
    all_rgrad = [item for item, kept in zip(all_rgrad, keep) if kept]

# make the scatter plot
fig = go.Figure(go.Scatter(x=all_VAF, y=all_rgrad, mode='markers'))
fig.update_layout(title = f'Saturation of fitness advantage for all genes',  xaxis_title='VAF',
                        yaxis_title='λrs')
fig.write_image("LBC_ARCHER/plots/saturation_allgenes_p" + str(int(100*quantile)) + ".pdf", width=400, height=350, scale=1)

fig.show()

# Plotting distribution of λrs

In [20]:
# distribution of the outlier-filtered relative gradients (gradient / AF) collected above for `genename`
fig = go.Figure(data=go.Violin(y=all_mut_rgrad, box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                               x0=genename))

fig.update_layout(title = f'Fitness advantage based on longitudinal data',
                  yaxis_zeroline=False)
# export as pdf; the file name is built from the gene name and the current value of `quantile` in percent
fig.write_image("LBC_ARCHER/plots/fitness" + genename + "p" + str(int(100*quantile)) + ".pdf", width=400, height=350, scale=1)
fig.show()

In [21]:
# side-by-side distributions of the relative gradients of `genename`:
# trajectories flagged as driver (red) against the other ones (green)
fig = go.Figure(data=go.Violin(y=all_mut_driver_rgrad, box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='red', opacity=0.6,
                               x0=genename+' driver'))
fig.add_trace(go.Violin(y=all_mut_nondriver_rgrad, box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                               x0=genename+' other'))

fig.update_layout(title = f'Fitness advantage based on longitudinal data',
                  yaxis_zeroline=False)
# export as pdf; the file name is built from the gene name
fig.write_image("LBC_ARCHER/plots/fitness" + genename + "_annotated.pdf", width=400, height=350, scale=1)
fig.show()

In [22]:
# modelled trajectories: AF = init * exp(mut_mean * wave) for three initial AF values,
# with mut_mean the mean relative gradient computed above
fig = go.Figure()

x = np.array([1,2,3,4])
# NOTE(review): y is not read again in the lines visible here - the traces below recompute the exponential
y=np.exp(x*mut_mean)
for init in [0.05,0.1,0.2]:
    fig.add_trace(go.Scatter(x=x, y=init*np.exp(x*mut_mean),
                                        mode='lines+markers'
                                        ))
#Edit the layout
fig.update_layout(title = f'Trajectories containing mutation' ,
                   xaxis_title='Wave',
                   yaxis_title='AF')