# Modelling ARCH

## Fitting sigmoid curves through different trajectories

We want to fit sigmoid curves through different trajectories containing mutations on the same gene.


Each sigmoid curve has the following parameters: 
- amplitude: maximum capacity
- fitness: relative fitness advantage conferred by the mutation
- displacement: time of mutation gain

Since we follow the same mutation across several individuals, each trajectory can have its own displacement and amplitude, but the fitness parameter is shared across individuals.

### Loading the dataset and creating trajectories

In [5]:
import pandas as pd

import numpy as np
import plotly.express as px
import plotly.graph_objects as go
colors=px.colors.qualitative.Plotly

import warnings
warnings.filterwarnings('ignore')

# load dataset
# The cells below read the columns 'participant_id', 'key', 'Driver', 'AF'
# and 'wave' from this table.
df = pd.read_excel (r'LBC Archer variants.xlsx')

In [6]:
# create a class of object containing the longitudinal trajectory of a mutation across different individuals

# each participant has several attributes storing information
# together with a list of trajectories of which each element is a trajectory class object

class participant:
    """One individual of the cohort together with its mutation trajectories.

    Every constructor argument is stored unchanged on the instance and
    defaults to ``None``.
    """

    def __init__(self, id=None, mutation_list=None, trajectories=None, data=None):
        # slice of the full dataset corresponding to the participant
        self.data = data
        # list of all trajectories present in the participant
        self.trajectories = trajectories
        # list of all mutations present in the participant
        self.mutation_list = mutation_list
        # participant's id
        self.id = id

    def plot_trajectories(self, germline=True):
        """Build the participant's figure with ``plot_individual`` and show it.

        `germline` is forwarded to ``plot_individual`` as is.
        """
        figure = plot_individual(self, germline)
        figure.show()

# each trajectory object stores the information of the trajectory of a participant's mutation
class trajectory:
    """Longitudinal record of a single mutation in a single participant.

    Every constructor argument is stored unchanged on the instance.
    """

    def __init__(self, mutation=None, data=None, germline=False, gradient=None, driver=False):
        # Boolean denoting whether the mutation is a driver or not
        self.driver = driver
        # overall gradient of the mutation from first to last wave
        self.gradient = gradient
        # contains the wave and AF columns of the trajectory
        self.data = data
        self.germline = germline
        self.mutation = mutation

In [7]:
# Build the cohort: one participant object per distinct participant_id, each
# holding the rows of the dataset that belong to that individual.
lbc = [
    participant(id=pid, data=df[df['participant_id'] == pid])
    for pid in df.participant_id.unique()
]

# Rescan each participant to create its list of trajectories and its list of
# mutations.
for part in lbc:
    trajectories = []

    # one candidate trajectory per distinct mutation key of the participant
    for key in part.data.key.unique():
        rows = part.data[part.data['key'] == key]

        # A trajectory is flagged as driver when at least one of its rows has
        # an entry in the 'Driver' column.
        driver = bool(rows['Driver'].notnull().any())

        # Take an explicit copy so that the 'gradient' column added below is
        # written to an independent frame and not to a slice of part.data.
        trajectories.append(
            trajectory(mutation=key, data=rows[['AF', 'wave']].copy(), driver=driver)
        )

    # keep only the mutations observed at more than one time point
    trajectories = [item for item in trajectories if len(item.data.wave) > 1]
    part.trajectories = trajectories

    mutation_list = []
    for traj in part.trajectories:
        # placeholder column, filled in by the gradient computation
        traj.data['gradient'] = None
        # keep track of all mutations present in an individual, in the same
        # order as part.trajectories
        mutation_list.append(traj.mutation)
        # germline condition: mean allele frequency above 0.45
        if np.mean(traj.data.AF) > 0.45:
            traj.germline = True

    # add the list of all mutations to the individual
    part.mutation_list = mutation_list
    
# Compute, for each trajectory, the local gradient between consecutive samples
# and the average of those local gradients.
for part in lbc:
    for traj in part.trajectories:
        af = traj.data['AF'].to_numpy(dtype=float)
        wave = traj.data['wave'].to_numpy(dtype=float)

        # change in AF per unit of wave between each sample and the next one
        local = np.diff(af) / np.diff(wave)

        # Write with a single positional indexer. The chained form
        # ``traj.data['gradient'].iloc[i] = value`` assigns through an
        # intermediate Series and is not guaranteed to update the frame.
        # The last row has no following sample and keeps its placeholder.
        gradient_col = traj.data.columns.get_loc('gradient')
        traj.data.iloc[:-1, gradient_col] = local

        # average of the local gradients (sequential sum, as a plain loop would do)
        traj.gradient = sum(local) / len(local)
        
        
# Collect the overall gradient of every trajectory, and replace each local
# gradient by the mean of itself and its predecessor.
total_grad = []
for part in lbc:
    for traj in part.trajectories:
        total_grad.append(traj.gradient)

        # positional labels 0..n-1 from here on
        traj.data = traj.data.reset_index(drop=True)
        local = traj.data['gradient']
        final_pos = len(local) - 1

        smoothed = []
        for pos, current in enumerate(local):
            if pos == 0:
                # first sample: nothing before it, keep its own gradient
                smoothed.append(current)
            elif pos == final_pos:
                # last sample: reuse the gradient of the preceding interval
                smoothed.append(local.iloc[pos - 1])
            else:
                smoothed.append((local.iloc[pos - 1] + current) / 2)
        traj.data['gradient'] = smoothed

We also define a few useful functions to plot data. These will be explained in the next section.

In [8]:
# tool to list all participants carrying a given mutation
def mutation_to_samples(mutation, cohort=None):
    """Return the ids of the participants whose mutation list contains `mutation`.

    Parameters
    ----------
    mutation
        Entry to look for; it must be equal to an element of a participant's
        ``mutation_list`` (no partial matching).
    cohort
        Iterable of participant objects exposing ``id`` and ``mutation_list``.
        When omitted, the module-level ``lbc`` list is used.

    Returns
    -------
    list
        Participant ids, in cohort order.
    """
    if cohort is None:
        cohort = lbc
    return [part.id for part in cohort if mutation in part.mutation_list]

# tool to plot the AF of a participant's mutations across waves
def plot_individual(part, germline):
    """Return a figure with one AF-versus-wave line per trajectory of `part`.

    When `germline` compares equal to ``True`` every trajectory is drawn;
    otherwise only the trajectories whose ``germline`` flag compares equal to
    ``False`` are drawn.
    """
    # equality (not truthiness) is kept on purpose to match the callers' usage
    draw_everything = germline == True
    figure = go.Figure()

    for traj in part.trajectories:
        if draw_everything or traj.germline == False:
            figure.add_trace(
                go.Scatter(
                    x=traj.data.wave,
                    y=traj.data.AF,
                    mode='lines+markers',
                    name=traj.mutation,
                )
            )

    figure.update_layout(
        title=f'Trajectories of participant {part.id}',
        xaxis_title='Wave',
        yaxis_title='AF',
    )
    return figure

def plot_id(cohort, participant_id, germline=False):
    """Return the trajectory figure of the participant with id `participant_id`.

    Parameters
    ----------
    cohort
        Iterable of participant objects.
    participant_id
        Value compared with each participant's ``id``.
    germline
        Forwarded to ``plot_individual``.

    Raises
    ------
    ValueError
        If no participant of `cohort` has the requested id.
    """
    for part in cohort:
        if part.id == participant_id:
            return plot_individual(part, germline)
    # Without this, a missing id surfaced as an unbound-local error on `fig`.
    raise ValueError(f'no participant with id {participant_id!r} in cohort')
    
    
def mutation_plot(cohort, mutation):
    """Return a figure of AF against wave for the trajectories carrying `mutation`.

    A trajectory is selected when `mutation` equals one of the
    whitespace-separated tokens of its entry in ``part.mutation_list``; the
    trajectory is taken from the same position in ``part.trajectories``.
    The participant id is used as hover text.
    """
    figure = go.Figure()

    for part in cohort:
        selected = [
            part.trajectories[pos]
            for pos, label in enumerate(part.mutation_list)
            if mutation in label.split()
        ]
        for traj in selected:
            figure.add_trace(
                go.Scatter(
                    x=traj.data.wave,
                    y=traj.data.AF,
                    mode='lines+markers',
                    name=traj.mutation,
                    hovertemplate=f"{part.id}",
                )
            )

    # Edit the layout
    figure.update_layout(
        title=f'Trajectories containing mutation {mutation}',
        xaxis_title='Wave',
        yaxis_title='AF',
    )
    return figure
    
    
def mutation_plot2(cohort, mutation, threshold):
    """Return a figure of the fast-growing trajectories carrying `mutation`.

    A trajectory is drawn when `mutation` is contained in its ``mutation``
    label (substring test) and its overall ``gradient`` is strictly greater
    than `threshold`. The participant id is used as hover text.
    """
    figure = go.Figure()

    for part in cohort:
        for traj in part.trajectories:
            if mutation not in traj.mutation:
                continue
            if not traj.gradient > threshold:
                continue
            figure.add_trace(
                go.Scatter(
                    x=traj.data.wave,
                    y=traj.data.AF,
                    mode='lines+markers',
                    name=traj.mutation,
                    hovertemplate=f"{part.id}",
                )
            )

    # Edit the layout
    figure.update_layout(
        title=f'Trajectories containing mutation {mutation}',
        xaxis_title='Wave',
        yaxis_title='AF',
    )
    return figure
    
def gradients(cohort, mutations):
    """Return a violin plot of overall trajectory gradients grouped by mutation.

    For every participant, each entry of ``mutation_list`` is split on
    whitespace; for every element of `mutations` found among the tokens, one
    row (gradient, participant id, mutation) is added to the plotted table.
    """
    table = pd.DataFrame(columns=['gradient', 'participant', 'mutation'])

    for part in cohort:
        for pos, label in enumerate(part.mutation_list):
            for mut in mutations:
                if mut in label.split():
                    # rows are labelled 0, 1, 2, ... in insertion order
                    table.loc[len(table)] = [part.trajectories[pos].gradient, part.id, mut]

    figure = px.violin(
        table,
        y="gradient",
        x='mutation',
        color='mutation',
        points='all',
        hover_name="participant",
        hover_data=["mutation"],
    )
    figure.update_layout(title='Overall gradients of trajectories sorted by mutation')
    return figure
    
def relative_gradients (cohort,mutations):
    """Return a violin plot of gradient/AF ratios grouped by mutation.

    For every participant, each entry of ``mutation_list`` is split on
    whitespace; for every element of `mutations` found among the tokens, one
    row is added to the plotted table.
    """
    df = pd.DataFrame(columns=['relative gradient', 'participant','mutation'])

    # label of the next row to add
    counter = 0
    for part in cohort:
        for i , word in enumerate(part.mutation_list):
            for mut in mutations:
                if mut in word.split():
                    # NOTE(review): the first element is the whole per-sample
                    # Series gradient/AF of the trajectory, not a scalar, so a
                    # Series is placed in a single table cell. Presumably one
                    # row per sample (or a summary value) was intended - confirm.
                    df.loc[counter]=[part.trajectories[i].data['gradient']/part.trajectories[i].data['AF'],
                                     part.id, mut]
                    counter += 1


    fig = px.violin(df, y="relative gradient",x='mutation', color='mutation',
                         points='all',
                         hover_name="participant", hover_data=["mutation"])

    fig.update_layout(title='Overall relative gradients of trajectories sorted by mutation')
    return fig


def local_gradients(cohort, mutations):
    """Return violin plots of per-sample gradients by wave, one facet per mutation.

    Parameters
    ----------
    cohort
        Iterable of participant objects; this is the collection that is
        scanned (not the module-level cohort).
    mutations
        Mutation names; a trajectory matches a name when the name equals one
        of the whitespace-separated tokens of its ``mutation_list`` entry.

    Returns
    -------
    plotly figure with one row of data per (sample, matching mutation).
    """
    df = pd.DataFrame(columns=['gradient', 'wave', 'participant', 'mutation'])
    counter = 0
    for part in cohort:
        for i, word in enumerate(part.mutation_list):
            for mut in mutations:
                if mut in word.split():
                    samples = part.trajectories[i].data
                    # one row per sample of the trajectory
                    for grad, wave in zip(samples['gradient'].to_numpy(),
                                          samples['wave'].to_numpy()):
                        df.loc[counter] = [grad, wave, part.id, mut]
                        counter += 1

    fig = px.violin(df, y="gradient", x="wave", facet_col="mutation", color='mutation',
                    points='all',
                    hover_name="participant", hover_data=["mutation"])
    # Edit the layout
    fig.update_layout(title=f'local gradient computed betweeen any 2 blood samples',
                      xaxis_title='Wave',
                      yaxis_title='local gradient')
    return fig

def clonal_expansion(cohort, gradient_threshold):
    """Return a figure of every trajectory growing faster than `gradient_threshold`.

    A trajectory is drawn when its overall ``gradient`` is strictly greater
    than the threshold. The participant id is used as hover text.
    """
    figure = go.Figure()

    for part in cohort:
        expanding = (t for t in part.trajectories if t.gradient > gradient_threshold)
        for traj in expanding:
            figure.add_trace(
                go.Scatter(
                    x=traj.data.wave,
                    y=traj.data.AF,
                    mode='lines+markers',
                    name=traj.mutation,
                    hovertemplate=f"{part.id}",
                )
            )

    # Edit the layout
    figure.update_layout(
        title=f'clonal expansion mutations',
        xaxis_title='Wave',
        yaxis_title='AF',
    )
    return figure

def fitness(cohort, mut):
    """Collect gradient/AF ratios of the fast-growing trajectories carrying `mut`.

    Parameters
    ----------
    cohort
        Iterable of participant objects; this is the collection that is
        scanned (not the module-level cohort).
    mut
        Mutation name; a trajectory matches when the name equals one of the
        whitespace-separated tokens of its ``mutation`` label.

    Returns
    -------
    (list, figure)
        The per-sample gradient/AF ratios that lie strictly between their 10%
        and 90% quantiles (NaN values are dropped by the comparison), and a
        violin plot of those values.

    Only trajectories whose overall gradient exceeds the 85% quantile of the
    module-level ``total_grad`` list are considered.
    """
    # The threshold does not depend on the trajectory: compute it once.
    # With no gradients recorded nothing can exceed the threshold.
    threshold = np.quantile(total_grad, 0.85) if len(total_grad) else np.inf

    ratios = []
    for part in cohort:
        for traj in part.trajectories:
            if mut in traj.mutation.split() and traj.gradient > threshold:
                ratios.extend((traj.data.gradient / traj.data.AF).tolist())

    # Filter outliers; the bounds are taken from the unfiltered list, once.
    if ratios:
        lower = np.nanquantile(ratios, 0.1)
        upper = np.nanquantile(ratios, 0.9)
        ratios = [item for item in ratios if item > lower and item < upper]

    # distribution of the retained ratios
    fig = go.Figure(data=go.Violin(y=ratios, box_visible=True, line_color='black',
                                   meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
                                   x0=mut))

    fig.update_layout(yaxis_zeroline=False)
    return ratios, fig

def mutation_expansion(mutation):
    """Overlay example exponential curves on the fitness figure of `mutation`.

    The figure and the ratio list come from ``fitness(lbc, mutation)``. The
    mean of the ratios is used as growth rate, and one curve
    ``init * exp(rate * wave)`` is added for waves 1 to 4 and for each
    starting value 0.001, 0.05 and 0.1.

    Returns the updated figure.
    """
    fit, fig = fitness(lbc, mutation)
    mut_mean = np.mean(fit)
    x = np.array([1, 2, 3, 4])
    for init in [0.001, 0.05, 0.1]:
        fig.add_trace(go.Scatter(x=x, y=init*np.exp(x*mut_mean),
                                 mode='lines+markers'))
    # Edit the layout
    fig.update_layout(title=f'Example of trajectories with a fitness advantage similar to {mutation}',
                      xaxis_title='Wave',
                      yaxis_title='AF')
    return fig

## Setting up trajectories

In [119]:
# Select the trajectories to fit: label containing `mutation`, overall
# gradient above the 85% quantile of all gradients, and more than two samples.
mutation = 'TET2'
threshold = np.quantile(total_grad, 0.85)

selected = [
    traj
    for part in lbc
    for traj in part.trajectories
    if mutation in traj.mutation and traj.gradient > threshold and len(traj.data) > 2
]

# y values (allele frequencies) and x values (waves), one entry per trajectory
data = [traj.data.AF for traj in selected]
x = [traj.data.wave for traj in selected]

We can visualize the trajectories that have been selected for fitting

In [120]:
# One line per selected trajectory. The palette is cycled so that selections
# larger than the palette do not raise an IndexError.
fig = go.Figure()
for i, (waves, afs) in enumerate(zip(x, data)):
    fig.add_trace(go.Scatter(x=waves, y=afs, mode='lines+markers',
                             line=dict(color=colors[i % len(colors)])))
fig

## Fitting multiple curves with shared parameters

In [121]:
# Scratch cell: evaluates at top level the same residual vector that
# objective() builds, using fit_params and without the `a` weight on the
# amplitude term. It relies on sigmoid_dataset and fit_params, which are
# defined in the fitting cell further down.
# NOTE(review): the recorded "KeyError: 'amp_4'" means fit_params had no
# 'amp_4' entry while `data` held at least four trajectories - presumably
# fit_params came from an earlier, smaller selection; confirm before reuse.
total_res=[]   # keeps track of the cumulative residuals across all data sets
counter=0
    
# compute the residual for each trajectory in the dataset
for i , points in enumerate(zip(x,data)):
        # points[0] -> x
        # points[1] -> y
    res = points[1] - sigmoid_dataset(fit_params, i, points[0])
    total_res.append(res)

    # now flatten this to a 1D array, as minimize() needs

total_res=np.concatenate(total_res).ravel()
# append one amplitude term (0.5 - amp_k) per trajectory
for i, j in enumerate(data):
    total_res=np.append(total_res,np.asarray([0.5-fit_params['amp_%i' % (i+1)]]))


KeyError: 'amp_4'

In [127]:
from lmfit import Parameters, minimize, report_fit

def sigmoid(x, amp, fit, dis):
    """Evaluate the sigmoid ``amp / (1 + exp(-fit * (x - dis)))``.

    `amp` is the upper plateau, `fit` the growth rate and `dis` the position
    of the midpoint along x.
    """
    exponent = -fit * (x - dis)
    denominator = 1. + np.exp(exponent)
    return amp / denominator


def sigmoid_dataset(params, i, x):
    """Evaluate the sigmoid of data set `i` (0-based) at `x`.

    The three parameters are read from `params` under the keys
    ``amp_<i+1>``, ``fit_<i+1>`` and ``dis_<i+1>``.
    """
    suffix = '_%i' % (i + 1)
    return sigmoid(
        x,
        params['amp' + suffix],
        params['fit' + suffix],
        params['dis' + suffix],
    )


def objective(params, x, data, a):
    """Calculate total residual for fits of Sigmoids to several data sets.

    Parameters
    ----------
    params
        Mapping holding ``amp_k``, ``fit_k`` and ``dis_k`` for k = 1..len(data).
    x, data
        Sequences of equal length; ``x[k]`` and ``data[k]`` are the abscissae
        and the observed values of data set k.
    a
        Weight of the amplitude term.

    Returns
    -------
    1-D array: the residuals ``data[k] - sigmoid_k(x[k])`` of all data sets,
    followed by one term ``a * (0.5 - amp_k)`` per data set. Because the
    minimizer squares the residuals, these extra terms penalize amplitudes
    away from 0.5.
    """
    # residual of each trajectory in the dataset
    residuals = [
        ys - sigmoid_dataset(params, i, xs)
        for i, (xs, ys) in enumerate(zip(x, data))
    ]

    # one amplitude term per data set, built in one go
    penalties = [a * (0.5 - params['amp_%i' % (i + 1)]) for i in range(len(data))]

    # flatten everything to a 1D array, as minimize() needs
    return np.append(np.concatenate(residuals).ravel(), penalties)

fit_params = Parameters()

# One (amp, fit, dis) triple per trajectory, with starting values and bounds.
n_curves = len(data)
for k in range(1, n_curves + 1):
    fit_params.add('amp_%i' % k, value=0.4, min=0.25, max=0.5)
    fit_params.add('fit_%i' % k, value=0, min=-1.)
    fit_params.add('dis_%i' % k, value=0, min=-10, max=10)

# Tie the fitness of every other trajectory to fit_1, so that a single
# fitness value is shared by all curves.
for k in range(2, n_curves + 1):
    fit_params['fit_%i' % k].expr = 'fit_1'

# weight of the amplitude term that objective() appends to the residuals
a = 0.1

# fit the data using lmfit 'minimize' function
out = minimize(objective, fit_params, args=(x, data, a))
out.params

name,value,initial value,min,max,vary,expression
amp_1,0.48335776,0.4,0.25,0.5,True,
fit_1,0.54878567,0.0,-1.0,inf,True,
dis_1,6.01655467,0.0,-10.0,10.0,True,
amp_2,0.5,0.4,0.25,0.5,True,
fit_2,0.54878567,0.0,-1.0,inf,False,fit_1
dis_2,6.24626043,0.0,-10.0,10.0,True,
amp_3,0.41969493,0.4,0.25,0.5,True,
fit_3,0.54878567,0.0,-1.0,inf,False,fit_1
dis_3,3.20440278,0.0,-10.0,10.0,True,
amp_4,0.25,0.4,0.25,0.5,True,


Notice that all *fit* parameters are equal

## Plot fitted curves

In [128]:
# Observed points and fitted curve of each trajectory, drawn in the same
# colour. The palette is cycled so that more trajectories than palette
# entries do not raise an IndexError.
fig = go.Figure()
x_line = np.linspace(-5, 10, 1000)  # common grid on which the curves are drawn
for i, (waves, afs) in enumerate(zip(x, data)):
    colour = colors[i % len(colors)]
    y_fit = sigmoid_dataset(out.params, i, x_line)
    fig.add_trace(go.Scatter(x=waves, y=afs, mode='markers', line=dict(color=colour)))
    fig.add_trace(go.Scatter(x=x_line, y=y_fit, mode='lines', line=dict(color=colour)))
fig.write_image('/home/elatorre/Desktop/ARCH exploration/TET2_same_fitness_reg.png')
fig