In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
%matplotlib inline
from matplotlib import pyplot as plt
import Levenshtein
import matplotlib as mpl
import seaborn as sns
import autoreload
import scipy
params = {
    'font.size': 12,
    'axes.titlesize': 12,
    'axes.labelsize': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 8,
    'ytick.labelsize': 10,
    'font.family': "Helvetica",
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 100
   }

mpl.rcParams.update(params)

sns.set_style("ticks")
sns.set_context(context='paper')
savefig_args = {"dpi": 300, "bbox_inches": "tight", "pad_inches": 0, "transparent": False}
mpl.rc('savefig', dpi=300)
output_dir='../../figures/'
output_suffix = ""
output_formats = [".png", ".pdf"]

def save_figure(fig, name, output_dir=output_dir, output_suffix=output_suffix, output_formats=output_formats, savefig_args=savefig_args):
    for output_format in output_formats:
        fig.savefig(output_dir + "/" + name + output_suffix + output_format, **savefig_args)
    return None

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20) 
pd.set_option('display.width', 100)

def tissue_colors():
    
    tissue_color_dict = {'Bladder': '#e7969c',
             'Blood': '#d6616b',
             'Bone_Marrow': '#cedb9c',
             'Eye': '#c7ea46',#"#00ff7f",
             'Fat': '#e7cb94',
             'Heart': '#ff0800',
             'Kidney': '#7b4173',
             'Large_Intestine': '#31a354',
             'Liver': '#000080',
             'Lung': '#3182bd',
             'Lymph_Node': '#8c6d31',
             'Mammary':'#ce6dbd',
             'Muscle': '#e7ba52',
             'Pancreas': '#fd8d3c',
             'Prostate':'#637939',#'#a55194',#
             'Salivary_Gland':'#622a0f',
             'Skin': '#de9ed6',
             'Small_Intestine': '#6baed6',
             'Spleen': '#393b79',
             'Thymus': '#9c9ede',
             'Tongue':'#b5cf6b',
             'Trachea': '#969696',
             'Uterus':'#c64b8c',#'#ff0090',
             'Vasculature': '#843c39'}
    return tissue_color_dict

In [2]:
df = pd.read_csv("../../../TabulaSapiens/data/BCRChangeo.csv")

In [4]:
def calculate_v_mu_freq(df):
    df['distance_to_germline_v'] = df.apply(lambda x: Levenshtein.distance(x.v_sequence_alignment, x.v_germline_alignment), axis=1)


    df['v_mu_freq'] = df['distance_to_germline_v'] / df['v_sequence_alignment'].str.len()
    return df

In [8]:
df.columns

Index(['Unnamed: 0', 'SEQUENCE_ID', 'SEQUENCE_INPUT', 'FUNCTIONAL', 'IN_FRAME', 'STOP',
       'MUTATED_INVARIANT', 'INDELS', 'LOCUS', 'V_CALL', 'D_CALL', 'J_CALL', 'SEQUENCE_VDJ',
       'SEQUENCE_IMGT', 'V_SEQ_START', 'V_SEQ_LENGTH', 'V_GERM_START_VDJ', 'V_GERM_LENGTH_VDJ',
       'V_GERM_START_IMGT', 'V_GERM_LENGTH_IMGT', 'NP1_LENGTH', 'D_SEQ_START', 'D_SEQ_LENGTH',
       'D_GERM_START', 'D_GERM_LENGTH', 'NP2_LENGTH', 'J_SEQ_START', 'J_SEQ_LENGTH',
       'J_GERM_START', 'J_GERM_LENGTH', 'JUNCTION', 'JUNCTION_LENGTH', 'GERMLINE_IMGT', 'V_SCORE',
       'V_IDENTITY', 'V_EVALUE', 'V_CIGAR', 'D_SCORE', 'D_IDENTITY', 'D_EVALUE', 'D_CIGAR',
       'J_SCORE', 'J_IDENTITY', 'J_EVALUE', 'J_CIGAR', 'FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT',
       'FWR4_IMGT', 'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT', 'CLONE', 'GERMLINE_IMGT_D_MASK',
       'GERMLINE_V_CALL', 'GERMLINE_D_CALL', 'GERMLINE_J_CALL', 'MU_COUNT_SEQ_R', 'MU_COUNT_SEQ_S',
       'MU_FREQ_SEQ_R', 'MU_FREQ_SEQ_S', 'CellBarcode', 'Other_Inf

In [10]:
df.V_SEQ_START

0        76
1       137
2       103
3       120
4        96
       ... 
1488     15
1489     11
1490     41
1491     10
1492     16
Name: V_SEQ_START, Length: 1493, dtype: int64

In [18]:
df.groupby('Tissue')['MU_FREQ'].mean()

Tissue
Bladder          0.106566
Blood            0.006914
Kidney           0.049863
LI               0.095491
Lung             0.088911
LungNeuron       0.054076
LymphNode        0.054091
Muscle           0.081137
SI               0.043678
Skin             0.136000
Spleen           0.053909
Thymus           0.078577
Vasculature      0.076199
vertebralbody    0.074374
Name: MU_FREQ, dtype: float64

In [21]:
df[df['Tissue'] != "Blood"].MU_FREQ.std()

0.04135273552846125

In [6]:
df = calculate_v_mu_freq(df)

AttributeError: 'Series' object has no attribute 'v_sequence_alignment'