In [59]:
import pandas as pd
import scirpy as ir
import Levenshtein
%matplotlib inline
import autoreload
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20) 
pd.set_option('display.width', 100)

def calculate_v_mu_freq(df):
    df['distance_to_germline_v'] = df.apply(lambda x: Levenshtein.distance(x.v_sequence_alignment, x.v_germline_alignment), axis=1)
    df['v_mu_freq'] = df['distance_to_germline_v'] / df['v_sequence_alignment'].str.len()
    return df

## Prepare the data

In [60]:
# output of my snakemake
airr = pd.read_table('../../data/snakemake_outputs/combined_igblast.airr.tsv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### filter and munge

In [61]:
# get rid of merging artefacts
print("{} total assemblies out of snakemake".format(airr.shape[0]))
airr = airr.dropna(subset=['locus'])
print(airr.shape[0], "removed presumed merging artefacts")
airr = airr[airr.productive == "T"]
print(airr.shape[0], "removed unproductive chains")
airr = airr.dropna(subset=['locus'])

print("{} assemblies from TS".format(airr.shape[0]))

# reformatting 10X sequence id to merge with adata.obs
airr.loc[:,"cell_id"] = airr.sequence_id.replace('_contig_[0-9]', '', regex=True)
airr.cell_id.replace("\-","_", regex=True, inplace=True)
# 10X
airr.cell_id.replace("_5prime..CR", '_5prime', regex = True, inplace = True)
airr.cell_id.replace('_[0-9]_[T][S]', '_TS',regex = True, inplace = True)
airr.cell_id.replace('10X_[0-9]_[0-9]', '10X',regex = True, inplace = True)
airr.loc[:, 'cell_id'] = airr.cell_id.str.rsplit(" ", expand=True)[0]
airr.head().cell_id.values[-20:]
# Munge the data a little
airr.loc[:, 'cell_id'] = airr.cell_id.str.split('|', expand=True)[0]
airr = airr.dropna(subset=['junction'])
# scirpy needs umi_count column 
airr['umi_count'] = airr['umis']
airr['umi_count'].fillna(5, inplace=True)
airr['is_cell'] = True
airr['high_confidence'] = True
airr['multi_chain'] = False
# Want to assign c_calls to SS2 data in the snakemake, that's a todo
airr['c_call'] = 'None'
# rename column
airr = airr.rename({'cell_id_x':'cell_id'}, axis=1)

19252 total assemblies out of snakemake
19252 removed presumed merging artefacts
17780 removed unproductive chains
17780 assemblies from TS


0                AAACCTGAGTGAAGTT_TSP2_BM_vertebralbody_10X_5prime
1                AAACCTGCAATTGCTG_TSP2_BM_vertebralbody_10X_5prime
2                AAACCTGGTTCCACGG_TSP2_BM_vertebralbody_10X_5prime
3                AAACGGGAGGCAAAGA_TSP2_BM_vertebralbody_10X_5prime
4                AAACGGGAGGCAAAGA_TSP2_BM_vertebralbody_10X_5prime
                                   ...                            
19245     TSP8_Prostate_NA_SS2_B134140_B134704_Epithelial_B14_L002
19247     TSP8_Prostate_NA_SS2_B134140_B134704_Epithelial_B14_L002
19248    TSP8_Prostate_NA_SS2_B134137_B134703_Endothelial_B18_L002
19249      TSP8_Prostate_NA_SS2_B134140_B134704_Epithelial_D7_L002
19251         TSP8_Prostate_NA_SS2_B134141_B134697_Immune_H12_L001
Name: cell_id, Length: 17735, dtype: object

In [63]:
airr.cell_id[airr.cell_id.str.contains('5prime')]

0       AAACCTGAGTGAAGTT_TSP2_BM_vertebralbody_10X_5prime
1       AAACCTGCAATTGCTG_TSP2_BM_vertebralbody_10X_5prime
2       AAACCTGGTTCCACGG_TSP2_BM_vertebralbody_10X_5prime
3       AAACGGGAGGCAAAGA_TSP2_BM_vertebralbody_10X_5prime
4       AAACGGGAGGCAAAGA_TSP2_BM_vertebralbody_10X_5prime
                              ...                        
8731           TTTCCTCCAGGCTGAA_TSP2_Thymus_NA_10X_5prime
8732           TTTCCTCGTGTGAATA_TSP2_Thymus_NA_10X_5prime
8733           TTTCCTCGTGTGAATA_TSP2_Thymus_NA_10X_5prime
8734           TTTCCTCTCCGCATCT_TSP2_Thymus_NA_10X_5prime
8735           TTTGTCATCGGAAACG_TSP2_Thymus_NA_10X_5prime
Name: cell_id, Length: 8694, dtype: object

In [55]:
met.cell_id.replace('10X_[0-9]_[0-9]', '10X',regex = True, inplace = True)


In [56]:
met[met.cell_id.str.contains('5prime')]

Unnamed: 0,cell_id,donor,tissue,anatomical_position,method,cell_ontology_class,free_annotation,consensus_prediction,consensus_percentage,manually_annotated,...,10X_barcode,cDNAplate,libraryplate,well,notes,seqrun,cell_identifier,n_counts,n_genes,_cell_id
376407,AAACCTGCACCTGGTG_TSP2_Blood_NA_10X_5prime,TSP2,Blood,,10X,macrophage,macrophage,classical monocyte,0.86,True,...,AAACCTGCACCTGGTG_TSP2_Blood_NA_10X_1_4_5prime,,,,5prime,2,AAACCTGCACCTGGTG_TSP2_Blood_NA_10X_1_4_5prime,4376.0,1734,
376408,AAACCTGTCGCCGTGA_TSP2_Blood_NA_10X_5prime,TSP2,Blood,,10X,macrophage,macrophage,classical monocyte,0.86,True,...,AAACCTGTCGCCGTGA_TSP2_Blood_NA_10X_1_4_5prime,,,,5prime,2,AAACCTGTCGCCGTGA_TSP2_Blood_NA_10X_1_4_5prime,8893.0,2665,
376409,AAACGGGCACGGCGTT_TSP2_Blood_NA_10X_5prime,TSP2,Blood,,10X,"cd4-positive, alpha-beta t cell",CD4 t cell,cd4-positive helper t cell,1.00,True,...,AAACGGGCACGGCGTT_TSP2_Blood_NA_10X_1_4_5prime,,,,5prime,2,AAACGGGCACGGCGTT_TSP2_Blood_NA_10X_1_4_5prime,2519.0,1268,
376410,AAAGATGGTAGCGCAA_TSP2_Blood_NA_10X_5prime,TSP2,Blood,,10X,"cd4-positive, alpha-beta t cell",CD4 t cell,cd4-positive helper t cell,1.00,True,...,AAAGATGGTAGCGCAA_TSP2_Blood_NA_10X_1_4_5prime,,,,5prime,2,AAAGATGGTAGCGCAA_TSP2_Blood_NA_10X_1_4_5prime,6917.0,1849,
376411,AAAGTAGCAGGAATCG_TSP2_Blood_NA_10X_5prime,TSP2,Blood,,10X,macrophage,macrophage,classical monocyte,0.86,True,...,AAAGTAGCAGGAATCG_TSP2_Blood_NA_10X_1_4_5prime,,,,5prime,2,AAAGTAGCAGGAATCG_TSP2_Blood_NA_10X_1_4_5prime,2701.0,1159,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490192,TTGTAGGAGTGCAAGC_TSP2_Thymus_NA_10X_5prime,TSP2,Thymus,,10X,thymocyte,thymocyte,thymocyte,1.00,True,...,TTGTAGGAGTGCAAGC_TSP2_Thymus_NA_10X_1_4_5prime,,,,5prime,2,TTGTAGGAGTGCAAGC_TSP2_Thymus_NA_10X_1_4_5prime,3812.0,1585,
490193,TTTACTGCAATGACCT_TSP2_Thymus_NA_10X_5prime,TSP2,Thymus,,10X,thymocyte,thymocyte,thymocyte,1.00,True,...,TTTACTGCAATGACCT_TSP2_Thymus_NA_10X_1_4_5prime,,,,5prime,2,TTTACTGCAATGACCT_TSP2_Thymus_NA_10X_1_4_5prime,4356.0,1856,
490194,TTTATGCCAGGCTCAC_TSP2_Thymus_NA_10X_5prime,TSP2,Thymus,,10X,thymocyte,thymocyte,thymocyte,1.00,True,...,TTTATGCCAGGCTCAC_TSP2_Thymus_NA_10X_1_4_5prime,,,,5prime,2,TTTATGCCAGGCTCAC_TSP2_Thymus_NA_10X_1_4_5prime,3273.0,1332,
490195,TTTCCTCCAGGCTGAA_TSP2_Thymus_NA_10X_5prime,TSP2,Thymus,,10X,thymocyte,thymocyte,thymocyte,1.00,True,...,TTTCCTCCAGGCTGAA_TSP2_Thymus_NA_10X_1_4_5prime,,,,5prime,2,TTTCCTCCAGGCTGAA_TSP2_Thymus_NA_10X_1_4_5prime,3056.0,1315,


In [57]:
# add a mutation frequ
ency column
airr = calculate_v_mu_freq(airr)

SyntaxError: invalid syntax (3931430004.py, line 2)

# SCIRPY

In [58]:
# metadata from h5ad
met = pd.read_csv('../../metadata/TSP1_TSP15_metadata.csv.gz')
#met[(met.donor == "TSP1") & (met.method == 'smartseq2')].cell_id.str.rsplit(".", n = 5, expand = True)[0]
met['_cell_id'] = met['cell_id']
met['_cell_id'] = met['_cell_id'].replace('_[0-2]_[0-2]_5prime', '_5prime', regex = True, inplace = True)
met['_cell_id'] = met['_cell_id'].replace('.homo.gencode.v30.ERCC.chrM', '', regex = True, inplace = True)
#met.cell_id.replace('10X_[0-9]_[0-9]', '10X',regex = True, inplace = True)

  exec(code_obj, self.user_global_ns, self.user_ns)
