Modified from https://github.com/melanieabrams-pub/RH-seq_with_barcoding/blob/main/annotate_poolfile_for_barseq_v1.2_JSkerker.ipynb and 
https://github.com/melanieabrams-pub/kluyv_RH-seq_with_barcoding/blob/main/annotate_poolfile_kluyv.py

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

#widen pandas' column display so long cell values (e.g. the mapping lists in the poolfile) stay readable
pd.options.display.max_colwidth = 0

### Load GFF annotations

In [4]:
#path to the annotation table that is loaded in the next cell; normally this does not need to change
#NOTE(review): this comment used to say the file is the most recent annotation from 5/12/22, but the
#file name below is dated 08_08_2023 - confirm which date is correct
gff_file = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

In [5]:
#read the tab-separated annotation table into a DataFrame
gff = pd.read_csv(gff_file, delimiter='\t')

In [7]:
#list the distinct scaffold names present in the annotation table
t = pd.unique(gff['scaffold'])
print(t)

['chr1' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7']


In [8]:
#preview the first five annotation rows
gff.head(n=5)

Unnamed: 0,scaffold,source,type,start,stop,strand,info,ID
0,chr1,JGI,CDS,1381.0,2142.0,+,ID=gene_1,proteinId=2114025
1,chr1,JGI,CDS,2744.0,3343.0,-,ID=gene_2,proteinId=2293935
2,chr1,JGI,CDS,3344.0,3817.0,-,ID=gene_3,proteinId=2293936
3,chr1,JGI,CDS,5330.0,6385.0,-,ID=gene_4,proteinId=2051335
4,chr1,JGI,CDS,15458.0,25533.0,+,ID=gene_5,proteinId=2121898


In [9]:
#display the annotation table; pandas abbreviates the middle rows of a long frame
gff

Unnamed: 0,scaffold,source,type,start,stop,strand,info,ID
0,chr1,JGI,CDS,1381.0,2142.0,+,ID=gene_1,proteinId=2114025
1,chr1,JGI,CDS,2744.0,3343.0,-,ID=gene_2,proteinId=2293935
2,chr1,JGI,CDS,3344.0,3817.0,-,ID=gene_3,proteinId=2293936
3,chr1,JGI,CDS,5330.0,6385.0,-,ID=gene_4,proteinId=2051335
4,chr1,JGI,CDS,15458.0,25533.0,+,ID=gene_5,proteinId=2121898
...,...,...,...,...,...,...,...,...
9105,chr7,JGI,CDS,4089438.0,4091337.0,+,ID=gene_10836,proteinId=103797
9106,chr7,JGI,CDS,4095568.0,4097284.0,+,ID=gene_10837,proteinId=2071098
9107,chr7,JGI,CDS,4098122.0,4098907.0,-,ID=gene_10838,proteinId=2114022
9108,chr7,JGI,CDS,4100606.0,4101293.0,-,ID=gene_10840,proteinId=2114023


### Match barcode insert locations to gene annotations

In [10]:
#print the directory the notebook is running from
!pwd

/auto/sahara/namib/home/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Sequencing_with_the_13k_mutants


In [23]:
#CHANGE IF NEEDED: path to the COMBINED poolfile generated in the TnSeq mapping step
#(it is read as a tab-separated table in the next cell)

poolfile = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Sequencing_with_the_13k_mutants/for_analysis/TNSeq_13k_mutants_mapping_output/Tnseq_13k_mutants_Mar_22_2024_without_vector_seq_poolfile'

#CHANGE IF NEEDED: path where the annotated poolfile will be written at the end of the notebook

annotated_poolfile_path = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Sequencing_with_the_13k_mutants/for_analysis/TNSeq_13k_mutants_mapping_output/Tnseq_13k_mutants_03_26_2024_annotated.csv'

In [24]:
#read the tab-separated poolfile and preview its first five rows
df = pd.read_csv(poolfile, delimiter='\t')
df.head(n=5)

Unnamed: 0,barcode,rcbarcode,nTot,n,scaffold,strand,pos,type,nMainLocation,nInsert,All genomic mappings,All insert mappings
0,TTCATTCAGGATCTTTGCTC,GAGCAAAGATCCTGAATGAA,55,55,chr1,+,8764372,Single,52,0,"[('chr1:8764372:+', 52), ('chr6:477092:-', 1), ('chr7:1928216:+', 1), ('chr4:2631691:-', 1)]",{'Null:0:+': 0}
1,ACGTATTCGCAGTCCTCATA,TATGAGGACTGCGAATACGT,15,15,chr1,-,526973,Single,15,0,"[('chr1:526973:-', 15)]",{'Null:0:+': 0}
2,GCGGCGGCAGCGTTCCTCGC,GCGAGGAACGCTGCCGCCGC,54,54,chr1,-,2235514,Single,51,0,"[('chr1:2235514:-', 51), ('chr2:2382780:-', 1), ('chr3:2906544:-', 1), ('chr2:2797042:-', 1)]",{'Null:0:+': 0}
3,GTTCGCTGTGCGGCGGATGA,TCATCCGCCGCACAGCGAAC,10,10,chr3,-,2963746,Single,9,0,"[('chr3:2963746:-', 9), ('chr1:783370:+', 1)]",{'Null:0:+': 0}
4,GAATAGGTAATTAAGCCTGC,GCAGGCTTAATTACCTATTC,25,25,chr1,+,8013504,Single,22,1,"[('chr1:8013504:+', 22), ('chr5:3325007:-', 1), ('chr4:3047278:-', 1)]","{'Null:0:+': 0, 'insert:1521:-': 1}"


Check that each barcode in the dataframe is unique (the two numbers below should be the same):

In [25]:
#compare the number of poolfile rows with the number of distinct barcodes
n_insertions = len(df)
n_unique_barcodes = df['barcode'].nunique(dropna=False)

print('Number of insertions in poolfile: {}'.format(n_insertions))
print('Number of unique barcodes in poolfile: {}'.format(n_unique_barcodes))

#fail loudly instead of relying on reading the two numbers: later cells use the barcode
#as the index and as a dictionary key, so duplicated barcodes would silently overwrite each other
assert n_insertions == n_unique_barcodes, 'poolfile contains duplicated barcodes'

Number of insertions in poolfile: 5720
Number of unique barcodes in poolfile: 5720


In [26]:
#index the poolfile by barcode
#(the notebook this was adapted from used set_index('transposonLocation'), which raised an error with this poolfile)
df = df.set_index(keys='barcode')

In [27]:
#list the remaining columns now that 'barcode' has become the index
df.columns

Index(['rcbarcode', 'nTot', 'n', 'scaffold', 'strand', 'pos', 'type',
       'nMainLocation', 'nInsert', 'All genomic mappings',
       'All insert mappings'],
      dtype='object')

In [28]:
#preview the barcode-indexed poolfile
df.head(n=5)

Unnamed: 0_level_0,rcbarcode,nTot,n,scaffold,strand,pos,type,nMainLocation,nInsert,All genomic mappings,All insert mappings
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TTCATTCAGGATCTTTGCTC,GAGCAAAGATCCTGAATGAA,55,55,chr1,+,8764372,Single,52,0,"[('chr1:8764372:+', 52), ('chr6:477092:-', 1), ('chr7:1928216:+', 1), ('chr4:2631691:-', 1)]",{'Null:0:+': 0}
ACGTATTCGCAGTCCTCATA,TATGAGGACTGCGAATACGT,15,15,chr1,-,526973,Single,15,0,"[('chr1:526973:-', 15)]",{'Null:0:+': 0}
GCGGCGGCAGCGTTCCTCGC,GCGAGGAACGCTGCCGCCGC,54,54,chr1,-,2235514,Single,51,0,"[('chr1:2235514:-', 51), ('chr2:2382780:-', 1), ('chr3:2906544:-', 1), ('chr2:2797042:-', 1)]",{'Null:0:+': 0}
GTTCGCTGTGCGGCGGATGA,TCATCCGCCGCACAGCGAAC,10,10,chr3,-,2963746,Single,9,0,"[('chr3:2963746:-', 9), ('chr1:783370:+', 1)]",{'Null:0:+': 0}
GAATAGGTAATTAAGCCTGC,GCAGGCTTAATTACCTATTC,25,25,chr1,+,8013504,Single,22,1,"[('chr1:8013504:+', 22), ('chr5:3325007:-', 1), ('chr4:3047278:-', 1)]","{'Null:0:+': 0, 'insert:1521:-': 1}"


In [29]:
#drop the 'All insert mappings' column by name rather than by position: dropping
#df.columns[-1] removes a further column every time this cell is re-run
#errors='ignore' makes the cell a no-op when the column is already gone
df = df.drop(columns=['All insert mappings'], errors='ignore')

In [30]:
# df.columns = ['rcbarcode','nTot', 'n', 'scaffold', 'strand', 'pos', 'type',
#        'nMainLocation', 'nInsert', 'All genomic mappings',
#        'All insert mappings']

In [31]:
#confirm that the last mapping column is no longer present
df.head(n=5)

Unnamed: 0_level_0,rcbarcode,nTot,n,scaffold,strand,pos,type,nMainLocation,nInsert,All genomic mappings
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TTCATTCAGGATCTTTGCTC,GAGCAAAGATCCTGAATGAA,55,55,chr1,+,8764372,Single,52,0,"[('chr1:8764372:+', 52), ('chr6:477092:-', 1), ('chr7:1928216:+', 1), ('chr4:2631691:-', 1)]"
ACGTATTCGCAGTCCTCATA,TATGAGGACTGCGAATACGT,15,15,chr1,-,526973,Single,15,0,"[('chr1:526973:-', 15)]"
GCGGCGGCAGCGTTCCTCGC,GCGAGGAACGCTGCCGCCGC,54,54,chr1,-,2235514,Single,51,0,"[('chr1:2235514:-', 51), ('chr2:2382780:-', 1), ('chr3:2906544:-', 1), ('chr2:2797042:-', 1)]"
GTTCGCTGTGCGGCGGATGA,TCATCCGCCGCACAGCGAAC,10,10,chr3,-,2963746,Single,9,0,"[('chr3:2963746:-', 9), ('chr1:783370:+', 1)]"
GAATAGGTAATTAAGCCTGC,GCAGGCTTAATTACCTATTC,25,25,chr1,+,8013504,Single,22,1,"[('chr1:8013504:+', 22), ('chr5:3325007:-', 1), ('chr4:3047278:-', 1)]"


In [32]:
#per-barcode lookup against the annotations of one scaffold at a time
def get_gene_by_bc_position(pos):
    """Return the ID(s) of the annotation row(s) whose range contains ``pos``.

    The lookup runs against the module-level ``scaffold_gff`` frame, which the
    per-scaffold loop below rebinds to the annotation rows of the scaffold
    currently being processed. A row matches when start <= pos <= stop
    (both ends inclusive).

    Parameters
    ----------
    pos : number
        Insertion position of a barcode on the current scaffold.

    Returns
    -------
    str
        '' when no row contains ``pos``, the single ID when exactly one row
        does, or the IDs joined with ';' (in annotation order) when several do.
    """
    starts = scaffold_gff['start'].values
    stops = scaffold_gff['stop'].values

    #one vectorised comparison per barcode instead of a Python loop over every interval;
    #comparisons against NaN are False, so rows without coordinates never match
    in_gene = (starts <= pos) & (pos <= stops)

    #select IDs with the boolean mask rather than by interval label: a label lookup returns
    #several rows at once when two annotations share identical start/stop, which broke the join
    gene_list = [str(gene_id) for gene_id in scaffold_gff['ID'].values[in_gene]
                 if not pd.isna(gene_id)]

    #'' for no hit, the bare ID for one hit, ';'-separated IDs for several (should be rare)
    return ';'.join(gene_list)

barcode_to_gene_dict = {}

#dropna keeps sorted() from having to compare a missing scaffold name with strings
for scaffold in sorted(gff['scaffold'].dropna().unique()):

    scaffold_gff = gff[gff['scaffold']==scaffold]
    scaffold_barcodes = df[df['scaffold']==scaffold]

    #the row count is halved on the assumption that the annotation holds a promoter row per gene
    #NOTE(review): the rows displayed from this annotation file are all of type CDS with distinct
    #gene IDs - confirm whether the division by 2 still applies to this file
    print('Mapping {} barcodes on {} to {} genes on {}'.format(scaffold_barcodes.shape[0], scaffold,
                                                              round(scaffold_gff.shape[0]/2), scaffold))

    #for each barcode on this scaffold, record which gene(s) contain its position
    barcode_to_gene_dict.update(scaffold_barcodes['pos'].apply(get_gene_by_bc_position).to_dict())

#barcodes whose scaffold has no annotation rows were never looked up; give them '' (not NaN)
#so that every "no gene" case is represented the same way
df['gene'] = [barcode_to_gene_dict.get(barcode, '') for barcode in df.index]

Mapping 1867 barcodes on chr1 to 1356 genes on chr1
Mapping 999 barcodes on chr2 to 715 genes on chr2
Mapping 758 barcodes on chr3 to 605 genes on chr3
Mapping 548 barcodes on chr4 to 535 genes on chr4
Mapping 379 barcodes on chr5 to 492 genes on chr5
Mapping 433 barcodes on chr6 to 396 genes on chr6
Mapping 736 barcodes on chr7 to 455 genes on chr7


In [33]:
#the preview should now end with the new 'gene' column

df.head(n=5)

Unnamed: 0_level_0,rcbarcode,nTot,n,scaffold,strand,pos,type,nMainLocation,nInsert,All genomic mappings,gene
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TTCATTCAGGATCTTTGCTC,GAGCAAAGATCCTGAATGAA,55,55,chr1,+,8764372,Single,52,0,"[('chr1:8764372:+', 52), ('chr6:477092:-', 1), ('chr7:1928216:+', 1), ('chr4:2631691:-', 1)]",proteinId=2298894
ACGTATTCGCAGTCCTCATA,TATGAGGACTGCGAATACGT,15,15,chr1,-,526973,Single,15,0,"[('chr1:526973:-', 15)]",
GCGGCGGCAGCGTTCCTCGC,GCGAGGAACGCTGCCGCCGC,54,54,chr1,-,2235514,Single,51,0,"[('chr1:2235514:-', 51), ('chr2:2382780:-', 1), ('chr3:2906544:-', 1), ('chr2:2797042:-', 1)]",
GTTCGCTGTGCGGCGGATGA,TCATCCGCCGCACAGCGAAC,10,10,chr3,-,2963746,Single,9,0,"[('chr3:2963746:-', 9), ('chr1:783370:+', 1)]",proteinId=2304708
GAATAGGTAATTAAGCCTGC,GCAGGCTTAATTACCTATTC,25,25,chr1,+,8013504,Single,22,1,"[('chr1:8013504:+', 22), ('chr5:3325007:-', 1), ('chr4:3047278:-', 1)]",proteinId=97899


### The next cell is optional: it only prints summary counts. Skip to the cell after it to save the dataframe

In [22]:
#normalise missing values first: NaN != '' is True, so a barcode with a missing gene
#would otherwise be counted as lying in a coding sequence
gene_ids = df['gene'].fillna('')
in_gene = gene_ids != ''

print('{} barcodes not in coding sequences'.format((~in_gene).sum()))
print()

print('{} barcodes in coding sequences'.format(in_gene.sum()))
#the 'kl'/'km' prefix counts are kept from the notebook this was adapted from; with the
#'proteinId=...' IDs used in this annotation both counts come out as 0
print('{} barcodes in Kl coding sequences'.format(gene_ids.str.startswith('kl',na=False).sum()))
print('{} barcodes in Km coding sequences'.format(gene_ids.str.startswith('km',na=False).sum()))

3622 barcodes not in coding sequences

2098 barcodes in coding sequences
0 barcodes in Kl coding sequences
0 barcodes in Km coding sequences


In [34]:
#write the annotated poolfile (barcode index included)
#note: the output is tab-separated even though the configured file name ends in .csv
df.to_csv(path_or_buf=annotated_poolfile_path, sep='\t')

In [None]:
#look up the annotation row(s) for a single ID
#NOTE(review): 'kmYHR034C' looks like an ID from the notebook this was adapted from - confirm it exists in this annotation
gff.loc[gff['ID'] == 'kmYHR034C']