# Analysis of blasticidin samples

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

## Peak Finding
We keep this code here for reference; the analysis itself was run on the cluster. <br>
First, we merge all the blasticidin samples together.

In [None]:
%%bash
# Merge the three blasticidin BAM files (BSF_03, BSF_04, BSF_05) into a single
# temporary BAM under $TMPDIR.
# NOTE(review): the last input line ends with a continuation backslash that is
# followed by an empty line, so the merge command ends at that empty line.
samtools merge -f $TMPDIR/out.bam  \
BSF_03/res4/BSF_03/BSF_03_sorted.bam \
BSF_04/res4/BSF_04/BSF_04_sorted.bam \
BSF_05/res4/BSF_05/BSF_05_sorted.bam \

# Sort the merged reads into BSF_B_merge.bam, remove the unsorted intermediate
# and index the sorted file.
samtools sort -@ 10 -o $TMPDIR/BSF_B_merge.bam $TMPDIR/out.bam
rm $TMPDIR/out.bam
samtools index $TMPDIR/BSF_B_merge.bam

In [None]:
#copy the BAM file of the control sample (BSF_01) into $TMPDIR,
#which is where the peak-calling cell below reads it from
!cp BSF_01/res4/BSF_01/BSF_01_sorted.bam $TMPDIR/

In [None]:
#call peaks with MACS2, using the merged B samples BAM as treatment (-t)
#and the BSF_01 BAM as control (-c); results go to merge_macs_BSF_B
#These parameters were found after several rounds of trial and error
#NOTE(review): --broad is passed twice, and the line ending in "--keep-dup all"
#has no space before its continuation backslash -- confirm the command is
#assembled as intended
!mkdir -p merge_macs_BSF_B
!macs2 callpeak -t $TMPDIR/BSF_B_merge.bam -f BAMPE \
-c $TMPDIR/BSF_01_sorted.bam \
--min-length 300 --max-gap 1 --broad --nomodel --keep-dup all\
-q 1 --broad-cutoff 1 --broad --llocal 3000 --outdir 'merge_macs_BSF_B'

In [None]:
#load the MACS2 peak table and write a minimal bed-like file with
#the first three columns (chr, start, end) plus a constant coverage column;
#coverage is set to one for visualization
#NOTE(review): start/end are written exactly as they appear in the MACS2 .xls
#table -- confirm this matches the coordinate convention the viewer expects
peaks_xls = pd.read_csv('merge_macs_BSF_B/NA_peaks.xls',sep='\t',comment='#')
#.assign returns a new frame, so adding the column does not assign into a
#column subset of the original table (which raises SettingWithCopyWarning)
indf = peaks_xls[list(peaks_xls.columns[0:3])].assign(coverage=1)
indf.to_csv('merge_macs_BSF_B/merge_macs_BSF_B.bed',sep='\t',index=False,header=False)
indf.head()

Unnamed: 0,chr,start,end,coverage
0,11L3_v3,22346,23528,1
1,11L3_v3,26393,26954,1
2,11L3_v3,27519,28717,1
3,11L3_v3,44970,45408,1
4,11L3_v3,50065,50477,1


In [None]:
#we make a SAF file (GeneID, Chr, Start, End, Strand) from the output of the
#peak finding algorithm (MACS2), to use it as annotation for featureCounts
peaks_xls = pd.read_csv('merge_macs_BSF_B/NA_peaks.xls',sep='\t',comment='#')
#the last column of the table is used as the peak id, the first three as chr/start/end
saf_columns = [peaks_xls.columns[-1]] + list(peaks_xls.columns[0:3])
#.copy() gives an independent frame, so adding 'Strand' below does not
#raise SettingWithCopyWarning
indf = peaks_xls[saf_columns].copy()
indf.columns = ['GeneID','Chr','Start','End']
#peaks carry no strand information
indf['Strand']='.'
indf.to_csv('merge_macs_BSF_B/merge_macs_BSF_B.SAF',sep='\t',index=False)
indf.head()

Unnamed: 0,GeneID,Chr,Start,End,Strand
0,NA_peak_1,11L3_v3,22346,23528,.
1,NA_peak_2,11L3_v3,26393,26954,.
2,NA_peak_3,11L3_v3,27519,28717,.
3,NA_peak_4,11L3_v3,44970,45408,.
4,NA_peak_5,11L3_v3,50065,50477,.


## Counting
We count, for all samples, the total reads and the reads carrying the barcode <br>in the 4 different orientations (F, R, FR, RR).

In [None]:
#we now count the read pairs falling in each peak (SAF file written above) for:
# - the full BAM of the control (BSF_01) and of each B sample (BSF_03, BSF_04, BSF_05)
# - the barcoded subsets of every sample, one BAM per orientation (_F, _R, _FR, _RR)
#NOTE(review): the BAM files are given as absolute cluster paths here, unlike
#the relative paths used in the peak-finding cells
!featureCounts -p -B -C -M -O -T 8 -F SAF -a 'merge_macs_BSF_B/merge_macs_BSF_B.SAF' \
-o 'merge_macs_BSF_B/merge_macs_BSF_B_counts.txt' \
/cluster/majf_lab/mtinti/UTR/BSF_01/res4/BSF_01/BSF_01_sorted.bam \
/cluster/majf_lab/mtinti/UTR/BSF_03/res4/BSF_03/BSF_03_sorted.bam \
/cluster/majf_lab/mtinti/UTR/BSF_04/res4/BSF_04/BSF_04_sorted.bam \
/cluster/majf_lab/mtinti/UTR/BSF_05/res4/BSF_05/BSF_05_sorted.bam \
/cluster/majf_lab/mtinti/UTR/BSF_01/res4/BSF_01/BSF_01_sorted_F.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_01/res4/BSF_01/BSF_01_sorted_R.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_01/res4/BSF_01/BSF_01_sorted_FR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_01/res4/BSF_01/BSF_01_sorted_RR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_03/res4/BSF_03/BSF_03_sorted_F.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_03/res4/BSF_03/BSF_03_sorted_R.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_03/res4/BSF_03/BSF_03_sorted_FR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_03/res4/BSF_03/BSF_03_sorted_RR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_04/res4/BSF_04/BSF_04_sorted_F.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_04/res4/BSF_04/BSF_04_sorted_R.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_04/res4/BSF_04/BSF_04_sorted_FR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_04/res4/BSF_04/BSF_04_sorted_RR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_05/res4/BSF_05/BSF_05_sorted_F.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_05/res4/BSF_05/BSF_05_sorted_R.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_05/res4/BSF_05/BSF_05_sorted_FR.bam  \
/cluster/majf_lab/mtinti/UTR/BSF_05/res4/BSF_05/BSF_05_sorted_RR.bam

## Read in the peak counts

In [None]:
#load the featureCounts table of all samples, indexed by its first column (the peak id)
df = pd.read_csv('merge_macs_BSF_B/merge_macs_BSF_B_counts.txt',comment='#',sep='\t',index_col=[0])
#every column after the first five is named after the path of a BAM file:
#shorten it to the bare sample name, e.g. .../BSF_01_sorted_F.bam -> BSF_01_F
counts_col = list(df.columns[5:])
new_names = {}
for bam_path in counts_col:
    file_name = bam_path.split('/')[-1]
    new_names[bam_path] = file_name.split('.')[0].replace('_sorted','')
df = df.rename(columns=new_names)
df.head()

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,BSF_01,BSF_03,BSF_04,BSF_05,BSF_01_F,...,BSF_03_FR,BSF_03_RR,BSF_04_F,BSF_04_R,BSF_04_FR,BSF_04_RR,BSF_05_F,BSF_05_R,BSF_05_FR,BSF_05_RR
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NA_peak_1,11L3_v3,22346,23528,.,1183,221,689,470,354,5,...,14,23,6,1,2,11,3,0,3,8
NA_peak_2,11L3_v3,26393,26954,.,562,47,94,35,13,4,...,0,0,2,0,0,0,1,0,0,0
NA_peak_3,11L3_v3,27519,28717,.,1199,25,189,282,422,0,...,11,0,1,5,15,0,2,14,23,0
NA_peak_4,11L3_v3,44970,45408,.,439,48,225,57,100,2,...,4,1,1,1,0,0,2,2,1,0
NA_peak_5,11L3_v3,50065,50477,.,413,124,206,83,100,0,...,22,0,0,12,7,0,0,9,7,0


In [None]:
#Prepare a final table: start from the peak coordinates,
#keeping the peak ids of the count table as index
final_table = df[['Chr','Start','End']].copy()
final_table.head()

Unnamed: 0_level_0,Chr,Start,End
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NA_peak_1,11L3_v3,22346,23528
NA_peak_2,11L3_v3,26393,26954
NA_peak_3,11L3_v3,27519,28717
NA_peak_4,11L3_v3,44970,45408
NA_peak_5,11L3_v3,50065,50477


In [None]:
#[n for n in df.columns if '_03' in n]

## Sum up the barcoded reads
We create one column with the sum of the barcoded reads in <br>
forward (F/RR) and reverse complement (R/FR) orientation, <br>
for both the control and the treatments.

In [None]:
#for the control (one sample) and the treatment (three samples) we store:
# <group>_total : sum of the total read counts of the samples
# <group>_F     : sum of the barcoded reads counted as forward (F and RR BAMs)
# <group>_R     : sum of the barcoded reads counted as reverse complement (R and FR BAMs)
sample_groups = (('Control', ['BSF_01']),
                 ('Treatment', ['BSF_03','BSF_04','BSF_05']))
for group_name, samples in sample_groups:
    final_table[f'{group_name}_total'] = sum(df[sample] for sample in samples)
    final_table[f'{group_name}_F'] = sum(df[f'{sample}_{tag}']
                                         for tag in ('F','RR') for sample in samples)
    final_table[f'{group_name}_R'] = sum(df[f'{sample}_{tag}']
                                         for tag in ('R','FR') for sample in samples)

#fraction of barcoded treatment reads in forward orientation:
#this is our estimate of the orientation of the peak, a number between 0 and 1.
#For a peak, Treatment_fraction_F == 1 means that all the barcoded reads
#were found in forward orientation.
#This is clearly an approximation, as it is sometimes difficult to
#detect peaks that span the full barcoded region
barcoded_reads = final_table['Treatment_F'] + final_table['Treatment_R']
final_table['Treatment_fraction_F'] = final_table['Treatment_F'] / barcoded_reads
#the reverse fraction is, by definition, the complement of the forward one
final_table['Treatment_fraction_R'] = 1 - final_table['Treatment_fraction_F']
final_table.head()

Unnamed: 0_level_0,Chr,Start,End,Control_total,Control_F,Control_R,Treatment_total,Treatment_F,Treatment_R,Treatment_fraction_F,Treatment_fraction_R
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
NA_peak_1,11L3_v3,22346,23528,221,8,9,1513,54,25,0.683544,0.316456
NA_peak_2,11L3_v3,26393,26954,47,4,0,142,14,0,1.0,0.0
NA_peak_3,11L3_v3,27519,28717,25,0,1,893,5,71,0.065789,0.934211
NA_peak_4,11L3_v3,44970,45408,48,2,0,382,6,13,0.315789,0.684211
NA_peak_5,11L3_v3,50065,50477,124,4,12,389,0,72,0.0,1.0


In [None]:
#now we split the total treatment reads by orientation, assuming that they
#are oriented in the same proportion as the barcoded reads
for orientation in ('F','R'):
    final_table[f'Treatment_total_corrected_{orientation}'] = (
        final_table['Treatment_total'] * final_table[f'Treatment_fraction_{orientation}'])
final_table.head()

Unnamed: 0_level_0,Chr,Start,End,Control_total,Control_F,Control_R,Treatment_total,Treatment_F,Treatment_R,Treatment_fraction_F,Treatment_fraction_R,Treatment_total_corrected_F,Treatment_total_corrected_R
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
NA_peak_1,11L3_v3,22346,23528,221,8,9,1513,54,25,0.683544,0.316456,1034.202532,478.797468
NA_peak_2,11L3_v3,26393,26954,47,4,0,142,14,0,1.0,0.0,142.0,0.0
NA_peak_3,11L3_v3,27519,28717,25,0,1,893,5,71,0.065789,0.934211,58.75,834.25
NA_peak_4,11L3_v3,44970,45408,48,2,0,382,6,13,0.315789,0.684211,120.631579,261.368421
NA_peak_5,11L3_v3,50065,50477,124,4,12,389,0,72,0.0,1.0,0.0,389.0


In [None]:
#create a bed file from the peak start/end 
bed_file = df[['Chr', 'Start', 'End']]
bed_file=bed_file.reset_index()
bed_file = bed_file[['Chr', 'Start', 'End', 'Geneid']]
bed_file=bed_file.rename({'Geneid':'Peak_Id'},axis=1)
bed_file[['Chr', 'Start', 'End', 'Peak_Id']].to_csv('bla_peaks_final.bed', sep='\t',
                                                   header=False, index=False)
!head 'bla_peaks_final.bed'

11L3_v3	22346	23528	NA_peak_1
11L3_v3	26393	26954	NA_peak_2
11L3_v3	27519	28717	NA_peak_3
11L3_v3	44970	45408	NA_peak_4
11L3_v3	50065	50477	NA_peak_5
11L3_v3	54025	54923	NA_peak_6
5K5_v5.1	1983	3503	NA_peak_7
5K5_v5.1	3902	5898	NA_peak_8
5K5_v5.1	6985	8467	NA_peak_9
5K5_v5.1	10746	11356	NA_peak_10


In [None]:
#From the re-annotation of the UTRs
#we defined a set of genes that we do not believe to be real,
#so we want to exclude those from the analysis.
#The file is read without a header row: first column is named gene, second reason
bl =pd.read_csv('blacklisted gene.csv',engine='python',header=None)
bl.columns = ['gene','reason']
#strip leading/trailing whitespace from the gene ids
bl['gene']=[n.strip() for n in bl['gene']]
print(bl.shape)
bl.tail()

(284, 2)


Unnamed: 0,gene,reason
279,Tb927.9.15950,UTR_contained
280,Tb927.9.6570,UTR_contained
281,Tb927.9.6830,UTR_contained
282,Tb927.9.7500,UTR_contained
283,Tb927.9.7850,UTR_contained


In [None]:
# From the re-annotation of the UTRs
# we parse a file with UTRs coordinates
import numpy as np
import pandas as pd

def _split_utr3(location):
    """Return (chromosome, start, end) from a 'chrom:start..end[:r]' string.

    A missing value (its string form is 'nan') gives three NaNs.
    Start and end are returned as strings, exactly as found in the file.
    """
    if str(location) == 'nan':
        return (np.nan, np.nan, np.nan)
    chrom_and_span = location.split(':')
    bounds = chrom_and_span[1].split('..')
    return (chrom_and_span[0], bounds[0], bounds[1])

df = pd.read_csv('UTRs.txt',sep='\t',header=None)
df.columns = ['Gene_id','utr5','utr3']
# only the 3' UTR location is turned into bed-like columns
utr3_fields = [_split_utr3(location) for location in df['utr3']]
df['Chr'] = [fields[0] for fields in utr3_fields]
df['Start'] = [fields[1] for fields in utr3_fields]
df['End'] = [fields[2] for fields in utr3_fields]
df['Score'] = '.'
# a location ending in 'r' is on the reverse strand
df['Strand'] = ['-' if str(location)[-1] == 'r' else '+' for location in df['utr3']]
# drop the genes without a 3' UTR, then the UTRs shorter than one base
df = df.dropna(subset=['End'])
df['diff'] = df['End'].astype(int) - df['Start'].astype(int)
df = df[df['diff'] >= 1]
df.head(5)

Unnamed: 0,Gene_id,utr5,utr3,Chr,Start,End,Score,Strand,diff
0,Tb927.9.13440,Tb927_09_v5.1:2132976..2133162:r,Tb927_09_v5.1:2127479..2128238:r,Tb927_09_v5.1,2127479,2128238,.,-,759
1,Tb927.9.13430,Tb927_09_v5.1:2126776..2127309:r,Tb927_09_v5.1:2121838..2125437:r,Tb927_09_v5.1,2121838,2125437,.,-,3599
2,Tb927.9.13380,Tb927_09_v5.1:2120866..2121171:r,Tb927_09_v5.1:2118125..2119605:r,Tb927_09_v5.1,2118125,2119605,.,-,1480
3,Tb927.9.13360,Tb927_09_v5.1:2117171..2117516:r,Tb927_09_v5.1:2108712..2111431:r,Tb927_09_v5.1,2108712,2111431,.,-,2719
4,Tb927.9.8905,Tb927_09_v5.1:1431796..1431807:r,Tb927_09_v5.1:1431368..1431516:r,Tb927_09_v5.1,1431368,1431516,.,-,148


In [None]:
#drop every UTR whose gene id is listed in the blacklist
is_blacklisted = df['Gene_id'].isin(bl['gene'])
df = df[~is_blacklisted]

In [None]:
#we write the 3' UTR coordinates as a six-column bed file, without header or index
utr3_bed_columns = ['Chr', 'Start', 'End','Gene_id', 'Score', 'Strand']
df[utr3_bed_columns].to_csv('UTR3_final.bed', sep='\t', header=False, index=False)

## Find peak / UTRs overlaps
We use `bedtools intersect` to find the overlapping regions <br>
between UTRs and peaks.

In [None]:
%%bash
# Intersect the peaks (-a) with the 3' UTRs (-b). Both bed files are passed
# through `bedtools sort` on the fly via process substitution.
# -wao: per the bedtools documentation, each A entry is written together with
# the overlapping B entry and the overlap size, and A entries without any
# overlap are reported as well.
bedtools intersect -a <(bedtools sort -i bla_peaks_final.bed) \
-b <(bedtools sort -i UTR3_final.bed) -wao > bla_peak_belongs_to_3UTR.bed

In [None]:
#use awk to keep only the lines whose third-from-last field is not "-1"
#NOTE(review): presumably "-1" in that field is the placeholder bedtools writes
#for peaks without any UTR hit, so this keeps the peaks overlapping a UTR -- confirm.
#This does not test whether a peak is fully contained in a UTR
#(that is computed further down by is_fully_contained)
!awk '$(NF-2) != "-1"' bla_peak_belongs_to_3UTR.bed > bla_peak_belongs_to_3UTR_filtered.bed

In [None]:
#now we parse the filtered bedtools output
#columns: the 4 peak fields, the 6 UTR fields and the overlap length
df = pd.read_csv('bla_peak_belongs_to_3UTR_filtered.bed',sep='\t',header=None)
df.columns = ['Chromosome','Peak_Start','Peak_End','Peak_Name',
              'UTR_Chromosome','UTR_Start','UTR_End','UTR_Gene_ID','UTR_Score','UTR_Strand','UTR_Peak_overlap']
#the UTR score and the UTR chromosome are not needed downstream
del df['UTR_Score']
del df['UTR_Chromosome']

#we separate the peaks that overlap with UTRs in the forward (+) or reverse complement (-) orientation
df_plus = df[df['UTR_Strand']=='+']
df_minus = df[df['UTR_Strand']=='-']
#now it is easy to reduce peaks that overlap with multiple UTRs to a single row per strand
#NOTE(review): keep='first'/'last' follow the row order of the bedtools output;
#the choices below assume the rows of a peak are ordered by UTR position -- confirm
#we keep the most 5' if the peak is in the forward (+) orientation
df_plus = df_plus.drop_duplicates(subset=['Peak_Name'],keep='first')
#we keep the most 3' if the peak is in the reverse complement (-) orientation
df_minus = df_minus.drop_duplicates(subset=['Peak_Name'],keep='last')
#a peak overlapping UTRs on both strands keeps one row per strand
df = pd.concat([df_plus,df_minus])
df=df.sort_values(by=['Chromosome','Peak_Start','Peak_End'])
#spot check of a single peak
df[df['Peak_Name']=='NA_peak_1371']

Unnamed: 0,Chromosome,Peak_Start,Peak_End,Peak_Name,UTR_Start,UTR_End,UTR_Gene_ID,UTR_Strand,UTR_Peak_overlap
803,Tb927_07_v5.1,297730,300530,NA_peak_1371,297863,297962,Tb927.7.1150,+,99


In [None]:
# now we attach the read counts of each peak (every column of final_table
# after Chr/Start/End) to the table of the peaks overlapping with UTRs.
# The left join matches Peak_Name against the final_table index,
# so we keep only the peaks that overlap with a UTR
print(df.shape)
peak_counts = final_table.iloc[:,3:]
df = pd.merge(df, peak_counts, how='left', left_on='Peak_Name', right_index=True)
print(df.shape)

(2068, 9)
(2068, 19)


In [None]:
df[df['UTR_Gene_ID']=='Tb927.11.1110']

Unnamed: 0,Chromosome,Peak_Start,Peak_End,Peak_Name,UTR_Start,UTR_End,UTR_Gene_ID,UTR_Strand,UTR_Peak_overlap,Control_total,Control_F,Control_R,Treatment_total,Treatment_F,Treatment_R,Treatment_fraction_F,Treatment_fraction_R,Treatment_total_corrected_F,Treatment_total_corrected_R
1857,Tb927_11_v5.1,314906,315992,NA_peak_3980,314972,316981,Tb927.11.1110,+,1020,430,9,12,10462,286,95,0.750656,0.249344,7853.364829,2608.635171


In [None]:
df[df['UTR_Gene_ID']=='Tb927.11.510']

Unnamed: 0,Chromosome,Peak_Start,Peak_End,Peak_Name,UTR_Start,UTR_End,UTR_Gene_ID,UTR_Strand,UTR_Peak_overlap,Control_total,Control_F,Control_R,Treatment_total,Treatment_F,Treatment_R,Treatment_fraction_F,Treatment_fraction_R,Treatment_total_corrected_F,Treatment_total_corrected_R
1837,Tb927_11_v5.1,125016,127658,NA_peak_3960,119846,127190,Tb927.11.510,-,2174,693,7,18,31255,105,968,0.097856,0.902144,3058.504194,28196.495806


In [None]:
#df.head(20)

In [None]:
#69815-69903

In [None]:
def format_download(X):
    """Build the 'chromosome:start..end:orientation' string of a peak/UTR overlap.

    For convenience: the string can be used at TriTrypDB to download
    the nucleotide sequence.

    X is one row of the peak/UTR table (anything indexable by column name).
    The reported region is the intersection of the peak and the UTR, and the
    orientation is 'r' for a UTR on the '-' strand, 'f' otherwise.
    """
    orient = 'r' if X['UTR_Strand']=='-' else 'f'
    # the overlap starts at the rightmost start and ends at the leftmost end
    start = max(int(X['Peak_Start']), int(X['UTR_Start']))
    end = min(int(X['Peak_End']), int(X['UTR_End']))
    return '{}:{}..{}:{}'.format(X['Chromosome'], start, end, orient)

In [None]:
df['download_seq']=df.apply(format_download,axis=1)

In [None]:
def is_fully_contained(X):
    """Report whether the peak is fully contained in the UTR.

    X is one row of the peak/UTR table. Returns True when the peak starts
    at or after the UTR start and ends at or before the UTR end.
    """
    peak_start, peak_end = int(X['Peak_Start']), int(X['Peak_End'])
    utr_start, utr_end = int(X['UTR_Start']), int(X['UTR_End'])
    return utr_start <= peak_start and peak_end <= utr_end
df['is_fully_contained'] = df.apply(is_fully_contained,axis=1)

In [None]:
df[df['is_fully_contained']].shape

(217, 21)

In [None]:
print(df.shape)
df.head()

(2068, 21)


Unnamed: 0,Chromosome,Peak_Start,Peak_End,Peak_Name,UTR_Start,UTR_End,UTR_Gene_ID,UTR_Strand,UTR_Peak_overlap,Control_total,...,Control_R,Treatment_total,Treatment_F,Treatment_R,Treatment_fraction_F,Treatment_fraction_R,Treatment_total_corrected_F,Treatment_total_corrected_R,download_seq,is_fully_contained
0,Tb927_01_v5.1,69815,70745,NA_peak_118,69661,69903,Tb927.1.180,-,88,6972,...,71,15571,408,57,0.877419,0.122581,13662.296774,1908.703226,Tb927_01_v5.1:69815..69903:r,False
1,Tb927_01_v5.1,85792,86127,NA_peak_120,85611,85853,Tb927.1.220,-,61,3363,...,140,7286,523,154,0.772526,0.227474,5628.623338,1657.376662,Tb927_01_v5.1:85792..85853:r,False
3,Tb927_01_v5.1,202076,204347,NA_peak_125,204160,204417,Tb927.1.540,-,187,5250,...,173,22674,542,438,0.553061,0.446939,12540.110204,10133.889796,Tb927_01_v5.1:204160..204347:r,False
4,Tb927_01_v5.1,230915,232950,NA_peak_126,231675,232503,Tb927.1.700,-,828,122,...,4,3232,10,202,0.04717,0.95283,152.45283,3079.54717,Tb927_01_v5.1:231675..232503:r,False
5,Tb927_01_v5.1,263874,264334,NA_peak_129,263913,264383,Tb927.1.880,-,421,285,...,28,6872,915,7,0.992408,0.007592,6819.826464,52.173536,Tb927_01_v5.1:263913..264334:r,False


In [None]:
df.columns

Index(['Chromosome', 'Peak_Start', 'Peak_End', 'Peak_Name', 'UTR_Start',
       'UTR_End', 'UTR_Gene_ID', 'UTR_Strand', 'UTR_Peak_overlap',
       'Control_total', 'Control_F', 'Control_R', 'Treatment_total',
       'Treatment_F', 'Treatment_R', 'Treatment_fraction_F',
       'Treatment_fraction_R', 'Treatment_total_corrected_F',
       'Treatment_total_corrected_R', 'download_seq', 'is_fully_contained'],
      dtype='object')

In [None]:
#we now save the table for the paper
#(with pandas' default settings the index is written as the first column)
df.to_csv('Paper_Table_Blasticidine.csv')

In [None]:
# show versions of packages
# adopted from https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook
import pkg_resources
import types
import sys
def get_imports():
    """Yield the lower-cased root package name of each module or class found in globals().

    Imported modules contribute the first component of their ``__name__``;
    classes contribute the first component of their ``__module__``.
    Any other global (plain variables, functions, ...) is skipped, so that
    variable names are never reported as if they were package names.
    The same package name can be yielded more than once.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "sklearn": "scikit-learn"
    }
    # Iterate over a snapshot: a global created while the generator is being
    # consumed would otherwise change the dictionary during iteration.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # neither a module nor a class: not an import
            continue
        yield poorly_named_packages.get(name, name).lower()
# unique package names collected from the notebook globals
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names
# of installed packages vs. imported packages
modules = []
# 1) names matching an interpreter built-in module are reported without a version
for m in sys.builtin_module_names:
    if m.lower() in imports and m !='builtins':
        modules.append((m,'Python BuiltIn'))
        imports.remove(m.lower())

# 2) names matching an installed distribution are reported with its version;
#    a matched name is removed from the list so it is reported only once
for m in pkg_resources.working_set:
    if m.project_name.lower() in imports and m.project_name!="pip":
        modules.append((m.project_name, m.version))
        imports.remove(m.project_name.lower())

# 3) whatever is still unmatched but loaded in sys.modules is tagged 'unknown'
for m in sys.modules:
    if m.lower() in imports and m !='builtins':
        modules.append((m,'unknown'))

# print('System=='+platform.system()+' '+platform.release()+'; Version=='+platform.version())
# print the name==version pairs, leaving out the entries tagged 'unknown'
for r in modules:
    if 'unknown' in r[1]:
        continue
    print("{}=={}".format(*r))

sys==Python BuiltIn
numpy==1.21.6
pandas==1.4.2


In [None]:
#bedtools-2.31

In [None]:
!python --version

Python 3.10.10
