In [2]:
import pandas
import gffutils
import pickle

In [3]:
negative_correlation_partitions_csv = 'negative_correlation_partitions.csv'
gff_file = '../ref/GCF_000008685.2_ASM868v2_genomic.gff'
db_file = '../ref/B31_gffdb.db'

In [4]:
id2plasmid = {
    'NC_001318.1':'chromosome',
    'NC_000957.1':'lp5',
    'NC_001904.1':'cp9',
    'NC_001849.2':'lp17',
    'NC_000955.2':'lp21',
    'NC_001850.1':'lp25',
    'NC_001903.1':'cp26',
    'NC_001851.2':'lp28-1',
    'NC_001852.1':'lp28-2',
    'NC_001853.1':'lp28-3',
    'NC_001854.1':'lp28-4',
    'NC_000948.1':'cp32-1',
    'NC_000949.1':'cp32-3',
    'NC_000950.1':'cp32-4',
    'NC_000951.1':'cp32-6',
    'NC_000952.1':'cp32-7',
    'NC_000953.1':'cp32-8',
    'NC_000954.1':'cp32-9',
    'NC_001855.1':'lp36',
    'NC_001856.1':'lp38',
    'NC_001857.2':'lp54',
    'NC_000956.1':'lp56'
}

In [5]:
gffutils.create_db(gff_file, dbfn=db_file, force=True, keep_order=True, merge_strategy='merge')
db = gffutils.FeatureDB(db_file, keep_order=True)

In [6]:
# THis is specifically for the negative correlations list of genes that Ben gave me. It is not required for what I am doing below.
neg_corr_df = pandas.read_csv(negative_correlation_partitions_csv)
neg_corr_df['Gene'] = neg_corr_df['Gene'].apply(lambda x: 'gene-' + x)
list_of_genes = neg_corr_df['Gene'].tolist()
neg_corr_df

Unnamed: 0,Gene,Partition
0,gene-BB_RS05155,9
1,gene-BB_RS05160,9
2,gene-BB_RS05165,9
3,gene-BB_RS05235,9
4,gene-BB_RS05170,9
...,...,...
560,gene-BB_RS06060,18
561,gene-BB_RS07040,13
562,gene-BB_RS03170,11
563,gene-BB_RS04390,4


In [7]:
new_rows = []

for gene in list_of_genes:
    #gene_id = "gene-"+gene
    gene_feature = db[gene]
    children = db.children(gene_feature, level=1)
    for child in children:
        new_row = {
        'Plasmid': id2plasmid[child.seqid],
        'Product': child.attributes.get('product', [''])[0],
        'Protein_ID': child.attributes.get('protein_id', [''])[0],
        'Gene': child.attributes.get('Parent', [''])[0],
        'BB_Name': gene_feature.attributes.get('old_locus_tag', [''])[0],
        }
        new_rows.append(new_row)

df_new_rows = pandas.DataFrame(new_rows)
df_merged = neg_corr_df.merge(df_new_rows, how='left', on='Gene')


In [106]:
#df_merged = df_merged.sort_values(['Partition','Plasmid']).reset_index(drop=True)

In [107]:
# don't overwrite this
#df_merged.to_csv('negative_correlation_partitions_annotated_v1.csv', index=False)

In [8]:
## Okay so now we are going to make a dictionary for gene ID to name/product for ALL proteins and genes.

all_cds = db.features_of_type('CDS')

In [24]:
for feature in all_cds:
    print(feature)

In [34]:
# Initialize the dictionary
b31_gene_dict = {}

# Iterate over gene features in the database
for gene in db.features_of_type('CDS'):
    # Assuming 'ID' is the attribute for gene_id in your GFF file
    gene_id = gene.attributes['ID'][0]

    # Extract other required attributes; you might need to adjust these attribute names based on your GFF file
    name = gene.attributes.get('Name', [''])[0]
    replicon = gene.seqid  # Using the seqid field as replicon
    replicon_name = id2plasmid[replicon]
    product = gene.attributes.get('product', [''])[0]
    parent = gene.attributes.get('Parent', [''])[0].replace('gene-', '')
    locus_tag = gene.attributes.get('locus_tag', [''])[0]
    old_locus_tag = gene.attributes.get('old_locus_tag', [''])[0]


    # Store in dictionary
    b31_gene_dict[parent] = {'Protein_ID' : name, 'CDS_id': gene_id, 'replicon': replicon, 'replicon_name' : replicon_name, 'product': product, 'old_locus_tag' : old_locus_tag}

# Optional: print the dictionary to check
print(b31_gene_dict)
b31_ref_pickle = 'b31_gene_dict.pkl'

with open(b31_ref_pickle, 'wb') as outpickle:
    pickle.dump(b31_gene_dict, outpickle)


{'BB_RS00005': {'Protein_ID': 'WP_002658391.1', 'CDS_id': 'cds-WP_002658391.1', 'replicon': 'NC_001318.1', 'replicon_name': 'chromosome', 'product': 'hypothetical protein', 'old_locus_tag': ''}, 'BB_RS00010': {'Protein_ID': 'WP_002658389.1', 'CDS_id': 'cds-WP_002658389.1', 'replicon': 'NC_001318.1', 'replicon_name': 'chromosome', 'product': 'glycoside hydrolase family 3 protein', 'old_locus_tag': ''}, 'BB_RS00015': {'Protein_ID': '', 'CDS_id': 'cds-BB_RS00015', 'replicon': 'NC_001318.1', 'replicon_name': 'chromosome', 'product': 'UTP--glucose-1-phosphate uridylyltransferase', 'old_locus_tag': ''}, 'BB_RS00020': {'Protein_ID': 'WP_020948671.1', 'CDS_id': 'cds-WP_020948671.1', 'replicon': 'NC_001318.1', 'replicon_name': 'chromosome', 'product': 'phosphoglucomutase', 'old_locus_tag': ''}, 'BB_RS00025': {'Protein_ID': 'WP_010889658.1', 'CDS_id': 'cds-WP_010889658.1', 'replicon': 'NC_001318.1', 'replicon_name': 'chromosome', 'product': 'tryptophan--tRNA ligase', 'old_locus_tag': ''}, 'BB_RS

In [1]:
sr_gene_presence_absence_file = "/Users/mf019/bioinformatics/longread_pangenome/longread_analysis/v4/shortread_paired_v4/gene_presence_absence.csv"
lr_gene_presence_absence_file = "/Users/mf019/bioinformatics/longread_pangenome/longread_analysis/v4/longread_paired_v4/gene_presence_absence.csv"


NameError: name 'pandas' is not defined

In [4]:
lr_gene_pa_df = pandas.read_csv(lr_gene_presence_absence_file, delimiter=',')
sr_gene_pa_df = pandas.read_csv(sr_gene_presence_absence_file, delimiter=',')

Unnamed: 0,Gene,Non-unique Gene name,Annotation,No. isolates,No. sequences,Avg sequences per isolate,Genome Fragment,Order within Fragment,Accessory Fragment,Accessory Order with Fragment,...,URI86,URI87,URI88,URI89,URI91,URI93,UWI247,UWI248,UWI263,UWI283
0,Cytosolic protein,,Cytosolic protein,49,49,1.0,63,3286,,,...,GEBMJB_00250,NCOCKI_01290,OHAJAL_00245,CIPKDF_01290,DMHAMM_01290,AGFJGC_01290,GGIAGI_01890,AEKFMC_03510,EMEHFM_02570,LGKPKB_03110
1,hMG1,,"hydroxymethylglutaryl-CoA reductase, degradative",49,49,1.0,63,748,,,...,GEBMJB_00810,NCOCKI_00730,OHAJAL_00800,CIPKDF_00730,DMHAMM_00730,AGFJGC_00730,GGIAGI_03300,AEKFMC_02115,EMEHFM_01720,LGKPKB_03750
2,group_1091,Lipoprotein,Lipoprotein,49,49,1.0,63,240,,,...,GEBMJB_03135,NCOCKI_02510,OHAJAL_03125,CIPKDF_02510,DMHAMM_02510,AGFJGC_02510,GGIAGI_02515,AEKFMC_03885,EMEHFM_00655,LGKPKB_04360
3,BBA07 family lipoprotein,,BBA07 family lipoprotein,49,49,1.0,64,47,,,...,GEBMJB_04540,NCOCKI_04550,OHAJAL_05095,CIPKDF_04550,DMHAMM_04550,AGFJGC_04550,GGIAGI_05050,AEKFMC_04910,EMEHFM_03965,LGKPKB_05635
4,Queuine tRNA-ribosyltransferase,,Queuine tRNA-ribosyltransferase,49,49,1.0,63,3270,,,...,GEBMJB_00175,NCOCKI_01365,OHAJAL_00170,CIPKDF_01365,DMHAMM_01365,AGFJGC_01365,GGIAGI_03535,AEKFMC_03435,EMEHFM_03260,LGKPKB_03885


In [17]:
# Filter rows where the 'Annotation' column contains any of the keywords
keywords = ['partition', 'pf32', 'Pfam32', 'bcsq']
lr_filtered_df = lr_gene_pa_df[lr_gene_pa_df['Annotation'].str.contains('|'.join(keywords), case=False, na=False)]
# Optional: Save the filtered DataFrame to a new CSV file
lr_filtered_df.to_csv('lr_RoaryV4_filtered_output_pf32.csv', index=False)

# Show the first few rows of the filtered DataFrame
print(lr_filtered_df.head())

# Filter rows where the 'Annotation' column contains any of the keywords
keywords = ['partition', 'pf32', 'Pfam32', 'bcsq']
sr_filtered_df = sr_gene_pa_df[sr_gene_pa_df['Annotation'].str.contains('|'.join(keywords), case=False, na=False)]
# Optional: Save the filtered DataFrame to a new CSV file
sr_filtered_df.to_csv('sr_RoaryV4_filtered_output_pf32.csv', index=False)

# Show the first few rows of the filtered DataFrame
print(sr_filtered_df.head())

                                     Gene  \
5                              group_1108   
247                            group_2693   
336                                  parA   
569                                  parB   
619  Borrelia PFam57/62 partition protein   

                     Non-unique Gene name  \
5    Borrelia PFam57/62 partition protein   
247  Borrelia PFam57/62 partition protein   
336                                   NaN   
569                                   NaN   
619                                   NaN   

                                            Annotation  No. isolates  \
5                 Borrelia PFam57/62 partition protein            49   
247               Borrelia PFam57/62 partition protein            49   
336  ParA-like ATPase involved in chromosome/plasmi...            49   
569      putative chromosome-partitioning protein ParB            49   
619               Borrelia PFam57/62 partition protein            49   

     No. sequences  Avg s