In [23]:
#!/usr/bin/env python3
import pandas
import pickle
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import re

In [36]:
def sanitize_filename(filename, replacement='_'):
    """
    Sanitize a string to make it safe for use as a filename.

    Parameters:
    - filename: str, the original filename
    - replacement: str, the character to replace unsafe characters with (default: '_')

    Returns:
    - str, the sanitized filename
    """
    # Replace unsafe characters with the replacement character
    sanitized = re.sub(r'[\/\\\?\%\*\:\|\"<>\s]', replacement, filename)
    # Truncate the filename to a reasonable length: 200 chars should suffice.
    sanitized = sanitized[:200]
    # Remove leading and trailing replacement characters
    sanitized = sanitized.strip(replacement)
    return sanitized

# Function to create SeqRecord objects
def create_seq_record(row):
    return SeqRecord(
        Seq(row['sequence']),
        id=row['locus_tag'],  # Use 'locus_tag' or another unique identifier
        name=row['gene'],
        description=f"{row['product']} {row['assembly']}",
        annotations={
            'protein_id': row['protein_id'],
            'replicon': row['replicon'],
            'replicon_name': row['replicon_name'],
            'start': row['start'],
            'end': row['end'],
            'strand': row['strand'],
            'assembly': row['assembly'],
            'translation': row['translation'],
            'inference': row['inference'],
            'transl_table': row['transl_table'],
            'db_xrefs': row['db_xrefs']
        }
    )

In [25]:
pickle_df = '/Users/mf019/bioinformatics/longread_pangenome/notebooks/metadb/clustered_proteins_db_results_v1.pkl'

with open(pickle_df, 'rb') as jar:
    df_dict = pickle.load(jar)

In [38]:
multifasta_out = 'group_multifasta'

for group in df_dict.keys():
    temp_df = pandas.DataFrame(df_dict[group])
    group_records = [create_seq_record(row) for index, row in temp_df.iterrows()]
    sanitized_group = sanitize_filename(group, '_')
    file_out = f'{multifasta_out}/{sanitized_group}.multi.fna'
    with open(file_out, 'w') as out:
        SeqIO.write(group_records, out, "fasta")


In [46]:
rows = []
for gene in df_dict.keys():
    counts = df_dict[gene]['replicon_name'].value_counts()
    new_row = counts.to_dict()
    rows.append(new_row)
group2plasmid_df = pandas.DataFrame(rows, index=df_dict.keys()).fillna(0)
print(group2plasmid_df)
group2plasmid_df.to_csv('group2plasmid_df_v1.csv')


                       lp36  lp28-3  lp21  lp38  lp28-1  chromosome  lp17  \
Lipoprotein            59.0    11.0   2.0   1.0     1.0         0.0   0.0   
Putative antigen, P35  48.0     7.0   1.0   1.0     1.0         0.0   0.0   
lptF                    0.0     0.0   0.0   0.0     0.0        49.0   0.0   
group_3281              0.0     0.0   0.0   0.0     0.0        49.0   0.0   
htpG                    0.0     0.0   0.0   0.0     0.0        49.0   0.0   
...                     ...     ...   ...   ...     ...         ...   ...   
group_2276              0.0     0.0   0.0   0.0     0.0         0.0   0.0   
group_704               0.0     1.0   0.0   0.0     0.0         0.0   0.0   
group_1306              1.0     0.0   0.0   0.0     0.0         0.0   0.0   
group_1689              0.0     0.0   0.0   0.0     0.0         0.0   0.0   
group_1218              0.0     0.0   0.0   0.0     0.0         1.0   0.0   

                       cp26  cp32-7  cp32-10  ...  lp28-9  lp56  cp9  cp9-3