# In [None]:
import sys
import os
import numpy as np
from transformers import pipeline

# Function to read a regular FASTA file
def read_fasta(filepath, output_arr=False):
    """Read a FASTA file and return its record names and sequences.

    A line starting with ``>`` opens a new record; the text after ``>``
    (whitespace-stripped) becomes the record name. Every following line up
    to the next header is stripped and concatenated into that record's
    sequence. Records whose sequence is empty are skipped, so ``names`` and
    ``seqs`` always have the same length.

    Args:
        filepath: Path of the FASTA file to read.
        output_arr: If true, each sequence is converted to a NumPy array of
            single characters and the collection is returned as a NumPy
            array: 2-D when all sequences share one length, otherwise a
            1-D object array holding one character array per record.

    Returns:
        A ``(names, seqs)`` tuple. ``names`` is a list of header strings;
        ``seqs`` is a list of strings, or a NumPy array when ``output_arr``
        is true.

    Raises:
        ValueError: If sequence text appears before the first header line.
    """
    names = []
    seqs = []

    def store(record_name, pieces):
        # Join once per record instead of growing a string line by line.
        sequence = ''.join(pieces)
        if not sequence:
            return
        names.append(record_name)
        seqs.append(np.array(list(sequence)) if output_arr else sequence)

    name = None
    chunks = []
    with open(filepath, 'r') as fin:
        for line in fin:
            if line.startswith('>'):
                store(name, chunks)
                name = line[1:].strip()
                chunks = []
                continue
            text = line.strip()
            if not text:
                continue
            if name is None:
                # Without a header there is no name to pair this data with.
                raise ValueError(
                    'sequence data found before the first FASTA header in '
                    + repr(filepath)
                )
            chunks.append(text)
        store(name, chunks)

    if output_arr:
        if len({len(s) for s in seqs}) > 1:
            # Sequences of different lengths cannot form a rectangular
            # array; recent NumPy raises on np.array(seqs) here, so build
            # an object array with one character array per record.
            ragged = np.empty(len(seqs), dtype=object)
            for index, chars in enumerate(seqs):
                ragged[index] = chars
            seqs = ragged
        else:
            seqs = np.array(seqs)
    return names, seqs

def insert_newlines(seq, every=60):
    """Break ``seq`` into newline-separated pieces of at most ``every`` items.

    Args:
        seq: String to wrap.
        every: Maximum length of each piece.

    Returns:
        The pieces of ``seq`` joined with ``'\\n'``; an empty string when
        ``seq`` is empty.
    """
    pieces = []
    for start in range(0, len(seq), every):
        stop = start + every
        pieces.append(seq[start:stop])
    return '\n'.join(pieces)

def output_fasta(names, seqs, output_file):
    """Write paired names and sequences to ``output_file``, one per line.

    Each name is written verbatim (no ``>`` is added) followed by a newline,
    then its sequence followed by a newline. Pairing stops at the shorter of
    ``names`` and ``seqs``. An existing file is overwritten.

    Args:
        names: Iterable of header strings.
        seqs: Iterable of sequence strings.
        output_file: Path of the file to write.
    """
    def record_lines():
        # Yield lazily so lines are written in the order they are produced.
        for header, body in zip(names, seqs):
            yield header + '\n'
            yield body + '\n'

    with open(output_file, 'w') as handle:
        handle.writelines(record_lines())

def main(input_file=r"F:\ayush_work\BIO\archive\anatoxin_sequences.fasta",
         preprocessed_file='preprocessed.txt',
         output_file='new_sequences.txt'):
    """Preprocess a FASTA file and generate new text with ProtGPT2.

    Steps:
      1. Read the sequences from ``input_file`` and wrap each one at the
         default width of ``insert_newlines``.
      2. Write them to ``preprocessed_file``, each preceded by a
         ``<|endoftext|>`` line in place of its FASTA header.
      3. Load the ``nferruz/ProtGPT2`` text-generation pipeline and prompt
         it once per line of ``preprocessed_file``.
      4. Write each generated text to ``output_file``, followed by a
         newline.

    Args:
        input_file: Path of the FASTA file to read. The default is the
            location the script was originally written against.
        preprocessed_file: Path where the reformatted sequences are written.
        output_file: Path where the generated texts are written.
    """
    # Preprocess input data; the original record names are discarded.
    _, input_seqs = read_fasta(input_file)
    input_seqs = [insert_newlines(seq) for seq in input_seqs]
    input_names = ["<|endoftext|>" for _ in input_seqs]
    output_fasta(input_names, input_seqs, preprocessed_file)

    # Use ProtGPT2 model to generate new sequences
    protgpt2 = pipeline('text-generation', model="nferruz/ProtGPT2")

    with open(preprocessed_file, 'r') as file:
        sequences = file.readlines()

    # NOTE(review): every line of the preprocessed file is used as its own
    # prompt, including the "<|endoftext|>" separator lines and each wrapped
    # fragment of a sequence. Confirm this is intended rather than one
    # prompt per record.
    new_sequences = [
        protgpt2(
            seq.strip(),
            max_length=100,
            do_sample=True,
            top_k=950,
            repetition_penalty=1.2,
            num_return_sequences=1,
        )[0]['generated_text']
        for seq in sequences
    ]

    with open(output_file, 'w') as file:
        for seq in new_sequences:
            file.write(seq + '\n')

if __name__ == "__main__":
    main()


12
