In [None]:
import pandas as pd
# Load the two single-column ID tables. Passing `names` means the files are
# read without a header row, so the first line of each CSV is treated as data.
# NOTE(review): the homomer path has no 'data/' component, unlike the
# heteromer path on the line above it -- confirm that this is intended.
hete_df = pd.read_csv('../data/ID_2mer_hete.csv', names=['id'])
homo_df = pd.read_csv('../ID_2mer_homo.csv', names=['id'])

# Plain Python lists of the IDs, consumed by the config builders below.
hete_list = hete_df['id'].tolist()
homo_list = homo_df['id'].tolist()

In [None]:
import os
import random
def _read_dimer_fasta(fasta_path):
  """Read the first two records of a FASTA file laid out one field per line.

  The expected layout is four lines: header 1, sequence 1, header 2,
  sequence 2. The first character of each header line (normally '>') is
  dropped. Lines after the fourth are ignored; missing lines yield ''.

  Args:
    fasta_path: Path of the FASTA file to read.

  Returns:
    Tuple (seq1_id, seq1, seq2_id, seq2) of stripped strings.
  """
  fields = ['', '', '', '']
  with open(fasta_path) as f:
    for position, line in enumerate(f):
      if position == 4:
        break
      # Even positions are header lines, odd positions are sequence lines.
      fields[position] = line[1:].strip() if position % 2 == 0 else line.strip()
  seq1_id, seq1, seq2_id, seq2 = fields
  return seq1_id, seq1, seq2_id, seq2


def dimer_fastas_to_dimer_config(fasta_dir, out_file_path, hete_list, homo_list,
                                 max_dimers=30, max_hete=15, max_homo=15,
                                 max_total_length=200, seed=None):
  """Pick a random subset of dimer FASTA files and write a deletion-scan config.

  The files in `fasta_dir` are visited in random order. A dimer is kept when
  the combined length of its two sequences is at most `max_total_length` and
  the quota for its class (heteromer / homomer) is not yet full. For every
  kept dimer the tab-separated output file receives one baseline row
  (which_seq=0, delete_index=-1) followed by one row per residue position of
  sequence 1 and then of sequence 2. Rows are numbered consecutively from 1
  in the 'ArrayTaskID' column. Summary counts are printed at the end.

  Args:
    fasta_dir: Directory containing the dimer FASTA files.
    out_file_path: Path of the config file to (over)write.
    hete_list: IDs of heteromers; a dimer is a heteromer when the first four
      characters of its first header ID appear in this collection.
    homo_list: Accepted for interface compatibility but not consulted: every
      dimer that is not in `hete_list` is counted as a homomer.
    max_dimers: Stop once this many dimers have been selected.
    max_hete: Maximum number of heteromers to select.
    max_homo: Maximum number of homomers to select.
    max_total_length: Largest allowed len(seq1) + len(seq2).
    seed: Optional seed for a private random generator. When None the
      module-level `random` state is used, as before.

  Returns:
    List of paths of the FASTA files that were selected, in selection order.
  """
  headers = ['ArrayTaskID', 'seq1_id', 'seq2_id', 'seq1', 'seq2', 'which_seq', 'delete_index']

  # Set membership is O(1); the list would be rescanned for every file.
  hete_ids = set(hete_list)

  # os.listdir returns entries in arbitrary order, so sort first to make the
  # shuffle depend only on the random state.
  directory_list = sorted(os.listdir(fasta_dir))
  if seed is None:
    random.shuffle(directory_list)
  else:
    random.Random(seed).shuffle(directory_list)

  # Paths of the dimers that made it into the config file
  dimer_path_list = []
  n_hete = 0
  n_homo = 0
  next_task_id = 1  # ArrayTaskID of the next row to be written

  # The context manager closes the file even if a FASTA file fails to parse.
  with open(out_file_path, 'w') as out_file:
    out_file.write('\t'.join(headers) + '\n')

    for fasta_filename in directory_list:

      if len(dimer_path_list) >= max_dimers:
        break

      fasta_file = os.path.join(fasta_dir, fasta_filename)

      # Skip sub-directories such as .ipynb_checkpoints instead of crashing.
      if not os.path.isfile(fasta_file):
        continue

      seq1_id, seq1, seq2_id, seq2 = _read_dimer_fasta(fasta_file)

      # A file without two non-empty sequences is not a usable dimer.
      if not seq1 or not seq2:
        continue

      # Only keep the dimers that are short enough in total
      if len(seq1) + len(seq2) > max_total_length:
        continue

      # Enforce the per-class quota
      if seq1_id[:4] in hete_ids:
        if n_hete >= max_hete:
          continue
        n_hete += 1
      else:
        if n_homo >= max_homo:
          continue
        n_homo += 1

      dimer_path_list.append(fasta_file)

      # Baseline run first, then one deletion per position of each chain.
      runs = [(0, -1)]
      runs += [(1, delete_index) for delete_index in range(len(seq1))]
      runs += [(2, delete_index) for delete_index in range(len(seq2))]

      for which_seq, delete_index in runs:
        run_info = [str(next_task_id), seq1_id, seq2_id, seq1, seq2, str(which_seq), str(delete_index)]
        out_file.write('\t'.join(run_info) + '\n')
        next_task_id += 1

  print(f"Number of dimers: {len(dimer_path_list)}")
  print(f"Number of heteromers: {n_hete}")
  print(f"Number of homomers: {n_homo}")
  # next_task_id is one past the last row that was written.
  print(f"Number of permutations: {next_task_id - 1}")

  return dimer_path_list

In [None]:
def dimer_fastas_to_monomer_config(dimer_path_list, out_file_path):
  """Write a deletion-scan config for the unique monomers of the given dimers.

  Lines 2 and 4 of every FASTA file in `dimer_path_list` are taken as the two
  chain sequences. Duplicates are removed while keeping first-seen order, so
  the ArrayTaskID assigned to each row is the same on every run. For each
  unique sequence the tab-separated output file receives one baseline row
  (delete_index=-1) followed by one row per residue position. Summary counts
  are printed at the end.

  Args:
    dimer_path_list: Paths of dimer FASTA files (header, sequence, header,
      sequence; one per line).
    out_file_path: Path of the config file to (over)write.

  Returns:
    None.
  """
  headers = ['ArrayTaskID', 'seq', 'delete_index']

  # Dict keys act as an insertion-ordered set; iterating a real set of strings
  # would give a different order from one interpreter run to the next.
  unique_seqs = {}

  for dimer_fasta in dimer_path_list:
    with open(dimer_fasta) as f:
      for line_number, line in enumerate(f, start=1):
        if line_number > 4:
          break
        if line_number in (2, 4):
          seq = line.strip()
          # A blank sequence line carries nothing to scan.
          if seq:
            unique_seqs[seq] = None

  n_rows = 0

  # The context manager guarantees the file is flushed and closed.
  with open(out_file_path, 'w') as out_file:
    out_file.write('\t'.join(headers) + '\n')

    for seq in unique_seqs:
      # -1 is the baseline run; 0..len(seq)-1 are the single deletions.
      for delete_index in range(-1, len(seq)):
        n_rows += 1
        run_info = [str(n_rows), seq, str(delete_index)]
        out_file.write('\t'.join(run_info) + '\n')

  print(f'Number of unique monomers: {len(unique_seqs)}')
  print(f'Number of permutations: {n_rows}')

  return

In [None]:
# Input directory of dimer FASTA files and the two config files to generate.
# (The *_out_file_dir names hold file paths, not directories.)
dimer_fasta_dir = '../data/dimer_seqres'
dimer_out_file_dir = '../config_files/multimer_config.txt'
monomer_out_file_dir = '../config_files/monomer_config.txt'

# Build the dimer config first: it returns the FASTA paths it selected, and
# the monomer config is derived from exactly those files.
dimer_path_list = dimer_fastas_to_dimer_config(
  fasta_dir=dimer_fasta_dir,
  out_file_path=dimer_out_file_dir,
  hete_list=hete_list,
  homo_list=homo_list,
)

dimer_fastas_to_monomer_config(
  dimer_path_list=dimer_path_list,
  out_file_path=monomer_out_file_dir,
)