In [1]:
import re
import os
import numpy as np
import pandas as pd

In [10]:
def generate_alignment_matrix(simple_rows):
    contigs = set()
    contigs = sorted(list(contigs))
    matrix = np.zeros((len(contigs), len(contigs)), dtype=int)
    #logging.info('[bold yellow]Beginning matrix generation loop[/bold yellow]')
    for row in simple_rows[1:]:
        source_contig = row['contig_id']  # contig_id is now in the second column
        source_index = contigs.index(source_contig)
        for alignment in row['list_of_alignments(asm2:homology_length)']:
            target_contig, coverage = alignment.split(':')
            target_contig = target_contig.strip('(')
            coverage = float(coverage.strip('() '))
            target_index = contigs.index(target_contig)
            matrix[source_index, target_index] = coverage
    #logging.info('[bold green]Matrix has been generated[/bold green]')
    return matrix, contigs

def get_rows_and_df(file):
    rows = []
    df = pd.read_csv(file, sep='\t')
    for idx, row in df.iterrows():
        if 'assembly_id' in row:
            if row['assembly_id'] == '[]':
                pass
        else:
            rows.append(row)
    return rows, df

def process_alignment(alignment):
    match = re.match(r"\['?\(?([\w-]+)\s*:\s*\(?(\d+)\)?'\]?", alignment)
    if match:
        return match.group(1), int(match.group(2))
    return None, None

def aggregate_alignments(df):
    grouped = {}
    all_contigs = set()
    
    for _, row in df.iterrows():
        source_contig = row['contig_id']
        all_contigs.add(source_contig)
        target_contig, homology_len = process_alignment(row['list_of_alignments(asm2:homology_length)'])
        
        if source_contig not in grouped:
            grouped[source_contig] = {
                'source_len': row['contig_len'],
                'total_contig_homology': row['total_contig_homology'],
                'alignments': {}
            }
        
        if target_contig:
            all_contigs.add(target_contig)
            grouped[source_contig]['alignments'][target_contig] = homology_len
            
            # Add target contig to grouped if it's not there
            if target_contig not in grouped:
                grouped[target_contig] = {
                    'source_len': None,  # We don't know the length yet
                    'total_contig_homology': None,
                    'alignments': {}
                }

    # Ensure all contigs are in the grouped dictionary
    for contig in all_contigs:
        if contig not in grouped:
            grouped[contig] = {
                'source_len': None,
                'total_contig_homology': None,
                'alignments': {}
            }

    return grouped

def create_summary_dataframe(grouped_data):
    summary_data = []
    
    for source_contig, data in grouped_data.items():
        summary_data.append({
            'source_contig': source_contig,
            'source_len': data['source_len'],
            'total_contig_homology': data['total_contig_homology'],
            'num_alignments': len(data['alignments']),
            'target_contigs': ', '.join(data['alignments'].keys()),
            'homology_lengths': ', '.join(map(str, data['alignments'].values()))
        })
    
    return pd.DataFrame(summary_data)

def create_homology_matrix(grouped_data):
    """takes in grouped data, returns a numpy matrix, pandas matrix, and contig labels for the np_matrix"""
    all_contigs = list(grouped_data.keys())
    n = len(all_contigs)
    
    # Create a mapping of contig names to matrix indices
    contig_to_index = {contig: i for i, contig in enumerate(all_contigs)}
    
    # Initialize the matrix with zeros
    np_matrix = np.zeros((n, n), dtype=np.int64)
    
    for source_contig, data in grouped_data.items():
        source_index = contig_to_index[source_contig]
        
        # Set the contig length on the diagonal if known
        source_len = data.get('source_len')
        if source_len is not None:
            np_matrix[source_index, source_index] = source_len
        
        alignments = data.get('alignments', {})
        for target_contig, homology_len in alignments.items():
            target_index = contig_to_index[target_contig]
            np_matrix[source_index, target_index] = homology_len
            np_matrix[target_index, source_index] = homology_len  # Symmetric
    df_matrix = pd.DataFrame(np_matrix, index=all_contigs, columns=all_contigs)
    return np_matrix, df_matrix, all_contigs

In [3]:
broken_table = 'v2/ava_homology_simple_v2.tsv'

old_rows, row_df = get_rows_and_df(broken_table)

#Fix the broken table
new_rows = []
for idx, row in row_df.iterrows():
    if row['assembly_id'] == '[]':
        pass
    else:
        new_rows.append(row)

In [4]:
new_rows[0:2]

[assembly_id                                               UNY169P_B348_lp28-2
 contig_id                                                 UNY169P_B348_lp28-2
 contig_len                                                              29777
 total_contig_homology                                                    1617
 list_of_alignments(asm2:homology_length)    ['(URI56H_contig000022 : (1617)']
 Name: 1, dtype: object,
 assembly_id                                             URI120H_contig000076
 contig_id                                               URI120H_contig000076
 contig_len                                                               227
 total_contig_homology                                                    227
 list_of_alignments(asm2:homology_length)    ['(URI88H_contig000006 : (227)']
 Name: 3, dtype: object]

In [13]:
fixed_table = 'v2/ava_homo_simple_v2_fixed.tsv'
new_df = pd.DataFrame(new_rows)
new_df = new_df.drop('assembly_id', axis=1).reset_index(drop=True)
new_df.to_csv(fixed_table, sep='\t', index=False)

In [14]:
#fixed_table = 'ava_homology_simple_v2_fixed.tsv'
#new_rows, new_df = get_rows_and_df(fixed_table)
#print(new_df.head())

In [15]:
grouped_data = aggregate_alignments(new_df)
print('Alignments have been cleaned and grouped!')
summary_df = create_summary_dataframe(grouped_data)
print('Summary dataframe has been constructed!')
np_homology_matrix, pd_homology_matrix, contig_labels = create_homology_matrix(grouped_data)
print('Matrix and labels have been generated!')

Alignments have been cleaned and grouped!
Summary dataframe has been constructed!
Matrix and labels have been generated!


In [16]:
for source_contig, data in grouped_data.items():
    print(data['source_len'])
    break

29777


In [17]:
print("Number of contigs:", len(grouped_data))
print("\nSummary DataFrame:")
print(summary_df.head())
print("\nnp_homology Matrix:")
print(np_homology_matrix)
print("\npd_homology Matrix:")
print(pd_homology_matrix.head())

Number of contigs: 2931

Summary DataFrame:
          source_contig  source_len  total_contig_homology  num_alignments  \
0   UNY169P_B348_lp28-2     29777.0                 1617.0             434   
1   URI56H_contig000022         NaN                    NaN              66   
2  URI120H_contig000076       227.0                  227.0             216   
3   URI88H_contig000006         NaN                    NaN              24   
4   URI44H_contig000005     30259.0                  501.0             472   

                                      target_contigs  \
0  URI56H_contig000022, URI47H_contig000015, URI9...   
1  UCT96H_contig000004, UNY208P_MR641_lp28-4, URI...   
2  URI88H_contig000006, URI36H_contig000007, URI4...   
3  URI89H_contig000054, URI40H_contig000073, URI5...   
4  URI46H_contig000040, UCT110H_contig000024, UCT...   

                                    homology_lengths  
0  1617, 14746, 3307, 470, 118, 178, 1198, 768, 5...  
1  760, 762, 1619, 2679, 2677, 1619, 760

In [143]:
output_dir = os.path.join(os.getcwd(), 'asm_ava_v1')
os.makedirs(output_dir, exist_ok=True)

In [144]:
#If you want to save the results to CSV files
summary_df.to_csv(os.path.join(output_dir, 'asm_ava_summary_contig_data_v1.csv'), index=False)
np.savetxt(os.path.join(output_dir, 'asm_ava_homology_matrix_v1.csv'), homology_matrix, delimiter=',', fmt='%d')
pd.DataFrame({'contig': contig_labels}).to_csv(os.path.join(output_dir, 'asm_avacontig_labels_v1.csv'), index=False)

In [None]:
homology_matrix