In [7]:
import pandas as pd
import numpy as np

In [2]:
# Paths of the three v5 homology TSV inputs (detailed, simple and summary tables).
detailed_homology, simple_homology, summary_homology = (
    'ava_homology_detailed_v5.tsv',
    'ava_homology_simple_v5.tsv',
    'ava_homology_summary_v5.tsv',
)

In [27]:
def create_homology_matrix(detailed_rows):
    """Build a symmetric contig-by-contig homology matrix.

    Args:
    detailed_rows (list): Row sequences. Element 0 is treated as a header and
        ignored. In each following row, positions 1 and 4 hold the two contig
        ids, positions 2 and 5 their lengths, and position 7 the total
        homology value recorded for the first contig.

    Returns:
    tuple: (np_matrix, df_matrix, all_contigs) - the int64 numpy matrix, a
        DataFrame of the same values labelled by contig id on both axes, and
        the sorted list of contig ids that defines the row/column order.
    """
    records = detailed_rows[1:]  # element 0 is the header

    # Collect every contig id with its length; when an id shows up in
    # several rows the length seen last is the one that is kept.
    length_of = {}
    for rec in records:
        first_id, first_len, second_id, second_len = rec[1], int(rec[2]), rec[4], int(rec[5])
        length_of[first_id] = first_len
        length_of[second_id] = second_len

    # Sorted ids fix the matrix ordering; map each id to its row/column slot.
    all_contigs = sorted(length_of)
    position = {name: pos for pos, name in enumerate(all_contigs)}
    size = len(all_contigs)

    # Start from zeros and put each contig's length on the diagonal.
    np_matrix = np.zeros((size, size), dtype=np.int64)
    for name, pos in position.items():
        np_matrix[pos, pos] = length_of[name]

    # Write each row's homology value into both mirrored cells. Later rows
    # overwrite earlier ones for the same pair, and a row that pairs a contig
    # with itself replaces that contig's diagonal length.
    for rec in records:
        first_id, second_id, homology = rec[1], rec[4], int(rec[7])
        a, b = position[first_id], position[second_id]
        np_matrix[a, b] = np_matrix[b, a] = homology

    # Labelled view of the same values.
    df_matrix = pd.DataFrame(np_matrix, index=all_contigs, columns=all_contigs)

    return np_matrix, df_matrix, all_contigs

def check_diagonal_cells_np(np_matrix, contig_labels):
    """
    Report which contigs have a zero on the diagonal of the numpy matrix.

    Args:
    np_matrix (numpy.ndarray): Homology matrix, indexed as np_matrix[i, i].
    contig_labels (list): Contig labels; position i names row/column i.

    Returns:
    list: Labels whose diagonal cell equals zero, in label order (empty if none).
    """
    return [
        label
        for pos, label in enumerate(contig_labels)
        if np_matrix[pos, pos] == 0
    ]

def check_diagonal_cells_df(df_matrix):
    """
    Report which contigs have a zero on the diagonal of the labelled DataFrame.

    Args:
    df_matrix (pd.DataFrame): Homology matrix whose index labels are also
        used as column labels for the diagonal lookup.

    Returns:
    list: Index labels whose diagonal cell equals zero, in index order (empty if none).
    """
    return [
        label
        for label in df_matrix.index
        if df_matrix.loc[label, label] == 0
    ]

def write_list_to_file(filename, data_list):
    """Write each item of data_list to filename, one str(item) per line.

    The file is opened in 'w' mode, so any existing content is replaced.
    """
    with open(filename, 'w') as out:
        out.writelines(str(entry) + '\n' for entry in data_list)

In [21]:
# Load the three tab-separated homology tables. pandas is imported as `pd` in
# this notebook, so the reader must be reached through that alias; the bare
# name `pandas` is never bound and raises NameError on a fresh kernel.
# By default read_csv takes the first line of each file as the column header.
detailed_df = pd.read_csv(detailed_homology, sep='\t')
simple_df = pd.read_csv(simple_homology, sep='\t')
summary_df = pd.read_csv(summary_homology, sep='\t')

In [5]:
# Flatten the detailed table into plain Python row lists for
# create_homology_matrix(). That function treats element 0 as a header row and
# skips it, while the DataFrame keeps its header in `.columns` rather than in
# the data. The column names are therefore placed first so that the skip drops
# the header and not the first real data row.
num_rows = len(detailed_df)
print(num_rows)
drows = [detailed_df.columns.tolist()]
# .values.tolist() produces the same per-row lists as iterating with
# iterrows(), without building one Series per row.
drows.extend(detailed_df.values.tolist())
print(len(drows) - 1)  # data rows only; element 0 is the header row


4329153
4329153


In [8]:
# Build the homology matrix from the detailed rows: the numpy array, the
# labelled DataFrame, and the sorted contig labels giving the row/column order.
np_matrix, df_matrix, all_contigs = create_homology_matrix(drows)

In [26]:
# Usage example:
zero_contigs_np = check_diagonal_cells_np(np_matrix, all_contigs)
zero_contigs_df = check_diagonal_cells_df(df_matrix)

if zero_contigs_np:
    print(f"The following np contigs have zero length (diagonal value): {', '.join(zero_contigs_np)}")
else:
    print("All np contigs have non-zero lengths (diagonal values).")

if zero_contigs_df:
    print(f"The following df contigs have zero length (diagonal value): {', '.join(zero_contigs_df)}")
else:
    print("All df contigs have non-zero lengths (diagonal values).")

All np contigs have non-zero lengths (diagonal values).
All df contigs have non-zero lengths (diagonal values).


In [29]:
# Show both result lists; empty lists mean neither check found a zero diagonal cell.
print(zero_contigs_np, zero_contigs_df)

[] []


In [30]:
# Persist the results: the raw numpy matrix, the labelled DataFrame, and the
# contig labels (one per line) that give the matrix row/column order.
# NOTE(review): ndarray.tofile with a text separator appears to write the
# values as one flat tab-separated run with no row breaks, so the 2-D shape
# would have to be recovered from the number of labels - confirm this layout
# is what downstream readers of the .tsv expect.
np_matrix.tofile('ava_homology_np_matrix_v5.1.tsv', sep='\t')
df_matrix.to_csv('ava_homology_pd_matrix_labeled_v5.1.tsv', sep='\t')
write_list_to_file('ava_homology_np_matrix_labels_v5.1.txt', all_contigs)