In [7]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from matplotlib.colors import LinearSegmentedColormap


In [8]:
def parse_fasta_and_interleaved_phy(fasta_file, phy_file):
    """
    Pairs FASTA sequence descriptions with sequences read from an interleaved PHYLIP file.

    Descriptions come from the FASTA records and sequences from the PHYLIP file; the
    i-th description is paired with the i-th PHYLIP row, so both files must list the
    sequences in the same order.

    The PHYLIP file is read as follows: the first line holds the number of sequences
    and the sequence length; in the first block every row starts with a 10-character
    identifier field followed by sequence data; rows of later blocks hold sequence
    data only. Blank lines are skipped, and whitespace inside the sequence data
    (e.g. gaps between groups of residues) is removed.

    :param fasta_file: Path to the FASTA file.
    :param phy_file: Path to the interleaved PHYLIP file.
    :return: A list of tuples, each containing a sequence description from the FASTA file
             and its corresponding sequence from the PHYLIP file.
    :raises ValueError: If the PHYLIP header is not two integers, if the number of
             sequences differs between the two files, or if a parsed sequence does
             not have the length declared in the PHYLIP header.
    """
    # Parse the FASTA file for descriptions
    descriptions = [record.description for record in SeqIO.parse(fasta_file, "fasta")]

    with open(phy_file, 'r') as file:
        # Header line: "<number of sequences> <sequence length>"
        num_sequences, sequence_length = map(int, file.readline().split())

        # Verify the number of sequences matches
        if num_sequences != len(descriptions):
            raise ValueError("The number of sequences in the FASTA and PHYLIP files does not match.")

        # One list of chunks per sequence; joined once at the end instead of
        # growing strings with repeated concatenation.
        chunks = [[] for _ in range(num_sequences)]
        row_index = 0  # Counts non-empty rows across all blocks
        for line in file:
            line = line.strip()
            if not line:  # Ignore empty lines (block separators)
                continue
            if row_index < num_sequences:
                # First block: drop the 10-character identifier field
                line = line[10:]
            # Rows cycle through the sequences block after block; split/join
            # removes any whitespace embedded in the sequence data.
            chunks[row_index % num_sequences].append(''.join(line.split()))
            row_index += 1

    sequences = [''.join(parts) for parts in chunks]

    # The header length is the only available integrity check on the parsed rows
    for description, sequence in zip(descriptions, sequences):
        if len(sequence) != sequence_length:
            raise ValueError(
                f"Sequence for '{description}' has length {len(sequence)}, "
                f"but the PHYLIP header declares {sequence_length}."
            )

    # Combine descriptions with sequences
    return list(zip(descriptions, sequences))



In [9]:
def categorize_sample(sample_type):
    """
    Map a sample-type string onto one of four categories.

    The rules are tried in order and the first one with a marker substring
    present in ``sample_type`` wins:
    '1alt' / '1Malt' -> '1alt', '1pri' -> '1pri',
    'altT' / 'altF' -> '2alt', 'priT' / 'priF' -> '2pri'.

    :param sample_type: Sample-type string to classify.
    :return: The category name, or None when no marker is found.
    """
    # Order matters: the '1...' markers must be checked before the generic ones.
    ordered_rules = (
        ('1alt', ('1alt', '1Malt')),
        ('1pri', ('1pri',)),
        ('2alt', ('altT', 'altF')),
        ('2pri', ('priT', 'priF')),
    )
    for category, markers in ordered_rules:
        if any(marker in sample_type for marker in markers):
            return category
    return None

In [10]:
def prepare_data(fasta_file, phy_file):
    """
    Build the list of categorized sequences, ordered by genomic position.

    Each description returned by ``parse_fasta_and_interleaved_phy`` is split on
    underscores into five fields; the third field is the sample type and the
    fourth the genomic position. Entries whose sample type does not map to a
    category via ``categorize_sample`` are dropped.

    :param fasta_file: Path to the FASTA file.
    :param phy_file: Path to the interleaved PHYLIP file.
    :return: A list of (description, sequence, category, position) tuples sorted
             by position. The position is kept as the string found in the
             description; only the sort key treats its digits numerically.
    :raises ValueError: If a description does not consist of exactly five
             underscore-separated fields.
    """
    def position_key(entry):
        # re.split with a capturing group alternates text and digit runs:
        # even indexes are text, odd indexes are digits. Converting the digit
        # runs to int makes "9" sort before "10"; a plain string sort would not.
        tokens = re.split(r'(\d+)', entry[3])
        return [int(token) if index % 2 else token for index, token in enumerate(tokens)]

    sequences = parse_fasta_and_interleaved_phy(fasta_file, phy_file)

    # Combine data
    data = []
    for desc, seq in sequences:
        fields = desc.split('_')
        if len(fields) != 5:
            raise ValueError(
                f"Expected 5 underscore-separated fields in description, "
                f"got {len(fields)}: {desc!r}"
            )
        sample_type, position = fields[2], fields[3]
        category = categorize_sample(sample_type)
        if category:  # Skip sample types that belong to no category
            data.append((desc, seq, category, position))

    # Sort by genomic position (numeric-aware)
    data.sort(key=position_key)
    return data



In [11]:
# Load the combined IGH gene set: descriptions from the FASTA file, sequences
# from the matching PHYLIP file.
data = prepare_data(
    "../treeBuilding/mCanLor/combined_genes_IGH.fasta",
    "../treeBuilding/mCanLor/combined_genes_IGH.phy",
)

In [12]:
# Sequence-similarity graph: one node per sequence, and an edge between every
# pair of sequences that are identical or differ at exactly one position.
# networkx is imported in this cell because it is not among the notebook's
# top-level imports.
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.lines as mlines


def count_mismatches(seq1, seq2):
    """
    Return the number of positions at which two sequences differ.

    Positions are compared pairwise; when the lengths differ, every unpaired
    trailing position counts as one mismatch.
    """
    paired_mismatches = sum(a != b for a, b in zip(seq1, seq2))
    return paired_mismatches + abs(len(seq1) - len(seq2))


# (id, sequence, position) triples taken from `data`, whose entries are the
# (description, sequence, category, position) tuples built by prepare_data.
sequences = [(desc, seq, position) for desc, seq, _category, position in data]

# Initialize the graph
G = nx.Graph()

# Add nodes to the graph
for seq_id, seq, position in sequences:
    G.add_node(seq_id, sequence=seq, position=position)

# Compare all pairs of sequences and add edges
for i, (id1, seq1, _pos1) in enumerate(sequences):
    for id2, seq2, _pos2 in sequences[i+1:]:
        mismatches = count_mismatches(seq1, seq2)
        if mismatches == 0:
            # Identical sequences - solid blue edge
            G.add_edge(id1, id2, color='blue', style='solid')
        elif mismatches == 1:
            # One mismatch - dashed red edge
            G.add_edge(id1, id2, color='red', style='dashed')

# Fixed seed so the layout (and therefore the figure) is the same on every run
pos = nx.spring_layout(G, seed=42)
edges = G.edges(data=True)
colors = [edata['color'] for u, v, edata in edges]
styles = [edata['style'] for u, v, edata in edges]

# Draw nodes
nx.draw_networkx_nodes(G, pos)
# Draw labels
nx.draw_networkx_labels(G, pos)

# Draw edges one at a time so each keeps its own color and line style
for (u, v, edata), color, style in zip(edges, colors, styles):
    nx.draw_networkx_edges(G, pos, edgelist=[(u, v)], width=2, edge_color=color, style=style)

# Create a legend
blue_line = mlines.Line2D([], [], color='blue', marker='_', linestyle='-', linewidth=2, label='Identical')
red_line = mlines.Line2D([], [], color='red', marker='_', linestyle='--', linewidth=2, label='1 Mismatch')
plt.legend(handles=[blue_line, red_line])

plt.show()


ModuleNotFoundError: No module named 'networkx'

In [13]:
!pip install networkx


/bin/bash: pip: command not found
