In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict

def identify_discordant_blocks(data, threshold=4):
    # Read the CSV data
    df = pd.read_csv(data)

    print(f"Total rows: {len(df)}")
    print(f"Unique assemblies: {df['assembly'].nunique()}")
    print(f"Unique genes: {df['gene'].nunique()}")

    # Identify replicon columns
    replicon_columns = df.columns[6:-2].tolist()

    # Function to get the replicons for each gene (allowing multiple)
    def get_gene_replicons(row):
        return [col for col in replicon_columns if row[col] > 0]

    # Apply the function to each row
    df['gene_replicons'] = df.apply(get_gene_replicons, axis=1)

    # Group by gene and assembly, and get the replicons for each
    gene_assembly_replicons = df.groupby(['gene', 'assembly'])['gene_replicons'].first()

    print(f"Shape of gene_assembly_replicons: {gene_assembly_replicons.shape}")

    # Identify discordant genes
    discordant_genes = []
    for gene in df['gene'].unique():
        try:
            replicon_sets = gene_assembly_replicons[gene].apply(set)
            if len(set.union(*replicon_sets)) > 1:
                discordant_genes.append(gene)
        except KeyError as e:
            print(f"KeyError for gene {gene}: {str(e)}")

    print(f"Number of discordant genes: {len(discordant_genes)}")

    # Filter the dataframe for discordant genes
    discordant_df = df[df['gene'].isin(discordant_genes)]

    # Group discordant genes into blocks
    blocks = defaultdict(lambda: defaultdict(set))
    current_block = []
    prev_pos = None
    prev_assembly = None

    for _, row in discordant_df.sort_values(['assembly', 'pos']).iterrows():
        if prev_assembly != row['assembly'] or (prev_pos is not None and row['pos'] - prev_pos > 1):
            if current_block:
                block_key = tuple(current_block)
                for gene in current_block:
                    try:
                        blocks[block_key][row['assembly']].update(gene_assembly_replicons[gene, row['assembly']])
                    except KeyError as e:
                        print(f"KeyError in block creation: {str(e)}")
                        print(f"Gene: {gene}, Assembly: {row['assembly']}")
            current_block = []
        current_block.append(row['gene'])
        blocks[tuple(current_block)][row['assembly']].update(row['gene_replicons'])
        prev_pos = row['pos']
        prev_assembly = row['assembly']

    # Prepare output
    results = []
    for block, assembly_replicons in blocks.items():
        all_replicons = set.union(*assembly_replicons.values())
        main_replicon = max(all_replicons, key=lambda r: sum(r in replicons for replicons in assembly_replicons.values()))

        discordant_assemblies = [
            assembly for assembly, replicons in assembly_replicons.items()
            if main_replicon not in replicons
        ]

        if len(discordant_assemblies) <= threshold:
            results.append({
                'block': block,
                'main_replicon': main_replicon,
                'all_assemblies': list(assembly_replicons.keys()),
                'discordant_assemblies': discordant_assemblies,
                'all_replicons': list(all_replicons),
                'discordant_replicons': list(all_replicons - {main_replicon})
            })

    print(f"Number of discordant blocks found: {len(results)}")
    return results

In [10]:
infile = 'GeneTree_Ordered_50.csv'
gtree = pd.read_csv(infile)


In [14]:
def chaos_block_finder(df, best_replicon, target_blocks=None):
    chaos = df[(df["best_replicon"]==best_replicon)&(df["replicon_name"]!=best_replicon)].copy()

    if target_blocks is not None:
        if isinstance(target_blocks, list):
            chaos = chaos[chaos["replicon_name"].isin(target_blocks)]
        else:
            raise ValueError('Target blocks must be formatted as a list, e.g. ["plasmid1", "plasmid2"]')
        for t in target_blocks:
            chaos_t = chaos[chaos["replicon_name"]==t].copy()
            print(f"********************{t}********************")
            print(f"Mosaic Assemblies: {chaos_t['assembly'].unique()}")
            print(f"Mosaic Genes: {chaos_t['gene'].unique()}\n")

    print(f"**********Overall Mosaicism in {best_replicon}**********")
    print(f"Mosaic Assemblies: {chaos['assembly'].unique()}")
    print(f"Mosaic Replicons: {chaos['replicon_name'].unique()}")
    print(f"Mosaic Genes: {chaos['gene'].unique()}")
    return chaos


In [30]:

lp28_4_df = chaos_block_finder(gtree, "lp28-4", ['lp36','lp38','lp28-2','cp32-10','lp28-1'])


********************lp36********************
Mosaic Assemblies: ['UCT109H' 'UCT31H' 'UCT96H' 'URI101H' 'URI102H' 'URI107H' 'URI111H'
 'URI120H' 'URI39H' 'URI40H' 'URI41H' 'URI42H' 'URI87H' 'URI89H' 'URI91H'
 'URI93H']
Mosaic Genes: ['P12 family lipoprotein']

********************lp38********************
Mosaic Assemblies: ['UCT29H']
Mosaic Genes: ['P12 family lipoprotein']

********************lp28-2********************
Mosaic Assemblies: ['URI102H' 'UCT109H']
Mosaic Genes: ['Borrelia ORF-A' 'group_1798' 'group_1250' 'Transposase'
 'Blasticidin-S acetyltransferase' 'BBC01' 'group_1795' 'group_1794'
 'group_1793' 'group_862'
 'Borrelia burgdorferi virulent strain associated lipoprotein'
 'group_1796' 'group_1249' 'group_3525' 'Surface antigen'
 'chromosome replication/partitioning protein']

********************cp32-10********************
Mosaic Assemblies: ['URI42H']
Mosaic Genes: ['Borrelia ORF-A' 'group_1798' 'group_1250' 'Transposase'
 'Blasticidin-S acetyltransferase' 'BBC01' 'grou

In [31]:

cp32_1_df = chaos_block_finder(gtree, "cp32-1")

**********Overall Mosaicism in cp32-1**********
Mosaic Assemblies: []
Mosaic Replicons: []
Mosaic Genes: []


In [None]:

# Usage
discordant_blocks = identify_discordant_blocks(infile)

# Print results
for i, block in enumerate(discordant_blocks, 1):
    print(f"Discordant Block {i}:")
    print(f"Genes: {', '.join(block['block'])}")
    print(f"Main Replicon: {block['main_replicon']}")
    print(f"All Assemblies: {', '.join(block['all_assemblies'])}")
    print(f"Discordant Assemblies: {', '.join(block['discordant_assemblies'])}")
    print(f"All Replicons: {', '.join(block['all_replicons'])}")
    print(f"Discordant Replicons: {', '.join(block['discordant_replicons'])}")
    print(f"Number of Assemblies: {len(block['all_assemblies'])}")
    print(f"Number of Discordant Assemblies: {len(block['discordant_assemblies'])}")
    print()

In [4]:
for block in discordant_blocks:
    print(block)

{'block': ('group_431',), 'main_replicon': 'cp32-3', 'all_assemblies': ['ESI26H', 'UCT109H', 'UCT29H', 'UCT31H', 'UCT50H', 'UCT96H', 'UNY172P', 'UNY203P', 'URI101H', 'URI107H', 'URI111H', 'URI120H', 'URI36H', 'URI39H', 'URI40H', 'URI41H', 'URI42H', 'URI86H', 'URI87H', 'URI89H', 'URI91H', 'URI93H', 'UWI283P'], 'discordant_assemblies': [], 'all_replicons': ['cp32-3', 'cp32-10', 'cp32-9', 'cp32-4', 'cp32-1', 'Unclassified', 'cp32-11'], 'discordant_replicons': ['cp32-10', 'cp32-9', 'cp32-4', 'cp32-1', 'Unclassified', 'cp32-11']}
{'block': ('Borrelia ORF-A',), 'main_replicon': 'lp28-4', 'all_assemblies': ['ESI26H', 'UCT109H', 'UCT110H', 'UCT29H', 'UCT30H', 'UCT31H', 'UCT32H', 'UCT35H', 'UCT50H', 'UCT92H', 'UCT96H', 'UNY149P', 'UNY169P', 'UNY172P', 'UNY208P', 'URI101H', 'URI102H', 'URI103H', 'URI107H', 'URI111H', 'URI112H', 'URI117H', 'URI118H', 'URI120H', 'URI33H', 'URI34H', 'URI39H', 'URI40H', 'URI41H', 'URI42H', 'URI44H', 'URI46H', 'URI47H', 'URI48H', 'URI86H', 'URI87H', 'URI88H', 'URI89H

In [34]:
def diagnose_csv(file_path):
    df = pd.read_csv(file_path)

    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print("\nColumn names:")
    print(df.columns.tolist())

    print("\nFirst 5 rows:")
    print(df.head())

    print("\nData types:")
    print(df.dtypes)

    print("\nUnique values in key columns:")
    for col in ['assembly', 'gene', 'replicon_name', 'best_replicon']:
        if col in df.columns:
            print(f"{col}: {df[col].nunique()}")

    replicon_columns = df.columns[6:-2].tolist()  # Assuming the last two columns are 'lipo' and 'surface_lipo'
    non_zero_counts = (df[replicon_columns] > 0).sum()
    print("\nNon-zero counts in replicon columns:")
    print(non_zero_counts)

    multiple_non_zero = (df[replicon_columns] > 0).sum(axis=1) > 1
    print(f"\nRows with multiple non-zero replicon values: {multiple_non_zero.sum()}")

    if multiple_non_zero.sum() > 0:
        print("\nSample row with multiple non-zero replicon values:")
        print(df[multiple_non_zero].iloc[0])

    gene_replicon_counts = df.groupby(['gene', 'best_replicon']).size().unstack(fill_value=0)
    discordant_genes = gene_replicon_counts[gene_replicon_counts.gt(0).sum(axis=1) > 1]
    print(f"\nNumber of discordant genes: {len(discordant_genes)}")

    if len(discordant_genes) > 0:
        print("\nSample discordant gene:")
        sample_gene = discordant_genes.index[0]
        print(df[df['gene'] == sample_gene])


In [47]:
def analyze_replicon_discrepancies(file_path, threshold=4):
    # Read the CSV file
    df = pd.read_csv(file_path)

    print("Data shape:", df.shape)
    print("Columns:", df.columns.tolist())

    # Identify discrepancies
    df['is_discrepant'] = df['replicon_name'] != df['best_replicon']

    # Group by gene and summarize discrepancies and non-discrepancies
    gene_summary = df.groupby('gene').agg({
        'is_discrepant': ['sum', 'size'],
        'assembly': lambda x: [list(x[df.loc[x.index, 'is_discrepant']]),
                               list(x[~df.loc[x.index, 'is_discrepant']])],
        'replicon_name': lambda x: [list(x[df.loc[x.index, 'is_discrepant']]),
                                    list(set(x[~df.loc[x.index, 'is_discrepant']]))],
        'best_replicon': 'first'
    })

    # Flatten the column names
    gene_summary.columns = ['discrepant_count', 'total_assemblies', 'assemblies', 'replicons', 'best_replicon']
    gene_summary['non_discrepant_count'] = gene_summary['total_assemblies'] - gene_summary['discrepant_count']

    # Split the lists
    gene_summary['discrepant_assemblies'] = gene_summary['assemblies'].str[0]
    gene_summary['non_discrepant_assemblies'] = gene_summary['assemblies'].str[1]
    gene_summary['discrepant_replicons'] = gene_summary['replicons'].str[0]
    gene_summary['non_discrepant_replicons'] = gene_summary['replicons'].str[1]
    gene_summary.drop(columns=['assemblies', 'replicons'], inplace=True)

    # Filter based on threshold
    results = gene_summary[
        (gene_summary['discrepant_count'] > 0) &
        (gene_summary['discrepant_count'] <= threshold)
    ].reset_index()

    return results

discrepancies = analyze_replicon_discrepancies(infile)

print("\nDetailed Replicon Location Analysis:")
if not discrepancies.empty:
    for _, disc in discrepancies.iterrows():
        print(f"\nGene: {disc['gene']}")
        print(f"Best Replicon: {disc['best_replicon']}")
        print(f"Total assemblies: {disc['total_assemblies']}")
        print(f"Number of discrepant assemblies: {disc['discrepant_count']}")
        print(f"Discrepant assemblies: {', '.join(disc['discrepant_assemblies'])}")
        print(f"Replicons in discrepant assemblies: {', '.join(disc['discrepant_replicons'])}")
        print(f"Number of non-discrepant assemblies: {disc['non_discrepant_count']}")
        print(f"Non-discrepant assemblies: {', '.join(disc['non_discrepant_assemblies'])}")
        print(f"Replicons in non-discrepant assemblies: {', '.join(disc['non_discrepant_replicons'])}")
else:
    print("No discrepancies found matching the current criteria.")

# Summary statistics
print("\nSummary Statistics:")
total_genes = len(gene_summary)
genes_with_discrepancies = len(discrepancies)
print(f"Total number of genes analyzed: {total_genes}")
print(f"Number of genes with discrepancies: {genes_with_discrepancies}")
print(f"Percentage of genes with discrepancies: {(genes_with_discrepancies / total_genes) * 100:.2f}%")

# Check for potential singletons
singletons = discrepancies[discrepancies['total_assemblies'] == 1]
if not singletons.empty:
    print(f"\nWarning: {len(singletons)} gene(s) appear to be singletons and are marked as discrepant:")
    for _, singleton in singletons.iterrows():
        print(f"Gene: {singleton['gene']}, Assembly: {singleton['discrepant_assemblies'][0]}")


Data shape: (49718, 46)
Columns: ['assembly', 'gene', 'replicon_name', 'color', 'best_replicon', 'pos', 'Unclassified', 'chromosome', 'cp26', 'cp32-1', 'cp32-10', 'cp32-11', 'cp32-12', 'cp32-13', 'cp32-2', 'cp32-3', 'cp32-4', 'cp32-5', 'cp32-6', 'cp32-7', 'cp32-8', 'cp32-9', 'cp32-9-4', 'cp9', 'cp9-3', 'lp17', 'lp21', 'lp21-cp9', 'lp25', 'lp28-1', 'lp28-11', 'lp28-2', 'lp28-3', 'lp28-4', 'lp28-5', 'lp28-6', 'lp28-7', 'lp28-8', 'lp28-9', 'lp36', 'lp38', 'lp5', 'lp54', 'lp56', 'lipo', 'surface_lipo']

Detailed Replicon Location Analysis:

Gene: 2-9-5 36K minus strand ORF
Best Replicon: cp32-3
Total assemblies: 25
Number of discrepant assemblies: 4
Discrepant assemblies: URI33H, URI48H, URI111H, URI41H
Replicons in discrepant assemblies: cp32-8, cp32-8, cp32-10, cp32-10
Number of non-discrepant assemblies: 21
Non-discrepant assemblies: UWI283P, URI93H, URI36H, URI56H, UNY172P, UCT29H, UCT109H, UCT31H, UCT96H, URI101H, URI102H, URI107H, URI39H, URI40H, URI42H, URI87H, URI89H, URI91H, UWI24

NameError: name 'gene_summary' is not defined

In [48]:
import pandas as pd
import numpy as np

def analyze_bulk_replicon_discrepancies(file_path, threshold=4):
    # Read the CSV file
    df = pd.read_csv(file_path)

    print("Data shape:", df.shape)
    print("Columns:", df.columns.tolist())

    # Identify discrepancies
    df['is_discrepant'] = df['replicon_name'] != df['best_replicon']

    # Group by replicon_name and summarize discrepancies
    replicon_summary = df.groupby('replicon_name').agg({
        'is_discrepant': 'sum',
        'assembly': lambda x: list(x[df.loc[x.index, 'is_discrepant']].unique()),
        'gene': lambda x: list(x[df.loc[x.index, 'is_discrepant']].unique()),
        'best_replicon': lambda x: x[~df.loc[x.index, 'is_discrepant']].unique().tolist()
    }).reset_index()

    # Rename columns for clarity
    replicon_summary.columns = ['replicon_name', 'discrepancy_count', 'discrepant_assemblies', 'discrepant_genes', 'expected_replicons']

    # Calculate total assemblies and genes for each replicon
    replicon_summary['total_assemblies'] = df.groupby('replicon_name')['assembly'].nunique().values
    replicon_summary['total_genes'] = df.groupby('replicon_name')['gene'].nunique().values

    # Calculate non-discrepant counts
    replicon_summary['non_discrepant_count'] = replicon_summary['total_assemblies'] - replicon_summary['discrepancy_count']

    # Filter based on threshold
    replicon_summary = replicon_summary[
        (replicon_summary['discrepancy_count'] > 0) &
        (replicon_summary['discrepancy_count'] <= threshold)
    ]

    return replicon_summary

discrepancies = analyze_bulk_replicon_discrepancies(infile)

print("\nBulk Replicon Discrepancy Analysis:")
if not discrepancies.empty:
    for _, disc in discrepancies.iterrows():
        print(f"\nReplicon: {disc['replicon_name']}")
        print(f"Expected Replicon(s): {', '.join(disc['expected_replicons'])}")
        print(f"Total assemblies: {disc['total_assemblies']}")
        print(f"Number of discrepant assemblies: {disc['discrepancy_count']}")
        print(f"Discrepant assemblies: {', '.join(disc['discrepant_assemblies'])}")
        print(f"Total genes on this replicon: {disc['total_genes']}")
        print(f"Number of discrepant genes: {len(disc['discrepant_genes'])}")
        print(f"Discrepant genes: {', '.join(disc['discrepant_genes'])}")
else:
    print("No discrepancies found matching the current criteria.")

# Summary statistics
print("\nSummary Statistics:")
total_replicons = len(pd.unique(df['replicon_name']))
replicons_with_discrepancies = len(discrepancies)
print(f"Total number of replicons analyzed: {total_replicons}")
print(f"Number of replicons with discrepancies: {replicons_with_discrepancies}")
print(f"Percentage of replicons with discrepancies: {(replicons_with_discrepancies / total_replicons) * 100:.2f}%")

Data shape: (49718, 46)
Columns: ['assembly', 'gene', 'replicon_name', 'color', 'best_replicon', 'pos', 'Unclassified', 'chromosome', 'cp26', 'cp32-1', 'cp32-10', 'cp32-11', 'cp32-12', 'cp32-13', 'cp32-2', 'cp32-3', 'cp32-4', 'cp32-5', 'cp32-6', 'cp32-7', 'cp32-8', 'cp32-9', 'cp32-9-4', 'cp9', 'cp9-3', 'lp17', 'lp21', 'lp21-cp9', 'lp25', 'lp28-1', 'lp28-11', 'lp28-2', 'lp28-3', 'lp28-4', 'lp28-5', 'lp28-6', 'lp28-7', 'lp28-8', 'lp28-9', 'lp36', 'lp38', 'lp5', 'lp54', 'lp56', 'lipo', 'surface_lipo']

Bulk Replicon Discrepancy Analysis:

Replicon: chromosome
Expected Replicon(s): chromosome
Total assemblies: 49
Number of discrepant assemblies: 2
Discrepant assemblies: URI39H, URI36H
Total genes on this replicon: 814
Number of discrepant genes: 2
Discrepant genes: group_1977, group_223

Replicon: lp28-8
Expected Replicon(s): 
Total assemblies: 1
Number of discrepant assemblies: 4
Discrepant assemblies: URI33H
Total genes on this replicon: 4
Number of discrepant genes: 4
Discrepant genes: 