In [2]:
import pygenomeviz
import pandas
from collections import defaultdict
import numpy

In [3]:
plasmid_hits = '../../plasmid_id/output/best_matches_v9_1kb.tsv'
nucl_alns = '../big-ava/nucl_v5.1/all_aln_coords.tsv'

In [4]:
import re

def extract_order_from_newick(newick_string):
    # Remove any newlines and extra spaces
    newick_string = newick_string.strip().replace('\n', '')

    # Remove branch lengths and any bootstrap values
    newick_string = re.sub(r':[0-9.]+', '', newick_string)
    newick_string = re.sub(r'\)[0-9.]+', ')', newick_string)

    # Extract taxa names, ignoring the trailing semicolon
    taxa = re.findall(r'([^():,;]+)', newick_string)

    # Only remove leading/trailing whitespace and quotes
    taxa = [taxon.strip().strip('\'"') for taxon in taxa]

    # Remove any empty strings
    taxa = [taxon for taxon in taxa if taxon]

    # Create a list of tuples with (index, taxon)
    ordered_taxa = [(i+1, taxon) for i, taxon in enumerate(taxa)]

    return ordered_taxa

newick_tree = "(URI39H:0.000000028,URI89H:0.000034914,((URI91H:0.000113474,URI120H:0.000069830)0.746:0.000008723,(URI41H:0.000017455,(UCT31H:0.000104792,(((UCT96H:0.009957911,(ESI26H:0.010986128,(((URI33H:0.000122150,URI48H:0.000052523)1.000:0.019438073,((UNY172P:0.001391002,UWI283P:0.001184137)1.000:0.018878056,(UCT50H:0.000026207,UNY169P:0.000026209)1.000:0.017485409)1.000:0.007458478)0.999:0.003959685,((URI36H:0.000139620,URI56H:0.000105496)1.000:0.019761557,(((UNY193P:0.000061637,URI34H:0.000026339)1.000:0.010978769,(UCT35H:0.000088604,UNY203P:0.000086727)1.000:0.009098609)1.000:0.010560518,(((UWI247P:0.000362638,UWI248P:0.000309601)1.000:0.001640700,(UCT113H:0.000763915,(UNY208P:0.000139258,URI88H:0.000026500)1.000:0.001592699)1.000:0.001600894)1.000:0.005599140,(UWI263P:0.006130838,((((URI103H:0.000008731,URI86H:0.000008723)0.936:0.000026163,(URI118H:0.000017448,(URI44H:0.000008723,URI46H:0.000000028)0.998:0.000078492)0.128:0.000000028)1.000:0.000000029,(URI112H:0.000026166,(URI117H:0.000017445,UCT30H:0.000008723)1.000:0.000000028)1.000:0.000000028)0.962:0.000043394,(UCT110H:0.000000028,(UNY149P:0.000008732,(UCT92H:0.000148310,(UCT32H:0.000000028,URI47H:0.000008729)0.942:0.000000028)0.998:0.000000028)1.000:0.000000028)1.000:0.000322906)1.000:0.004759419)1.000:0.003996133)1.000:0.010473404)1.000:0.005786725)0.986:0.004200007)1.000:0.009934399)1.000:0.003578198)1.000:0.009609763,(((URI93H:0.000008730,(URI102H:0.000000028,URI107H:0.000000028)0.906:0.000000028)0.999:0.000078568,(UCT109H:0.000034957,URI87H:0.000026190)1.000:0.000000028)0.992:0.000078562,(UCT29H:0.000026214,URI101H:0.000000028)0.741:0.000008730)0.998:0.000131554)0.107:0.000034297,(URI111H:0.000000028,(URI40H:0.000078581,URI42H:0.000017456)0.954:0.000000028)0.932:0.000017455)0.882:0.000000028)1.000:0.000000028)1.000:0.000000028)0.986:0.000000028);"
order = extract_order_from_newick(newick_tree)

norder = [item[1] for item in order]
forder = []
#for order, asm in order:
#    if asm in asms_plot:
#        print(order, asm)
#        forder.append((order,asm))

In [5]:
def fix_bad_name(x):
    if isinstance(x, str):
        if '|' in x:
            parts = x.split('|')
            fix = parts[-1] if len(parts) == 3 else x
            return fix
        return x

pids = pandas.read_csv(plasmid_hits, sep='\t')
pids['best_hit'] = pids['best_hit'].apply(fix_bad_name)

In [6]:
nucl_v6 = pandas.read_csv(nucl_alns, sep='\t')


In [7]:
lr_pids = pids[pids['method'] == 'longread']
for idx, row in lr_pids.iterrows():
    lr_pids.loc[idx, 'aln_id'] = f"{row['name']}_{row['contig']}"
all_hits = lr_pids['best_hit'].unique()
above_len = lr_pids[lr_pids['contig_len'] > 1000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lr_pids.loc[idx, 'aln_id'] = f"{row['name']}_{row['contig']}"


In [8]:
plasmid_df = above_len[['aln_id','best_hit']]
#plasmid_df.to_csv('best_matches_v9_1kb_mapper.csv', index=False)

In [40]:
asms_for_lp28_4 = ["UCT109H","URI102H", "URI42H", "UCT35H", "UNY172P", "URI46H"]
# include cp32-10 from UCT31H,
# include lp28-2 from URI102H, UCT109H.

In [9]:
def pull_best_hit_and_ident(asm, rep, plasmid_df, list):
    best_hits = plasmid_df[(plasmid_df['aln_id'].str.contains(asm, na=False))&
                            (plasmid_df['best_hit'].str.contains(rep, na=False, regex=True))]
    if best_hits.empty:
        print(f"Assembly: {asm} ain't got {rep}")
    else:
        print(f"Assembly: {asm}")
        aln_id = best_hits['aln_id'].iloc[0] if len(best_hits['aln_id']) > 0 else None
        if aln_id:
            print(best_hits)
            print("---")
            list.append(aln_id)
        else:
            print(f"No valid aln_id for {asm}")
    return list

In [20]:
list = []
best_hits = plasmid_df[plasmid_df['best_hit'].str.contains('lp28-2', na=False, regex=True)]

for idx, row in best_hits.iterrows():
    list.append(row[0])



  list.append(row[0])


In [21]:
print(list)

['UCT31H_contig000016', 'URI87H_contig000014', 'UWI283P_MC149_lp28-2', 'URI107H_contig000013', 'URI111H_contig000015', 'UCT96H_contig000011', 'UNY172P_B356_lp28-2', 'URI89H_contig000014', 'URI102H_contig000002', 'URI40H_contig000013', 'URI39H_contig000015', 'UNY169P_B348_lp28-2', 'URI101H_contig000015', 'URI42H_contig000013', 'UCT109H_contig000002', 'URI41H_contig000013', 'URI91H_contig000017', 'UCT29H_contig000013', 'URI34H_contig000010', 'URI120H_contig000014', 'UNY193P_B477_lp28-x', 'URI93H_contig000014']


In [10]:
def find_contigs_for_plot(asm_dict, plasmid_df):
    assembly_list = []
    for rep, data in asm_dict.items():
        if isinstance(data, list):
            for asm in data:
                assembly_list.extend(pull_best_hit_and_ident(asm, rep, plasmid_df, assembly_list))
        elif isinstance(data, str):
            assembly_list.extend(pull_best_hit_and_ident(data, rep, plasmid_df, assembly_list))
        else:
            print("ERROR: You gotta specify either a single string for an assembly, OR a list of strings. Try again!")
    return assembly_list


In [None]:
asms1 = {
    'lp28-2': ['UCT109H','URI102H'],
    'cp32-10':'URI42H',
    'lp28-4': ['URI40H', "UNY172P", 'UNY208P', 'UWI283P']
}
asms2 = {
    'lp28-2': ['UCT109H','URI102H'],
    'cp32-10':'URI42H',
    'lp28-4': ['URI40H', "UNY172P", 'UNY208P', 'UWI283P']
}

In [44]:
asm_list1 = find_contigs_for_plot(asms1, plasmid_df)
asm_list2 = find_contigs_for_plot(asms2, plasmid_df)

Assembly: UCT109H
                     aln_id best_hit
13316  UCT109H_contig000002   lp28-2
---
Assembly: URI102H
                    aln_id best_hit
9956  URI102H_contig000002   lp28-2
---
Assembly: URI42H
                    aln_id best_hit
12949  URI42H_contig000002  cp32-10
---
Assembly: URI40H
                    aln_id best_hit
11116  URI40H_contig000016   lp28-4
---
Assembly: UNY172P
                   aln_id best_hit
8771  UNY172P_B356_lp28-4   lp28-4
---
Assembly: UNY208P
                    aln_id best_hit
9273  UNY208P_MR641_lp28-4   lp28-4
---
Assembly: UWI283P
                    aln_id best_hit
3377  UWI283P_MC149_lp28-4   lp28-4
---


In [None]:
#this explains clearly what is happening in lp28-4 for the OspC type A isolates;
#
#the other two examples I'd be interested in looking at are for ospC type A isolates with lp17
# -- two isolates seem to encode lp17 on lp28-1,
# and lp28-1 among OspC type K isolates, where two seem divergent, one of which seems to encode lp28-1 genes on lp28-4 and one ?lp38

# Type_K = ['UCT110H', 'UCT30H', 'UCT32H', 'UCT92', 'UNY149P', 'URI103H', 'URI112H', 'URI117H', 'URI118H', 'URI44H', 'URI46H', 'URI47H', 'URI86H']
# -- look at lp28-1, look at the ones that have lp28-4: UCT30H, lp28-9: URI88H

# Type_A = [ 'UCT109H', 'UCT29H', 'UCT31H', 'UNY1032P', 'URI101H', 'URI102H', 'URI107H', 'URI111H', 'URI120H', 'URI39H', 'URI40H', 'URI41H', 'URI42H', 'URI87H', 'URI89H', 'URI91H', 'URI93H']
# -- with lp17: ^ , with lp28-3 that encode for lp17: ['URI93H', 'URI41H']


In [12]:
def get_contigs_for_plasmid(list, plasmid, plasmid_df):
    list_of_files = []
    for asm in list:
        best_hits = plasmid_df[(plasmid_df['aln_id'].str.contains(asm, na=False)) &
                               (plasmid_df['best_hit'].str.contains(plasmid, na=False, regex=True))]
        if best_hits.empty:
            print(f"Assembly: {asm} ain't got it!")
        else:
            print(f"Assembly: {asm}")
            aln_id = best_hits['aln_id'].iloc[0] if len(best_hits['aln_id']) > 0 else None
            if aln_id:
                print(best_hits)
                print("---")
                list_of_files.append(aln_id)
            else:
                print(f"No valid aln_id for {asm}")
    return list_of_files

In [None]:
asms3 = {
    'lp17': [
        'UCT109H_contig000017',
        'UCT29H_contig000020',
        'UCT31H_contig000020',
        'URI101H_contig000021',
        'URI102H_contig000016',
        'URI107H_contig000019',
        'URI111H_contig000019',
        'URI120H_contig000021',
        'URI39H_contig000020',
        'URI40H_contig000018',
        'URI42H_contig000016',
        'URI87H_contig000019',
        'URI89H_contig000020',
        'URI91H_contig000021',
        'URI93H_contig000018'
        ],
    'lp28-3': [
        'URI93H_contig000003',
        'URI41H_contig000004'],
}
asms4 = {
    'lp28-1': [
        'UCT110H_contig000012',
        'UCT32H_contig000015',
        'UCT92H_contig000017',
        'UNY149P_B247_lp28-1',
        'URI103H_contig000018',
        'URI112H_contig000010',
        'URI117H_contig000017',
        'URI118H_contig000016',
        'URI44H_contig000012',
        'URI46H_contig000016',
        'URI47H_contig000014',
        'URI86H_contig000015'
        ],
    'lp28-5': 'UCT30H_contig000005',
    'lp28-7': 'URI88H_contig000003',
}


In [41]:
plot3 = [
    'URI39H_contig000020',
    'URI89H_contig000020',
    'URI91H_contig000021',
    'URI120H_contig000021',
    'URI41H_contig000004',
    'UCT31H_contig000020',
    'URI93H_contig000018',
    'URI93H_contig000003',
    'URI102H_contig000016',
    'URI107H_contig000019',
    'UCT109H_contig000017',
    'URI87H_contig000019',
    'UCT29H_contig000020',
    'URI101H_contig000021',
    'URI111H_contig000019',
    'URI40H_contig000018',
    'URI42H_contig000016',
]
plot4 = [
    'URI88H_contig000003,
    'URI103H_contig000018',
    'URI86H_contig000015',
    'URI118H_contig000016',
    'URI44H_contig000012',
    'URI46H_contig000016',
    'URI112H_contig000010',
    'URI117H_contig000017',
    'UCT30H_contig000005',
    'UCT110H_contig000012',
    'UNY149P_B247_lp28-1',
    'UCT92H_contig000017',
    'UCT32H_contig000015',
    'URI47H_contig000014',
]

In [24]:
odict = defaultdict()
for idx, asm in order:
    odict[asm] = idx
    #if asm in ctg.split("_")[0] == asm:
    #        print(ctg)
for asm in list:
    print(odict[asm.split('_')[0]], asm)



6 UCT31H_contig000016
44 URI87H_contig000014
12 UWI283P_MC149_lp28-2
42 URI107H_contig000013
47 URI111H_contig000015
7 UCT96H_contig000011
11 UNY172P_B356_lp28-2
2 URI89H_contig000014
41 URI102H_contig000002
48 URI40H_contig000013
1 URI39H_contig000015
14 UNY169P_B348_lp28-2
46 URI101H_contig000015
49 URI42H_contig000013
43 UCT109H_contig000002
5 URI41H_contig000013
3 URI91H_contig000017
45 UCT29H_contig000013
18 URI34H_contig000010
4 URI120H_contig000014
17 UNY193P_B477_lp28-x
40 URI93H_contig000014


In [25]:
type_A_lp28s

['URI93H_contig000003', 'URI41H_contig000004']

In [49]:
get_contigs_for_plasmid(['UCT30H'],'lp28-5', plasmid_df)

Assembly: UCT30H
                   aln_id best_hit
2125  UCT30H_contig000005   lp28-5
---


['UCT30H_contig000005']

In [17]:
type_K = ['UCT110H', 'UCT30H', 'UCT32H', 'UCT92', 'UNY149P', 'URI103H', 'URI112H', 'URI117H', 'URI118H', 'URI44H', 'URI46H', 'URI47H', 'URI86H']
type_A = [ 'UCT109H', 'UCT29H', 'UCT31H', 'UNY1032P', 'URI101H', 'URI102H', 'URI107H', 'URI111H', 'URI120H', 'URI39H', 'URI40H', 'URI41H', 'URI42H', 'URI87H', 'URI89H', 'URI91H', 'URI93H']
type_A_lp17 = get_contigs_for_plasmid(type_A, 'lp17', plasmid_df)
type_A_lp28s = get_contigs_for_plasmid(['URI93H', 'URI41H'], 'lp28-3', plasmid_df)
type_K_lp28s = get_contigs_for_plasmid(type_K, 'lp28-1', plasmid_df)
#type_K_lp28s = get_contigs_for_plasmid(, 'lp28-1', plasmid_df)



Assembly: UCT109H
                     aln_id best_hit
13331  UCT109H_contig000017     lp17
---
Assembly: UCT29H
                    aln_id best_hit
15037  UCT29H_contig000020     lp17
15039  UCT29H_contig000022     lp17
---
Assembly: UCT31H
                  aln_id best_hit
911  UCT31H_contig000020     lp17
---
Assembly: UNY1032P ain't got it!
Assembly: URI101H
                     aln_id best_hit
12618  URI101H_contig000021     lp17
12620  URI101H_contig000023     lp17
---
Assembly: URI102H
                    aln_id best_hit
9970  URI102H_contig000016     lp17
---
Assembly: URI107H
                    aln_id best_hit
3725  URI107H_contig000019     lp17
---
Assembly: URI111H
                    aln_id best_hit
6397  URI111H_contig000019     lp17
---
Assembly: URI120H
                     aln_id best_hit
15801  URI120H_contig000021     lp17
---
Assembly: URI39H
                    aln_id best_hit
11559  URI39H_contig000020     lp17
11564  URI39H_contig000025     lp17
---
Assembly: URI

In [28]:
list_o_plots = []
missing = []
asms_plot = ['UCT109H', 'URI102H', "UNY172P", 'URI42H', 'URI40H', 'UNY208P', 'UWI283P']
for asm in ['UCT109H', 'URI102H']:
    best_hits = plasmid_df[(plasmid_df['aln_id'].str.contains(asm, na=False)) &
                           (plasmid_df['best_hit'].str.contains('lp28-2', na=False, regex=True))]
    if best_hits.empty:
        print(f"Assembly: {asm} ain't got it!")
        missing.append(asm)
    else:
        print(f"Assembly: {asm}")
        aln_id = best_hits['aln_id'].iloc[0] if len(best_hits['aln_id']) > 0 else None
        if aln_id:
            print(best_hits)
            print("---")
            list_o_plots.append(aln_id)
        else:
            print(f"No valid aln_id for {asm}")

for asm in ['URI42H']:
    best_hits = plasmid_df[(plasmid_df['aln_id'].str.contains(asm, na=False)) &
                           (plasmid_df['best_hit'].str.contains('cp32-10', na=False, regex=True))]
    if best_hits.empty:
        print(f"Assembly: {asm} ain't got it!")
        missing.append(asm)
    else:
        print(f"Assembly: {asm}")
        aln_id = best_hits['aln_id'].iloc[0] if len(best_hits['aln_id']) > 0 else None
        if aln_id:
            print(best_hits)
            print("---")
            list_o_plots.append(aln_id)
        else:
            print(f"No valid aln_id for {asm}")

for asm in ['URI40H', "UNY172P", 'UNY208P', 'UWI283P']:
    best_hits = plasmid_df[(plasmid_df['aln_id'].str.contains(asm, na=False)) &
                           (plasmid_df['best_hit'].str.contains('lp28-4', na=False, regex=True))]
    if best_hits.empty:
        print(f"Assembly: {asm} ain't got it!")
        missing.append(asm)
    else:
        print(f"Assembly: {asm}")
        aln_id = best_hits['aln_id'].iloc[0] if len(best_hits['aln_id']) > 0 else None
        if aln_id:
            print(best_hits)
            print("---")
            list_o_plots.append(aln_id)
        else:
            print(f"No valid aln_id for {asm}")


Assembly: UCT109H
                     aln_id best_hit
13316  UCT109H_contig000002   lp28-2
---
Assembly: URI102H
                    aln_id best_hit
9956  URI102H_contig000002   lp28-2
---
Assembly: URI42H
                    aln_id best_hit
12949  URI42H_contig000002  cp32-10
---
Assembly: URI40H
                    aln_id best_hit
11116  URI40H_contig000016   lp28-4
---
Assembly: UNY172P
                   aln_id best_hit
8771  UNY172P_B356_lp28-4   lp28-4
---
Assembly: UNY208P
                    aln_id best_hit
9273  UNY208P_MR641_lp28-4   lp28-4
---
Assembly: UWI283P
                    aln_id best_hit
3377  UWI283P_MC149_lp28-4   lp28-4
---


In [29]:
list_o_plots

['UCT109H_contig000002',
 'URI102H_contig000002',
 'URI42H_contig000002',
 'URI40H_contig000016',
 'UNY172P_B356_lp28-4',
 'UNY208P_MR641_lp28-4',
 'UWI283P_MC149_lp28-4']

In [194]:
rows = []
for idx, contig in enumerate(list_o_plots):
    for contig2 in list_o_plots:
        if contig != contig2:
            print(contig, contig2)
            df1 = nucl_v6[(nucl_v6['QUERY_ID'] == contig) &
                                (nucl_v6['REF_ID'] == contig2)]
            df2 = nucl_v6[(nucl_v6['QUERY_ID'] == contig2) &
                    (nucl_v6['REF_ID'] == contig)]
            if not df1.empty:
                rows.append(df1)
            if not df2.empty:
                rows.append(df2)

final_df = pandas.concat(rows, ignore_index=True)


URI39H_contig000017 URI89H_contig000018
URI39H_contig000017 URI91H_contig000020
URI39H_contig000017 URI120H_contig000016
URI39H_contig000017 URI41H_contig000014
URI39H_contig000017 UCT31H_contig000018
URI39H_contig000017 UCT96H_contig000015
URI39H_contig000017 ESI26H_contig000009
URI39H_contig000017 URI33H_contig000013
URI39H_contig000017 URI48H_contig000014
URI39H_contig000017 UNY172P_B356_cp26
URI39H_contig000017 UWI283P_MC149_cp26
URI39H_contig000017 UCT50H_contig000008
URI39H_contig000017 UNY169P_B348_cp26
URI39H_contig000017 URI36H_contig000013
URI39H_contig000017 URI56H_contig000003
URI39H_contig000017 UNY193P_B477_cp26
URI39H_contig000017 URI34H_contig000009
URI39H_contig000017 UCT35H_contig000009
URI39H_contig000017 UNY203P_MR616_cp26
URI39H_contig000017 UWI247P_MC104_cp26
URI39H_contig000017 UWI248P_MC105_cp26
URI39H_contig000017 UCT113H_contig000004
URI39H_contig000017 UNY208P_MR641_cp26
URI39H_contig000017 URI88H_contig000016
URI39H_contig000017 UWI263P_MC123_cp26
URI39H_con

KeyboardInterrupt: 

[(1, 'URI39H'), (2, 'URI89H'), (3, 'URI91H'), (4, 'URI120H'), (5, 'URI41H'), (6, 'UCT31H'), (7, 'UCT96H'), (8, 'ESI26H'), (9, 'URI33H'), (10, 'URI48H'), (11, 'UNY172P'), (12, 'UWI283P'), (13, 'UCT50H'), (14, 'UNY169P'), (15, 'URI36H'), (16, 'URI56H'), (17, 'UNY193P'), (18, 'URI34H'), (19, 'UCT35H'), (20, 'UNY203P'), (21, 'UWI247P'), (22, 'UWI248P'), (23, 'UCT113H'), (24, 'UNY208P'), (25, 'URI88H'), (26, 'UWI263P'), (27, 'URI103H'), (28, 'URI86H'), (29, 'URI118H'), (30, 'URI44H'), (31, 'URI46H'), (32, 'URI112H'), (33, 'URI117H'), (34, 'UCT30H'), (35, 'UCT110H'), (36, 'UNY149P'), (37, 'UCT92H'), (38, 'UCT32H'), (39, 'URI47H'), (40, 'URI93H'), (41, 'URI102H'), (42, 'URI107H'), (43, 'UCT109H'), (44, 'URI87H'), (45, 'UCT29H'), (46, 'URI101H'), (47, 'URI111H'), (48, 'URI40H'), (49, 'URI42H')]
