In [24]:
import pandas
import pickle

In [25]:
# Ok let's crack open this jar of pickled plasmids.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a parsing table that was produced locally / by a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('parsing_tables/blast_parsing_dict.pkl', 'rb') as parsing_table_file:
    parsing_table = pickle.load(parsing_table_file)
# ok now let's set up the columns for our matrix and drop the synthetic vector I added a while back...
# Shallow copy so later edits to plasmids_in_db do not touch parsing_table.
plasmids_in_db = dict(parsing_table)
# Names left out of the matrix columns (presumably the synthetic vector mentioned
# above plus one other entry - TODO confirm which is which). Using a set
# difference means a table that lacks one of these names no longer raises.
excluded_names = {'pBSV2', 'lp21-cp9'}
# Unique 'name' values across all entries, minus the exclusions, sorted.
matrix_cols = sorted({entry['name'] for entry in plasmids_in_db.values()} - excluded_names)
print(matrix_cols)

['chromosome', 'cp26', 'cp32-1', 'cp32-1+5', 'cp32-10', 'cp32-11', 'cp32-12', 'cp32-2', 'cp32-3', 'cp32-3+10', 'cp32-4', 'cp32-5', 'cp32-5+1', 'cp32-5-1', 'cp32-6', 'cp32-7', 'cp32-8', 'cp32-9', 'cp32-9-4', 'cp9', 'cp9-3', 'lp17', 'lp21', 'lp25', 'lp28-1', 'lp28-11', 'lp28-2', 'lp28-3', 'lp28-4', 'lp28-5', 'lp28-6', 'lp28-7', 'lp28-8', 'lp28-9', 'lp36', 'lp38', 'lp5', 'lp54', 'lp56']


In [26]:
def make_method_dicts(subset_df):
    """Collect the best-hit plasmid names per sample, split by sequencing method.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Must contain the columns 'name', 'method' and 'best_hit'. Only rows
        whose 'method' is 'shortread' or 'longread' are used.

    Returns
    -------
    tuple of (dict, dict)
        ``(sr_plasmids, lr_plasmids)``. Each dict maps a 'name' value to the
        set of 'best_hit' values found for that name under that method.
        Missing best hits (NaN) are kept as members of the sets.

    Both dicts and their sizes are printed as a side effect.
    """
    def _best_hits_by_name(method):
        # Filter the frame that was passed in (not a notebook-level global),
        # so every call reflects its own argument.
        method_rows = subset_df[subset_df['method'] == method]
        return {
            name: set(group['best_hit'])
            for name, group in method_rows.groupby('name')
        }

    sr_plasmids = _best_hits_by_name('shortread')
    lr_plasmids = _best_hits_by_name('longread')

    print("lr_plasmids:", lr_plasmids)
    print(len(lr_plasmids.keys()))

    print("sr_plasmids:", sr_plasmids)
    print(len(sr_plasmids.keys()))
    return sr_plasmids, lr_plasmids

def generate_matrix(sr_plasmids, lr_plasmids, version, cutoff):
    """Print and save a sample-by-plasmid count matrix for both methods.

    Parameters
    ----------
    sr_plasmids, lr_plasmids : dict
        Map a sample name to an iterable (set or list) of best-hit plasmid
        names for short-read and long-read assemblies respectively. The
        dicts are not modified.
    version : str
        Label inserted into the output file name.
    cutoff : str
        Label inserted into the output file name (e.g. '1000bp').

    Side effects
    ------------
    Prints the per-sample hit lists and the final matrix, and writes the
    matrix to ``plasmid_presence_matrix_{version}_{cutoff}.csv`` in the
    current working directory. Returns None.
    """
    # De-duplicate into fresh dicts so the caller's sets are left untouched
    # and re-running the cell gives the same result.
    sr_hits = {sample: list(set(hits)) for sample, hits in sr_plasmids.items()}
    lr_hits = {sample: list(set(hits)) for sample, hits in lr_plasmids.items()}

    for lr_name, hits in lr_hits.items():
        print(f'longread: {lr_name} : len:{len(hits)} : {hits}')
        # The short-read name is taken to be the long-read name minus its
        # last character (e.g. 'ESI26H' -> 'ESI26').
        sr_name = lr_name[:-1]
        if sr_name in sr_hits:
            print(f'shortread: {sr_name} : len: {len(sr_hits[sr_name])}{sr_hits[sr_name]}')
        else:
            # A sample can lose all short-read contigs at a stricter length
            # cutoff; report it instead of raising KeyError.
            print(f'shortread: {sr_name} : no short-read entry for this sample')

    # One (sample, plasmid) row per hit; missing best hits (NaN) carry no
    # plasmid name and are left out of the matrix.
    plasmid_presence_matrix = [
        (sample, hit)
        for hits_by_sample in (sr_hits, lr_hits)
        for sample, hits in hits_by_sample.items()
        for hit in hits
        if not pandas.isna(hit)
    ]

    plasmid_df = pandas.DataFrame(plasmid_presence_matrix, columns=['key', 'val'])
    # crosstab counts occurrences of each (key, val) pair and fills absent
    # combinations with 0; rows and columns come out sorted.
    plasmid_count_matrix = pandas.crosstab(plasmid_df['key'], plasmid_df['val'])
    plasmid_count_matrix['Sum'] = plasmid_count_matrix.sum(axis=1)
    print(plasmid_count_matrix)
    # make filename
    outfile = f'plasmid_presence_matrix_{version}_{cutoff}.csv'
    plasmid_count_matrix.to_csv(outfile)

In [34]:
new_csv = 'best_matches_v6_1kb.tsv'
version = 'v6'
# Tab-separated table with a header row; the file name suggests it is already
# limited to contigs >= 1 kb (TODO confirm).
df_1000bp_cutoff = pandas.read_csv(new_csv,header=0,delimiter='\t')
# Build each length mask from the frame being filtered. The masks previously
# came from `plasmid_results_df`, which is not defined in this notebook's
# visible cells and would fail (or misalign) on a fresh kernel.
df_2500bp_cutoff = df_1000bp_cutoff[df_1000bp_cutoff['contig_len'] >= 2500]
df_5000bp_cutoff = df_1000bp_cutoff[df_1000bp_cutoff['contig_len'] >= 5000]

In [35]:
# Columns kept for the per-method plasmid summaries; the same selection is
# applied to each length-cutoff frame.
subset_columns = ['name','method','contig','best_hit','best_method','completeness']
subset1000 = df_1000bp_cutoff[subset_columns]
subset2500 = df_2500bp_cutoff[subset_columns]
subset5000 = df_5000bp_cutoff[subset_columns]

In [36]:
# Short-read / long-read best-hit dicts for each contig-length cutoff,
# computed in the order 1000 bp, 2500 bp, 5000 bp.
(sr_pl_1000, lr_pl_1000), (sr_pl_2500, lr_pl_2500), (sr_pl_5000, lr_pl_5000) = (
    make_method_dicts(cutoff_subset)
    for cutoff_subset in (subset1000, subset2500, subset5000)
)

lr_plasmids: {'ESI26H': {'lp36', 'cp32-1', 'cp32-6', 'lp28-4', nan, 'cp32-5', 'lp28-3', 'chromosome', 'lp54', 'cp32-3+10', 'cp32-9', 'cp26', 'cp32-3', 'cp32-9-4', 'lp17', 'lp28-9'}, 'UCT109H': {'lp25', 'cp32-7', 'lp28-2', 'cp32-10', 'lp36', 'cp32-6', 'cp32-4', nan, 'lp28-1', 'cp32-5', 'lp28-3', 'chromosome', 'lp38', 'lp54', 'cp32-9', 'cp26', 'cp32-3', 'lp17'}, 'UCT110H': {'lp28-5', 'cp32-7', nan, 'lp28-1', 'cp32-5', 'chromosome', 'lp36', 'cp32-8', 'lp28-3', 'lp54', 'lp28-6', 'cp26', 'lp17', 'cp32-9', 'cp32-3', 'lp25', 'cp32-6', 'cp32-2', 'lp28-4', 'lp21', 'cp32-11', 'cp32-12', 'lp38'}, 'UCT113H': {'cp32-7', 'lp28-5', nan, 'lp28-1', 'chromosome', 'lp36', 'lp28-3', 'lp54', 'lp28-6', 'cp26', 'lp17', 'cp32-4', 'cp32-9', 'lp25', 'lp28-4', 'lp21', 'cp32-11', 'cp32-12', 'lp38'}, 'UCT29H': {'cp32-7', 'lp28-2', nan, 'lp28-1', 'cp32-5', 'chromosome', 'cp32-10', 'cp32-1', 'lp28-3', 'lp54', 'cp26', 'lp17', 'cp32-4', 'cp32-9', 'cp32-3', 'lp25', 'cp32-6', 'lp21', 'lp28-4', 'cp32-11', 'lp38', 'cp9'},

In [37]:
# Write one presence matrix CSV per contig-length cutoff.
matrix_inputs = (
    ('1000bp', sr_pl_1000, lr_pl_1000),
    ('2500bp', sr_pl_2500, lr_pl_2500),
    ('5000bp', sr_pl_5000, lr_pl_5000),
)
for cutoff_label, sr_by_sample, lr_by_sample in matrix_inputs:
    generate_matrix(sr_by_sample, lr_by_sample, version, cutoff_label)

longread: ESI26H : len:16 : [nan, 'cp32-5', 'chromosome', 'cp32-9', 'cp32-3', 'lp36', 'cp32-1', 'cp32-6', 'lp28-4', 'lp28-3', 'lp54', 'cp32-3+10', 'cp26', 'cp32-9-4', 'lp17', 'lp28-9']
shortread: ESI26 : len: 17['lp28-2', nan, 'cp32-5', 'chromosome', 'cp32-9', 'lp28-7', 'lp36', 'cp32-1', 'cp32-6', 'cp32-8', 'lp28-3', 'cp32-12', 'lp54', 'cp32-3+10', 'cp26', 'lp17', 'lp28-9']
longread: UCT109H : len:18 : ['cp32-7', 'lp28-2', nan, 'cp32-4', 'lp28-1', 'cp32-5', 'chromosome', 'cp32-9', 'cp32-3', 'lp25', 'cp32-10', 'lp36', 'cp32-6', 'lp28-3', 'lp38', 'lp54', 'cp26', 'lp17']
shortread: UCT109 : len: 22['cp32-7', 'lp28-2', nan, 'cp32-4', 'lp28-1', 'cp32-5', 'chromosome', 'cp32-9', 'cp32-3', 'lp25', 'lp36', 'lp28-4', 'cp32-2', 'cp32-6', 'lp56', 'lp28-3', 'cp32-11', 'lp38', 'lp54', 'cp9', 'cp26', 'lp17']
longread: UCT110H : len:23 : ['lp28-5', 'cp32-7', nan, 'lp28-1', 'cp32-5', 'chromosome', 'cp26', 'cp32-9', 'cp32-3', 'lp25', 'lp36', 'cp32-6', 'cp32-2', 'cp32-8', 'lp28-4', 'lp21', 'lp28-3', 'cp