In [19]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import gffutils

PROJECT_PATH = Path("/work/rodrigues_2023/")

In [20]:
counts_path = PROJECT_PATH / "dump/20230503/barcode-counts"
meta_path = PROJECT_PATH / "data/reference/20230503_curated_metadata.csv"

In [21]:
df = [pd.read_csv(file, sep='\t', names=['barcode',str(file.name).split('.')[0]]).set_index('barcode') for file in counts_path.glob('*.counts')]

In [22]:
%%capture output
from functools import reduce
dfm = reduce(lambda left,right: pd.merge(left,right,on=['barcode'],how='outer'), df)

In [23]:
dominant_barcodes = dfm.apply(lambda x: dfm.index[np.argmax(x)],axis=0)
dominant_barcodes.index.name = 'Sample ID'
dominant_barcodes.sort_values().to_csv(counts_path / 'dominant_barcodes.csv')

In [24]:
meta = pd.read_csv(meta_path)
meta

Unnamed: 0,Colony Number,Identification,Sample ID,Mouse ID,Tissue,Media
0,1,C3L Co Mac 1,C3L_CO_MAC1_S12,C3L,CO,MAC
1,2,C4L I Mac1,C4L_I_MAC1_S287,C4L,I,MAC
2,3,C4L I Mac2,C4L_I_MAC2_S171,C4L,I,MAC
3,4,C3L Co TSA1,C3L_CO_TSA1_S7,C3L,CO,TSA
4,5,C3L Co TSA2,C3L_CO_TSA2_S172,C3L,CO,TSA
...,...,...,...,...,...,...
152,153,C1L Co 17-2,17_2_S154,C1L,Co,LB
153,154,C1L Co 17-3,17_3_S155,C1L,Co,LB
154,155,C1L Co 17-4,17_4_S156,C1L,Co,LB
155,156,C1L Co 17-5,17_5_S157,C1L,Co,LB


In [12]:
set(dominant_barcodes.index) - set(meta['Sample ID'])

{'ATEC_BC_S159'}

In [17]:
dominant_barcodes['Mouse ID'].unique()

array(['C3L', 'C4L', 'C2R', 'C1N', 'C3N', 'C1RL', 'C1L', 'C1R', ' C4L',
       'C2RL', 'C4R', 'C2N', 'C4N', 'C3R'], dtype=object)

In [14]:
dominant_barcodes = pd.merge(left=dominant_barcodes.to_frame(name='barcode'), right=meta.set_index('Sample ID'), on='Sample ID')

In [15]:
dominant_barcodes['Mouse+Tissue'] = dominant_barcodes['Mouse ID'] + '_' + dominant_barcodes['Tissue']

In [18]:
dominant_barcodes.to_csv(PROJECT_PATH / "dump/20230503/dominant-barcodes/dominant_barcodes.csv")
dominant_barcodes

Unnamed: 0_level_0,barcode,Colony Number,Identification,Mouse ID,Tissue,Media,Mouse+Tissue
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3L_I_LB4_S201,AGAGTGACTGT,36,C3L I LB4,C3L,I,LB,C3L_I
C4L_I_TSA5_S218,AGAGTGACTGT,53,C4L I TSA5,C4L,I,TSA,C4L_I
C3L_I_LB1_S206,AATGTTACCGA,41,C3L I LB1,C3L,I,LB,C3L_I
1_1_S237,AGAGTGACTGT,72,C2R Co 1-1,C2R,Co,LB,C2R_Co
11_4_S126,AGAGTGACTGT,125,C1N C 11-4,C1N,C,LB,C1N_C
...,...,...,...,...,...,...,...
C3N_CO_LB1_S183,ATAGTGACCAC,17,C3N Co LB1,C3N,CO,LB,C3N_CO
C4R_I_LB2-2_S236,AGAGTGACTGT,71,C4L I LB2,C4L,I,LB,C4L_I
17_2_S154,ACGGTTTCTAA,153,C1L Co 17-2,C1L,Co,LB,C1L_Co
13_2_S134,AGAGTGACTGT,133,C1R Co 13-2,C1R,Co,LB,C1R_Co


In [28]:
dominant_barcodes.reset_index().groupby(['barcode'])['Mouse ID'].agg(lambda x: len(list(set(x)))).to_frame()

Unnamed: 0_level_0,Mouse ID
barcode,Unnamed: 1_level_1
AAAGGAACCGG,1
AAAGTCCCAGA,1
AAAGTCCCTAT,1
AAAGTGACTTT,1
AACGCCACAAA,1
AACGTGCCGGT,1
AAGGAATCACA,1
AAGGATACGGT,1
AAGGCTTCGAG,1
AATGCAACCGT,1


In [20]:
dominant_barcodes.reset_index().groupby(['Mouse ID'])['barcode'].agg(len).to_frame()

Unnamed: 0_level_0,barcode
Mouse ID,Unnamed: 1_level_1
C1L,15
C1N,13
C1R,8
C1RL,1
C2N,11
C2R,14
C2RL,13
C3L,14
C3N,18
C3R,1


In [26]:
dominant_barcodes.reset_index().groupby(['Mouse+Tissue'])['barcode'].agg(len).to_frame()

Unnamed: 0_level_0,barcode
Mouse+Tissue,Unnamed: 1_level_1
C1L_C,5
C1L_CO,5
C1L_I,5
C1N_C,3
C1N_CO,5
C1N_I,5
C1RL_I,1
C1R_C,5
C1R_CO,3
C2N_C,4
