
### Collects all raw UMI and gene counts and merges with splotch metadata

Input files: <br>
Aligned counts: $*$stdata_under_tissue_IDs.txt<br>
Splotch log: WTGF Splotch.e29801978 / ASF Splotch.e29727902<br>
Splotch metadata file: wtgf_metadata.txt / asf_metadata.txt<br>
Annotation files: $*$annotations.txt<br>
Patches: $*$.jpg<br>
Splotch information file: wtgf_information.p / asf_information.p 

In [1]:
import os
import sys
import glob
import scanpy as sc
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import statistics
import statsmodels.api as sm
import matplotlib.patches as mpatches
import scipy.stats
import warnings
import pickle
from itertools import chain
warnings.filterwarnings('ignore')

### Gets seq reads data, annotation and UMI counts

In [2]:
# Every file directly under the aligned-counts directory is treated as one count table.
counts_path = glob.glob(os.path.join('/aligned_counts', "*"))

# One inner list per input file; each inner list holds one value per column of that file.
genes_all, umis_all, names_all, xy_all = [], [], [], []

for sam in counts_path:
    path = os.path.basename(sam)
    print("Processing...", sam)
    df = pd.read_csv(sam, sep = "\t")

    # Non-zero entries per column (cast to bool, then count the True values).
    nonzero_per_column = df.astype(bool).sum(axis = 0)
    genes_all.append(nonzero_per_column.tolist())

    # Column totals.
    column_totals = df.sum(axis = 0)
    umis_all.append(column_totals.tolist())

    # Repeat the file name once per column so the lists stay aligned.
    names_all.append([path] * len(df.columns))

    # Column headers look like "<x>_<y>"; round both parts to whole numbers.
    header_parts = (header.split("_") for header in df.columns)
    xy_all.append([str(round(float(p[0]))) + "_" + str(round(float(p[1]))) for p in header_parts])
   

Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN38_D2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN41_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN46_E2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN44_D1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN73_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN72_E2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN46_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN72_D2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-

In [3]:
# Flatten the per-file nested lists into single lists (one entry per count-table column).
umis_all_flat = list(chain.from_iterable(umis_all))
genes_all_flat = list(chain.from_iterable(genes_all))
# Strip the count-file suffix so only the array name remains.
names_all_flat = [fname.split("_stdata_under_tissue_IDs.txt")[0] for fname in chain.from_iterable(names_all)]
xy_all_flat = list(chain.from_iterable(xy_all))

## Merge UMIs and genes into a df

In [4]:
# Collect umi and gene data into a dataframe.
# Building from a dict of columns lets each column keep its natural dtype
# (numeric counts, string names); a list-of-rows followed by .T would coerce
# every column to object dtype and transpose a very wide intermediate frame.
dfplot = pd.DataFrame({
    'UMIs': umis_all_flat,
    'Genes': genes_all_flat,
    'Name': names_all_flat,
    'x_y': xy_all_flat,
})

# Minimum UMI count an entry must exceed to be kept for downstream QC.
MIN_UMIS = 800

# Filter and renumber the rows in one expression (no in-place edit of a slice).
dfplot_filtered = dfplot[dfplot['UMIs'] > MIN_UMIS].reset_index(drop = True)

## Merge with the number of tissue sections detected per array

In [5]:
# read in summary file from splotch colon prep
# header=None keeps the first log line as data; without it pandas would consume
# that line as the column header and it could never be matched below.
summary_sections = pd.read_csv('/Splotch.e29727896', sep = "\t", header = None, names = ['metadata'])

# Pair each "Keeping N tissue sections" line with the most recent "Processing <file>"
# line instead of zipping two independently filtered lists by position: a file that
# logs "Processing" but never "Keeping" would otherwise shift every later pairing.
# NOTE(review): assumes the log writes the "Keeping" line after the "Processing"
# line of the same file - confirm against the Splotch log format.
samples = []
sections = []
current_sample = None
for line in summary_sections.metadata.dropna().astype(str):
    if 'Processing' in line:
        # The array name is the file basename up to the adjusted-counts suffix.
        current_sample = os.path.basename(line).split("_stdata_adjusted.tsv")[0]
    elif 'Keeping' in line and current_sample is not None:
        # Take the text between "Keeping" and " tissue sections" as the count.
        count_text = line.partition('Keeping')[2].split(" tissue sections")[0]
        samples.append(current_sample)
        sections.append(int(count_text))
        current_sample = None

secs = pd.DataFrame({'Name': samples, 'sections': sections})


## Merge with study design metadata (splotch)

In [7]:
# read in metadata file
meta = pd.read_csv('/metadata.txt', sep = "\t")

# Inner joins on "Name": first the per-array section counts, then the filtered UMI/gene table.
meta_1 = meta.merge(secs, on = "Name", how = "inner")
meta_2 = meta_1.merge(dfplot_filtered, on = "Name", how = "inner")

# Drop arrays for which zero sections were kept, then renumber the rows.
meta_2 = meta_2[meta_2["sections"] != 0].reset_index(drop = True)

# Number of rows remaining per array, stored as a one-column frame indexed by Name.
spots = meta_2.groupby(by = "Name")["sections"].count().to_frame("spots")
meta_3 = meta_2.merge(spots, on = "Name", how = "inner")


## Merge with morphological annotations (MROIs)

In [8]:
# read in all annotations files
# Array names present in the metadata; computed once rather than on every iteration.
known_names = np.unique(meta_3.Name)

anns_all = []
for ann in os.listdir("/annotations"):

    # Skip the macOS folder-metadata file.
    if ann == ".DS_Store":
        continue

    short_name = ann.split("_annotations.txt")[0]

    # Report and skip annotation files whose array is not in the metadata.
    if short_name not in known_names:
        print(short_name)
        continue

    df_ann = pd.read_csv(os.path.join("/annotations", ann), sep = "\t")

    # Round coordinates to whole numbers and store them as strings.
    for axis_col in ('x', 'y'):
        df_ann[axis_col] = [str(round(float(value))) for value in df_ann[axis_col]]
    df_ann['x_y'] = ["_".join(pair) for pair in zip(df_ann['x'], df_ann['y'])]

    anns_all.append(df_ann)

# Stack all annotation tables with a fresh 0..n-1 index.
anns_all_flat = pd.concat(anns_all, ignore_index = True)

# Build the join key "<array name>_<x>_<y>" on both sides.
meta_3["patch"] = ["_".join(pair) for pair in zip(meta_3["Name"], meta_3["x_y"])]
anns_all_flat["patch"] = ["_".join(pair) for pair in zip(anns_all_flat["image"], anns_all_flat["x_y"])]

# Both frames carry an x_y column, so the merge suffixes them; keep the left one.
meta_4 = (
    pd.merge(meta_3, anns_all_flat, on = "patch", how = "inner")
    .drop(["image", 'x_y_y'], axis = 1)
    .rename(columns = {"x_y_x": "x_y"})
)
meta_4.sections = meta_4.sections.astype("int")

# Keep only the first row for each patch key.
meta_4 = meta_4.drop_duplicates(["patch"], keep = 'first')


## Merge with patches

In [9]:
# One list of .jpg base names (extension stripped) per entry under the patches directory.
ls = [
    [os.path.basename(jpg).split(".jpg")[0] for jpg in glob.glob(os.path.join(dirs, "*.jpg"))]
    for dirs in glob.glob(os.path.join('/patches', "*"))
]

ls_all_patches = list(chain.from_iterable(ls))

# Rebuild each name from its first four "_"-separated fields, rounding the 3rd and 4th
# to whole numbers so they match the format of meta_4["patch"].
ls_all = [
    fields[0] + "_" + fields[1] + "_" + str(round(float(fields[2]))) + "_" + str(round(float(fields[3])))
    for fields in (name.split("_") for name in ls_all_patches)
]

# Keep only the rows whose patch key also exists as an image patch.
patches_with_image = np.intersect1d(meta_4.patch, ls_all)
meta_4 = meta_4[meta_4["patch"].isin(list(patches_with_image))]

## Keep only the spots that have normalized data, so that QC is restricted to those

In [10]:
# Load sample_information file
info_file = os.path.join('/splotch_outputs', 'information.p')

# SECURITY: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle once loading is done.
with open(info_file, 'rb') as info_handle:
    info = pickle.load(info_handle)

metadata = info['metadata']
n_levels = info['n_levels']

# Build "<array name>_<coordinate>" keys from the (filename, coordinate) pairs.
# Naming the column at construction also works when the list is empty.
lambda_patches = pd.DataFrame({
    "spotch_patches": [
        os.path.basename(i[0]).split("_stdata_adjusted.tsv")[0] + "_" + i[1]
        for i in info['filenames_and_coordinates']
    ]
})

# Inner join: keep only rows whose patch key appears in the information file.
meta_5 = pd.merge(meta_4, lambda_patches, left_on="patch", right_on="spotch_patches", how = "inner")

In [11]:
# Sanity check: the merged table should have exactly one row per entry of lambda_patches.
lambda_patches.shape[0] == meta_5.shape[0]

True

In [15]:
# Bare string literal used as a section label; the cell only echoes it back as output.
'Clean up metadata to match'

'Clean up metadata to match'

In [11]:
# Tidy the merged table: friendlier column names, the patch key as the index,
# the bare "x_y" coordinate renamed to "patch", and integer dtypes for the count columns.
meta_5 = (
    meta_5
    .rename(columns = {"Level 1": "Mouse", "Level 2": "Type", "value": "annotation"})
    .set_index("spotch_patches")          # index by the key; the column itself is removed
    .drop(columns = ["patch"])            # full key now lives in the index
    .rename(columns = {"x_y": "patch"})
    .astype({"UMIs": int, "Genes": int, "spots": int, "sections": int})
)

In [23]:
# Short codes for the harmonised annotation labels (the values of rename_dict).
# NOTE(review): 'epithelium and mucosa and pellet' and 'rest' are produced by rename_dict
# but have no entry here, so mapping them through this dict yields NaN - confirm intended.
abbreviate_anns_dict = {
    "peyer's patch": "PP",
     'epithelium': "E",
     'epithelium and mucosae':"EMM",
    'epithelium and muscle and submucosa':"ALL",
    'epithelium and mucosae and submucosa':"EMMSUB",
    'crypt apex':"APEX",
    'crypt base and mid':"LOWERMID",
    'crypt base': "BASE",
    'crypt mid': "MID",
    'muscle and submucosa':"MSUB",
    'mucosae and interna':"MMI",
    'externa':"ME",
    'externa and interna':"MEI",
    'interna':"MI",
    'pellet':'PE',
    'mucosa':'MU',
    'mucosa and pellet':'MUPE',
    'epithelium and mucosa':'EMU',
    'crypt apex and mucosa':'APEXMU'
}

# Raw annotation strings -> harmonised labels.
# The key "muscularis mucosae and muscularis propria muscularis interna and peyer's patch"
# used to be listed twice with different values; Python keeps only the last occurrence,
# so the effective value was always "peyer's patch". The overridden first entry
# (value "mucosae and peyer's patch") has been removed; the mapping itself is unchanged.
rename_dict = {
    "epithelium and peyer's patch":"peyer's patch",
    'epithelium and lamina propria' : 'epithelium',
    'epithelium and lamina propria and mucosa' : 'epithelium and mucosa',
    'epithelium and lamina propria and mucosa and pellet': 'epithelium and mucosa and pellet',
    'epithelium and lamina propria and muscularis mucosae' : 'epithelium and mucosae',
    'epithelium and lamina propria and muscularis mucosae and muscularis propria muscularis externa and muscularis propria muscularis interna and submucosa all':'epithelium and muscle and submucosa',
    'epithelium and lamina propria and muscularis mucosae and submucosa all':'epithelium and mucosae and submucosa',
    "epithelium and lamina propria and peyer's patch" : "peyer's patch",
    'epithelium apex of crypt and lamina propria' : 'crypt apex',
    'epithelium base of crypt and epithelium mid crypt and lamina propria' : 'crypt base and mid',
    'epithelium base of crypt and lamina propria' : 'crypt base',
    'epithelium base of crypt and lamina propria and muscularis mucosae and muscularis propria muscularis externa and muscularis propria muscularis interna and submucosa all' : 'crypt base',
    'epithelium base of crypt and muscularis mucosae and muscularis propria muscularis interna': 'crypt base',
    'epithelium mid crypt and lamina propria' : 'crypt mid',
    'mucosa and pellet' : 'mucosa and pellet',
    'muscularis all and submucosa all' : 'muscle and submucosa',
    'muscularis mucosae and muscularis propria muscularis externa and muscularis propria muscularis interna' : 'muscle and submucosa',
    'muscularis mucosae and muscularis propria muscularis interna' : 'mucosae and interna',
    "muscularis mucosae and muscularis propria muscularis interna and peyer's patch" : "peyer's patch",
    "muscularis mucosae and peyer's patch" : "peyer's patch",
    'muscularis propria muscularis externa' : 'externa',
    'muscularis propria muscularis externa and muscularis propria muscularis interna':'externa and interna',
    'muscularis propria muscularis interna':'interna',
    'muscularis propria muscularis externa and muscularis propria muscularis interna and muscularis mucosae and submucosa all' : 'muscle and submucosa',
    'pellet':'pellet',
    "peyer's patch":"peyer's patch",
    'mucosa':'mucosa',
    'epithelium apex of crypt and mucosa':'crypt apex and mucosa',
    "Rest": "rest"}

In [24]:
# Translate the raw annotation strings into the harmonised labels.
meta_5['annotation2'] = meta_5['annotation'].map(rename_dict)

print(meta_5.head())

# Harmonised labels (including NaN for unmapped annotations) that have no abbreviation.
labels_without_abbreviation = [
    label for label in meta_5['annotation2'].unique()
    if label not in abbreviate_anns_dict
]
print(labels_without_abbreviation)

                            Name Mouse Type  sections  UMIs  Genes  patch  \
spotch_patches                                                              
10015CN38_C1_10_10  10015CN38_C1   M1C   GF         3  3068   2067  10_10   
10015CN38_C1_10_11  10015CN38_C1   M1C   GF         3  3334   2302  10_11   
10015CN38_C1_10_12  10015CN38_C1   M1C   GF         3  2915   1987  10_12   
10015CN38_C1_10_13  10015CN38_C1   M1C   GF         3  4282   2598  10_13   
10015CN38_C1_10_14  10015CN38_C1   M1C   GF         3  6652   3632  10_14   

                    spots   x   y                     annotation  \
spotch_patches                                                     
10015CN38_C1_10_10    444  10  10                         pellet   
10015CN38_C1_10_11    444  10  11                         pellet   
10015CN38_C1_10_12    444  10  12                         pellet   
10015CN38_C1_10_13    444  10  13              mucosa and pellet   
10015CN38_C1_10_14    444  10  14  epithelium and la

In [25]:

# Add the short code for each harmonised label; labels without a code become NaN.
meta_5 = meta_5.assign(short_annotations = meta_5["annotation2"].map(abbreviate_anns_dict))


Write the metadata table to disk; this is the final metadata used in the study.

In [26]:
# Persist the final table; note the file is tab-separated despite its .csv extension.
meta_5.to_csv(path_or_buf = "/Metadata_final.csv", sep = "\t")