In [1]:
import os

import pandas as pd

import common

# Assign notebook and folder names.
# Every figure/data file produced by this notebook goes into a sub-folder
# named after the notebook, under the shared roots defined in `common`.
notebook_name = '01_subset_metadata'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders.
# os.makedirs(..., exist_ok=True) is the pure-Python equivalent of `mkdir -p`:
# it creates missing parent directories, does nothing if the folder already
# exists, and (unlike the shell escape) works on any OS and with paths that
# contain spaces.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)

Figure folder: ../figures/01_subset_metadata
Data folder: ../data/01_subset_metadata


First read in the table that contains all the metadata. Print the length of the table (to know how many cells we are dealing with) and look at the first 5 rows.

In [2]:
# Load the tab-separated sample sheet; index_col=0 makes the first column the row index.
metadata = pd.read_csv("../data/00_original/E-MTAB-5061.sdrf.txt", sep="\t", index_col=0)

# Row count first, then a preview of the first five rows.
print(metadata.shape[0])
metadata.head(5)

3514


Unnamed: 0_level_0,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[organism part],Characteristics[individual],Characteristics[single cell well quality],Characteristics[cell type],Characteristics[disease],Characteristics[sex],Characteristics[age],...,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Protocol REF.3,Derived Array Data File,Comment [Derived ArrayExpress FTP file],Factor Value[disease],Factor Value[cell type]
Source Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ_A1,ERS1348470,SAMEA4437021,Homo sapiens,pancreas,AZ,low quality cell,not applicable,normal,male,43,...,ERX1700346,AZ_A1.fastq.gz,AZ_A1.fastq.gz,ERR1630013,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/003/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,not applicable
AZ_A10,ERS1348471,SAMEA4437022,Homo sapiens,pancreas,AZ,OK,delta cell,normal,male,43,...,ERX1700347,AZ_A10.fastq.gz,AZ_A10.fastq.gz,ERR1630014,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/004/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
AZ_A11,ERS1348472,SAMEA4437023,Homo sapiens,pancreas,AZ,OK,alpha cell,normal,male,43,...,ERX1700348,AZ_A11.fastq.gz,AZ_A11.fastq.gz,ERR1630015,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/005/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell
AZ_A12,ERS1348473,SAMEA4437024,Homo sapiens,pancreas,AZ,OK,delta cell,normal,male,43,...,ERX1700349,AZ_A12.fastq.gz,AZ_A12.fastq.gz,ERR1630016,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/006/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
AZ_A2,ERS1348474,SAMEA4437025,Homo sapiens,pancreas,AZ,OK,gamma cell,normal,male,43,...,ERX1700350,AZ_A2.fastq.gz,AZ_A2.fastq.gz,ERR1630017,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/007/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,gamma cell


We are only going to keep a small number of cells to analyze on our machines locally. Let's define a few cell populations to keep for the analysis. 


I'm going to take a look at the data to see how many cells were collected per patient, per cell type. First, groupby will group the rows by patient age and cell type. Then I will select a specific column to count (in this case 'Comment[ENA_SAMPLE]'), which will give me the number of cells for each combination of age and cell type.

In [3]:
# Allow up to 1000 rows to be displayed so the full table of counts is shown.
pd.set_option('display.max_rows', 1000)

# Group by (age, cell type) and count the non-null sample IDs in each group.
cells_by_age_and_type = metadata.groupby(['Characteristics[age]', 'Characteristics[cell type]'])
cells_by_age_and_type['Comment[ENA_SAMPLE]'].count()

Characteristics[age]  Characteristics[cell type] 
22                    PSC cell                         6
                      acinar cell                      3
                      alpha cell                     136
                      beta cell                       34
                      co-expression cell               6
                      delta cell                       7
                      ductal cell                      8
                      endothelial cell                 1
                      gamma cell                       2
                      not applicable                 180
23                    PSC cell                        10
                      acinar cell                      3
                      alpha cell                      92
                      beta cell                       35
                      co-expression cell               6
                      delta cell                      12
                      ductal cell     

We are going to focus on the endocrine cells, so I am going to subset the metadata to keep only the endocrine cell types.

In [4]:
# Cell-type labels (values of 'Characteristics[cell type]') treated as endocrine.
cell_types_to_keep = [
    'alpha cell',
    'beta cell',
    'gamma cell',
    'delta cell',
    'unclassified endocrine cell',
    'epsilon cell',
]


This is a subset of endocrine cells so I am going to call this subset endocrine_only. With .loc I am selecting rows based on a COLUMN in metadata that is called 'Characteristics[cell type]': I am keeping all the rows whose value in that column is in my list defined above. This cuts it down to 1515 cells.

In [5]:
# Boolean mask: True for rows whose cell type is one of the endocrine labels.
is_endocrine = metadata['Characteristics[cell type]'].isin(cell_types_to_keep)
endocrine_only = metadata[is_endocrine]

# Report how many cells remain, then preview the subset.
print(endocrine_only.shape[0])
endocrine_only.head()

1515


Unnamed: 0_level_0,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[organism part],Characteristics[individual],Characteristics[single cell well quality],Characteristics[cell type],Characteristics[disease],Characteristics[sex],Characteristics[age],...,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Protocol REF.3,Derived Array Data File,Comment [Derived ArrayExpress FTP file],Factor Value[disease],Factor Value[cell type]
Source Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZ_A10,ERS1348471,SAMEA4437022,Homo sapiens,pancreas,AZ,OK,delta cell,normal,male,43,...,ERX1700347,AZ_A10.fastq.gz,AZ_A10.fastq.gz,ERR1630014,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/004/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
AZ_A11,ERS1348472,SAMEA4437023,Homo sapiens,pancreas,AZ,OK,alpha cell,normal,male,43,...,ERX1700348,AZ_A11.fastq.gz,AZ_A11.fastq.gz,ERR1630015,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/005/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell
AZ_A12,ERS1348473,SAMEA4437024,Homo sapiens,pancreas,AZ,OK,delta cell,normal,male,43,...,ERX1700349,AZ_A12.fastq.gz,AZ_A12.fastq.gz,ERR1630016,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/006/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
AZ_A2,ERS1348474,SAMEA4437025,Homo sapiens,pancreas,AZ,OK,gamma cell,normal,male,43,...,ERX1700350,AZ_A2.fastq.gz,AZ_A2.fastq.gz,ERR1630017,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/007/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,gamma cell
AZ_A6,ERS1348478,SAMEA4437029,Homo sapiens,pancreas,AZ,OK,alpha cell,normal,male,43,...,ERX1700354,AZ_A6.fastq.gz,AZ_A6.fastq.gz,ERR1630021,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/001/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell


That's still too many cells. So let's only keep cells from 6 people, 3 healthy and 3 T2D

How many cells came from each person in this group? In this data, each person has a different age, so age identifies the donor. So I am going to count the number of times each age value appears in the age column with value_counts().

This tells me that 234 endocrine cells were collected from the donor that was 57 years old and so on... 

In [6]:
# Number of endocrine cells per age value, most frequent first.
donor_ages = endocrine_only['Characteristics[age]']
donor_ages.value_counts()

57    234
55    230
25    221
22    179
23    178
37    150
52    119
48     80
27     65
43     59
Name: Characteristics[age], dtype: int64

I am going to pick 6 people to keep. 

In [7]:
# Donors to keep, identified by their age (written as strings).
ages_to_keep = ['23','22','25','57','52','55']

# The age column holds integers (elsewhere in this notebook it is compared
# against int(age)), while ages_to_keep holds strings. Series.isin does not
# treat the string '23' and the integer 23 as equal in current pandas, so a
# direct isin() would silently select zero rows. Comparing on the string form
# of the column matches whether the column was parsed as int or as text.
age_as_text = endocrine_only['Characteristics[age]'].astype(str)
endocrine_only_6_samples = endocrine_only.loc[age_as_text.isin(ages_to_keep)]

# Report how many cells remain, then preview the subset.
print(len(endocrine_only_6_samples))
endocrine_only_6_samples.head()

1161


Unnamed: 0_level_0,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[organism part],Characteristics[individual],Characteristics[single cell well quality],Characteristics[cell type],Characteristics[disease],Characteristics[sex],Characteristics[age],...,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Protocol REF.3,Derived Array Data File,Comment [Derived ArrayExpress FTP file],Factor Value[disease],Factor Value[cell type]
Source Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HP1502401_A10,ERS1348567,SAMEA4437118,Homo sapiens,pancreas,HP1502401,OK,beta cell,normal,male,25,...,ERX1700443,HP1502401_A10.fastq.gz,HP1502401_A10.fastq.gz,ERR1630110,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/000/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,beta cell
HP1502401_A12,ERS1348569,SAMEA4437120,Homo sapiens,pancreas,HP1502401,OK,alpha cell,normal,male,25,...,ERX1700445,HP1502401_A12.fastq.gz,HP1502401_A12.fastq.gz,ERR1630112,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/002/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell
HP1502401_A15,ERS1348571,SAMEA4437122,Homo sapiens,pancreas,HP1502401,OK,alpha cell,normal,male,25,...,ERX1700447,HP1502401_A15.fastq.gz,HP1502401_A15.fastq.gz,ERR1630114,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/004/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell
HP1502401_A17,ERS1348573,SAMEA4437124,Homo sapiens,pancreas,HP1502401,OK,alpha cell,normal,male,25,...,ERX1700449,HP1502401_A17.fastq.gz,HP1502401_A17.fastq.gz,ERR1630116,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/006/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,alpha cell
HP1502401_A18,ERS1348574,SAMEA4437125,Homo sapiens,pancreas,HP1502401,OK,delta cell,normal,male,25,...,ERX1700450,HP1502401_A18.fastq.gz,HP1502401_A18.fastq.gz,ERR1630117,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/007/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell


That's still too many cells, so I am only going to keep 50 alpha cells per person in addition to all the rest of the endocrine cells.

There is a lot going on in this loop. I will break down each step. 

I am going to loop over the ages to keep because for each person, I need to grab 50 alpha cells. 

The first row grabs all the rows in the dataframe that come from one person. 

Then I grab only the rows that are alpha cells. 

I define alpha_rows_to_keep as the first 50 alpha cells (head(50)).

I make a new dataframe to be everything from the old one EXCEPT the alpha cells.

Then I make a new dataframe (df_final) with only 50 alpha rows, plus the dataframe that has no alpha cells. 

I print the lengths of each to check that I am really adding 50 for each one. 

Then at the end I am saving that dataframe into a list of dataframes that I opened in the beginning (called all_dfs). After the loop finishes, I am merging all of the dataframes back into one with pd.concat(all_dfs) leaving us with a metadata_subset. 

In [8]:
# Count cells per (cell type, age) among the six selected donors.
cells_by_type_and_age = endocrine_only_6_samples.groupby(['Characteristics[cell type]', 'Characteristics[age]'])
cells_by_type_and_age['Comment[ENA_SAMPLE]'].count()

Characteristics[cell type]   Characteristics[age]
alpha cell                   22                      136
                             23                       92
                             25                      117
                             52                       87
                             55                       96
                             57                      141
beta cell                    22                       34
                             23                       35
                             25                       48
                             52                       11
                             55                       64
                             57                       10
delta cell                   22                        7
                             23                       12
                             25                       21
                             52                        5
                             55       

In [9]:
# Collects one down-sampled dataframe per donor; concatenated after the loop.
all_dfs = []
age_column = endocrine_only_6_samples['Characteristics[age]']

for i in ages_to_keep:
    print("age = {}".format(i))

    # All cells from this donor (ages_to_keep holds strings, hence int()).
    df = endocrine_only_6_samples[age_column == int(i)]

    # Split the donor's cells into alpha cells and everything else.
    is_alpha = df['Characteristics[cell type]'] == 'alpha cell'
    alpha_rows = df[is_alpha]
    df_new = df[~is_alpha]

    # Keep at most the first 50 alpha cells.
    alpha_rows_to_keep = alpha_rows.head(50)
    print(len(df_new))

    # Non-alpha cells first, then the retained alpha cells.
    df_final = pd.concat([df_new, alpha_rows_to_keep])
    print(len(df_final))
    all_dfs.append(df_final)

# Stack the per-donor frames, in the order of ages_to_keep.
metadata_subset = pd.concat(all_dfs)

age = 23
86
136
age = 22
43
93
age = 25
104
154
age = 57
93
143
age = 52
32
82
age = 55
134
184


In [10]:
# Total number of cells in the final subset, followed by a preview.
print(metadata_subset.shape[0])
metadata_subset.head()

792


Unnamed: 0_level_0,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[organism part],Characteristics[individual],Characteristics[single cell well quality],Characteristics[cell type],Characteristics[disease],Characteristics[sex],Characteristics[age],...,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Protocol REF.3,Derived Array Data File,Comment [Derived ArrayExpress FTP file],Factor Value[disease],Factor Value[cell type]
Source Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HP1504901_A10,ERS1349302,SAMEA4437853,Homo sapiens,pancreas,HP1504901,OK,delta cell,normal,male,23,...,ERX1701178,HP1504901_A10.fastq.gz,HP1504901_A10.fastq.gz,ERR1630845,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/005/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
HP1504901_A16,ERS1349308,SAMEA4437859,Homo sapiens,pancreas,HP1504901,OK,beta cell,normal,male,23,...,ERX1701184,HP1504901_A16.fastq.gz,HP1504901_A16.fastq.gz,ERR1630851,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/001/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,beta cell
HP1504901_A21,ERS1349314,SAMEA4437865,Homo sapiens,pancreas,HP1504901,OK,delta cell,normal,male,23,...,ERX1701190,HP1504901_A21.fastq.gz,HP1504901_A21.fastq.gz,ERR1630857,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/007/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,delta cell
HP1504901_A3,ERS1349318,SAMEA4437869,Homo sapiens,pancreas,HP1504901,OK,beta cell,normal,male,23,...,ERX1701194,HP1504901_A3.fastq.gz,HP1504901_A3.fastq.gz,ERR1630861,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/001/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,beta cell
HP1504901_A5,ERS1349320,SAMEA4437871,Homo sapiens,pancreas,HP1504901,OK,gamma cell,normal,male,23,...,ERX1701196,HP1504901_A5.fastq.gz,HP1504901_A5.fastq.gz,ERR1630863,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR163/003/...,P-MTAB-51966,pancreas_refseq_rpkms_counts_3514sc.txt,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,gamma cell


792 cells! Great, we're happy with that number, so let's save the subset as a tab-separated text file.

In [14]:
# Write the subset as a tab-separated, UTF-8 encoded text file in this notebook's data folder.
output_path = data_folder+"/metadata_subset_792_cells.txt"
metadata_subset.to_csv(output_path, encoding='utf-8', sep="\t")