In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress

In [4]:
# Load the tab-separated file into `metadata` and preview its first 15 rows.
metadata = pd.read_csv('scRNA_Data/GSE139495_1.tsv', sep='\t')
metadata.head(15)

Unnamed: 0,cells,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,alra_snn_res.0.5,seurat_clusters,samples
0,ACAGCTAAGGGTTCCC,Unfiltered_control,7,4,0.0,9,9,Unfiltered_control
1,ACATACGTCATTTGGG,Unfiltered_control,7,7,0.03601,9,9,Unfiltered_control
2,ACATCAGAGTCGTACT,Unfiltered_control,11,6,0.0,9,9,Unfiltered_control
3,AGAGTGGGTGCAACTT,Unfiltered_control,7,4,0.150754,9,9,Unfiltered_control
4,AGGCCACGTTCGCGAC,Unfiltered_control,9,2,0.0,9,9,Unfiltered_control
5,AGTGGGACATCCGGGT,Unfiltered_control,9,4,0.0,9,9,Unfiltered_control
6,ATCACGAGTCCAGTTA,Unfiltered_control,8,8,0.0,9,9,Unfiltered_control
7,ATCATGGGTCCGACGT,3,9,8,0.0,9,9,Filtered_samples
8,ATTACTCCAGATCTGT,Unfiltered_control,12,5,0.0,9,9,Unfiltered_control
9,CAAGATCGTCCATCCT,3,13,5,0.0,9,9,Filtered_samples


In [7]:
# List the column labels of `metadata`.
metadata.columns

Index(['cells', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'alra_snn_res.0.5', 'seurat_clusters', 'samples'],
      dtype='object')

In [9]:
# Show the distinct values present in the 'orig.ident' column.
metadata['orig.ident'].unique()

array(['Unfiltered_control', '3', '8,9,10', '2', '1', '4', '6', '7', '5',
       '11', 'Failed_capture'], dtype=object)

In [11]:
# Summarize `metadata`: number of rows, per-column non-null counts, and dtypes.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cells             12330 non-null  object 
 1   orig.ident        12330 non-null  object 
 2   nCount_RNA        12330 non-null  int64  
 3   nFeature_RNA      12330 non-null  int64  
 4   percent.mt        12330 non-null  float64
 5   alra_snn_res.0.5  12330 non-null  int64  
 6   seurat_clusters   12330 non-null  int64  
 7   samples           12330 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 770.8+ KB


In [13]:
# 1. Separate the data (data3_renamed) into 2 DataFrames: one for Unfiltered_control, the other for the metastatic patients (1 through 11).
# 2. We will exclude weird values in data4. (Values with - and .)(Optional) (Francisco)
# 3. We will assign patients (control and metastatic) to cells in data4.
# 4. We will train our datasets 
# 5. Elbow method (Deidra)
# 6. We will make our clusters 
# 7. We will assign names to the clusters (immune cell signature genes or profile) (Nurmaa)
# 8. We will compare control and metastatic clusters to see if there are any differences in immune cell profiles (clusters)
# 9. Which gene is highest in the metastatic CTCs (Drivers of metastasis, so targeting these genes may reduce metastasis)
# 10. Which gene is lowest in the metastatic CTCs (suppressed genes in metastasis; increasing those genes may reduce metastasis
#     and improve patient survival)

# For quality control: 

# nCount_RNA	(total RNA count per cell; it must be 500-20000)

# nFeature_RNA	(number of genes detected per cell; it must be at least 200 and at most 2500-3000)

# percent.mt	(percentage of reads mapping to mitochondrial genes; it must be less than 10-20%)


### Quality control pass

* nCount_RNA should be 500-20000
* nFeature_RNA should be 200-3000
* percent.mt should be less than 10%

In [18]:
# Filter the DataFrame with quality-control (QC) measurements.
# QC step 1: keep rows whose nCount_RNA is strictly between 500 and 20000
# (the > and < comparisons exclude both bounds).

metadata_filtered_nCount = metadata[(metadata['nCount_RNA'] > 500) & (metadata['nCount_RNA'] < 20000)]
# Summarize the frame produced by this step. metadata_filtered_nFeature does
# not exist yet at this point in a top-to-bottom run, so it must not be
# referenced here.
metadata_filtered_nCount.info()


<class 'pandas.core.frame.DataFrame'>
Index: 12097 entries, 40 to 12275
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cells             12097 non-null  object 
 1   orig.ident        12097 non-null  object 
 2   nCount_RNA        12097 non-null  int64  
 3   nFeature_RNA      12097 non-null  int64  
 4   percent.mt        12097 non-null  float64
 5   alra_snn_res.0.5  12097 non-null  int64  
 6   seurat_clusters   12097 non-null  int64  
 7   samples           12097 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 850.6+ KB


In [22]:
# QC step 2: starting from the nCount-filtered rows, keep those whose
# nFeature_RNA is strictly greater than 200 and strictly less than 3000.
metadata_filtered_nFeature = metadata_filtered_nCount.loc[
    lambda cells: cells['nFeature_RNA'].gt(200) & cells['nFeature_RNA'].lt(3000)
]
# Report row count, non-null counts and dtypes of the filtered frame.
metadata_filtered_nFeature.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11998 entries, 40 to 12275
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cells             11998 non-null  object 
 1   orig.ident        11998 non-null  object 
 2   nCount_RNA        11998 non-null  int64  
 3   nFeature_RNA      11998 non-null  int64  
 4   percent.mt        11998 non-null  float64
 5   alra_snn_res.0.5  11998 non-null  int64  
 6   seurat_clusters   11998 non-null  int64  
 7   samples           11998 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 843.6+ KB


In [25]:
# QC step 3: starting from the nFeature-filtered rows, keep those whose
# percent.mt value is strictly below 10.
metadata_filtered_percentMT = metadata_filtered_nFeature.loc[
    lambda cells: cells['percent.mt'].lt(10)
]
metadata_filtered_percentMT.info()
# Observation: the row count is unchanged by this step, i.e. every remaining
# cell already satisfied the mitochondrial threshold.

<class 'pandas.core.frame.DataFrame'>
Index: 11998 entries, 40 to 12275
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cells             11998 non-null  object 
 1   orig.ident        11998 non-null  object 
 2   nCount_RNA        11998 non-null  int64  
 3   nFeature_RNA      11998 non-null  int64  
 4   percent.mt        11998 non-null  float64
 5   alra_snn_res.0.5  11998 non-null  int64  
 6   seurat_clusters   11998 non-null  int64  
 7   samples           11998 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 843.6+ KB


In [31]:
# Bind the fully filtered frame to its final name (same object; no copy is made).
metadata_qcPassed = metadata_filtered_percentMT

# Write the QC-passed metadata, including a header row, to a file named
# 'metadata_qcPassed' (no extension) in the current working directory.
# No `sep` is passed, so to_csv's default separator is used.
metadata_qcPassed.to_csv('metadata_qcPassed', header = True)

In [33]:
# Load the second tab-separated file into `data_df` and preview its first rows.
# In the preview the row labels look like gene symbols and the column labels
# like cell barcodes — presumably a gene-by-cell count matrix; confirm against
# the dataset description.
data_df = pd.read_csv('scRNA_Data/GSE139495_2.tsv', sep='\t')
data_df.head()

Unnamed: 0,ACAGCTAAGGGTTCCC,ACATACGTCATTTGGG,ACATCAGAGTCGTACT,AGAGTGGGTGCAACTT,AGGCCACGTTCGCGAC,AGTGGGACATCCGGGT,ATCACGAGTCCAGTTA,ATCATGGGTCCGACGT,ATTACTCCAGATCTGT,CAAGATCGTCCATCCT,...,TCGGGACAGGACTGGT,TCTATTGAGCCTCGTG,TCTTTCCAGTCTTGCA,TGCCAAAAGCGTCAAG,TGCGGGTTCCTCATTA,TGCTACCTCTCGCATC,TGGCTGGTCGGTGTTA,TTAGGACTCCACGACG,TTGGAACGTCCCTTGT,TTTACTGCATCCCACT
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Summarize `data_df`: index range, number of columns, dtypes and memory usage.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33538 entries, MIR1302-2HG to FAM231C
Columns: 12341 entries, ACAGCTAAGGGTTCCC to TTTACTGCATCCCACT
dtypes: int64(12341)
memory usage: 3.1+ GB


In [41]:
# Keep only the rows of `data_df` whose index label contains neither a '.'
# nor a '-' character; every column is retained.
# NOTE(review): this removes any label with a hyphen or dot, e.g. the
# 'MIR1302-2HG' and 'AL627309.1' rows seen in the preview — confirm that
# dropping such entries is intended.
filtered_data_df = data_df.loc[
    lambda genes: ~genes.index.str.contains(r'[.-]')
]
filtered_data_df

Unnamed: 0,ACAGCTAAGGGTTCCC,ACATACGTCATTTGGG,ACATCAGAGTCGTACT,AGAGTGGGTGCAACTT,AGGCCACGTTCGCGAC,AGTGGGACATCCGGGT,ATCACGAGTCCAGTTA,ATCATGGGTCCGACGT,ATTACTCCAGATCTGT,CAAGATCGTCCATCCT,...,TCGGGACAGGACTGGT,TCTATTGAGCCTCGTG,TCTTTCCAGTCTTGCA,TGCCAAAAGCGTCAAG,TGCGGGTTCCTCATTA,TGCTACCTCTCGCATC,TGGCTGGTCGGTGTTA,TTAGGACTCCACGACG,TTGGAACGTCCCTTGT,TTTACTGCATCCCACT
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM87B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DIP2A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
S100B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRMT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MAFIP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
