# Single-Cell Data Processing

In [1]:
import numpy as np
import scipy
import scanpy as sc
import pandas as pd

## Functions

In [2]:
def filt(adata,st):
    ## selected dataset:
    gns=['PSMB5',  'PSMB6',  'PSMB7',  'PSMB8',  'PSMB9',  'PSMB10',  'PTPRC']
    
    ## filtering cells with less genes and genes withh less cells
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    rw=adata[:, gns]

    ##raw counts
    if st =='csv':
        y=pd.DataFrame(data=rw.X, index=rw.obs_names, columns=rw.var_names)
    if st== 'sparse':
        y=pd.DataFrame.sparse.from_spmatrix(data=rw.X, index=rw.obs_names, columns=rw.var_names)
    ## Total-count normalize (library-size correct) the data matrix X to 10,000 reads per cell, so that counts become comparable among cells.
    sc.pp.normalize_total(adata, target_sum=1e4)
    adata=adata[:, gns]
    ## storing sparse matrin to pandas
    if st =='csv':
        x=pd.DataFrame(data=adata.X, index=adata.obs_names, columns=adata.var_names)
    if st== 'sparse':
        x=pd.DataFrame.sparse.from_spmatrix(data=adata.X, index=adata.obs_names, columns=adata.var_names)
    #return raw and

    return (x,y)

In [3]:
def avg_score(dt1):
    ## for proteasomes
    dt1['Immunoproteasome']= (1/3)*(np.log2(dt1["PSMB8"]+1)+np.log2(dt1["PSMB9"]+1)+np.log2(dt1["PSMB10"]+1))
    dt1['Const_proteasome']=(1/3)*(np.log2(dt1["PSMB5"]+1)+np.log2(dt1["PSMB6"]+1)+np.log2(dt1["PSMB7"]+1))
    return dt1

In [29]:
combined_st=[] ## All studies

In [9]:
pth='data/Single-cell/'

In [28]:
results='Results/'

## Average Score: Immunoproteasome and Constitutive (10x)


### Breast Cancer 
https://doi.org/10.1038/s41422-020-0355-0 <br>

In [5]:
## Reading counts and features
bcr = sc.read_10x_mtx(
    'Wu_etal_2021_BRCA_scRNASeq/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [6]:
## creating raw and normalized data for plotting
bcr_pd,bcr_raw=filt(bcr,'sparse')

In [104]:
## Meta Data
meta=pd.read_csv(pth+"Breast_meta.csv")
meta.drop(0,axis=0,inplace=True)
meta.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,NAME,Patient,Percent_mito,nCount_RNA,nFeature_RNA,celltype_major,celltype_minor,celltype_subset,subtype,gene_module,Calls,normal_cell_call,CNA_value
1,CID3586_AAGACCTCAGCATGAG,CID3586,1.506221349,4581,1689,Endothelial,Endothelial ACKR1,Endothelial ACKR1,HER2+,no_gene_module,no_scTYPER_call,no_inferCNV_call,no_CNA_value
2,CID3586_AAGGTTCGTAGTACCT,CID3586,5.793742758,1726,779,Endothelial,Endothelial ACKR1,Endothelial ACKR1,HER2+,no_gene_module,no_scTYPER_call,no_inferCNV_call,no_CNA_value
3,CID3586_ACCAGTAGTTGTGGCC,CID3586,1.383238405,1229,514,Endothelial,Endothelial ACKR1,Endothelial ACKR1,HER2+,no_gene_module,no_scTYPER_call,no_inferCNV_call,no_CNA_value
4,CID3586_ACCCACTAGATGTCGG,CID3586,1.923076923,1352,609,Endothelial,Endothelial ACKR1,Endothelial ACKR1,HER2+,no_gene_module,no_scTYPER_call,no_inferCNV_call,no_CNA_value
5,CID3586_ACTGATGGTCAACTGT,CID3586,13.32554062,1711,807,Endothelial,Endothelial ACKR1,Endothelial ACKR1,HER2+,no_gene_module,no_scTYPER_call,no_inferCNV_call,no_CNA_value


In [105]:
## Calculate average proteasme score
bcr_pd['NAME']=bcr_pd.index
norm=meta.merge(bcr_pd,on='NAME')
norm=avg_score(norm)

In [107]:
## save files for plotting
norm.to_csv(results+"/breast_score.csv")

In [14]:
norm.celltype_major.unique()

array(['Endothelial', 'CAFs', 'PVL', 'B-cells', 'T-cells', 'Myeloid',
       'Normal Epithelial', 'Plasmablasts', 'Cancer Epithelial'],
      dtype=object)

In [33]:
## Consistency across cell annotation 
norm['celltype_major']=norm['celltype_major'].str.replace('T-cells','T-cell')
norm['celltype_major']=norm['celltype_major'].str.replace('Cancer Epithelial','Epithelial-cell')
norm['celltype_major']=norm['celltype_major'].str.replace('B-cells','B-cell')
x=norm[['NAME','celltype_major','Immunoproteasome','Const_proteasome']] ## Only tumour population selected
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Breast'
combined_st.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Colon Cancer 


In [113]:
colon_all=[]

#### Study 1
https://doi.org/10.1038/s41588-020-0636-z<br>

In [16]:
colon=sc.read_csv('GSE132465_GEO_processed_CRC_10X_raw_UMI_count_matrix.txt.gz',delimiter="\t")
colon1=colon.transpose()

In [17]:
## creating raw and normalized data for plotting
colon1_pd,colon1_raw=filt(colon1, 'csv')

In [114]:
## Meta Data
meta=pd.read_csv(pth+"GSE132465_GEO_processed_CRC_10X_cell_annotation.txt.gz",sep='\t')
meta

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype
0,SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
1,SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
2,SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
3,SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
4,SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2
...,...,...,...,...,...,...
63684,SMC10-N_TCAGCTCGTAGCGTCC,SMC10,Normal,SMC10-N,Mast cells,Mast cells
63685,SMC10-N_TGACTAGCAGACGCAA,SMC10,Normal,SMC10-N,Mast cells,Mast cells
63686,SMC10-N_TGCTACCGTCTCCATC,SMC10,Normal,SMC10-N,Mast cells,Mast cells
63687,SMC10-N_TTTATGCAGTGTCTCA,SMC10,Normal,SMC10-N,Mast cells,Mast cells


In [115]:
## Calculate average proteasme score
colon1_pd['Index']=colon1_pd.index
norm=meta.merge(colon1_pd,on='Index')
norm=avg_score(norm)

colon1_raw['Index']=colon1_raw.index
raw=meta.merge(colon1_raw,on='Index')
raw=avg_score(raw)

In [116]:
###selecting cells from 25% percentile for Figure 3
colon_all.append(norm[norm['Const_proteasome']>norm['Const_proteasome'].quantile(0.5)])

In [36]:
## Consistency across cell annotation 
norm['Cell_type']=norm['Cell_type'].str.replace('T cells','T-cell')
norm['Cell_type']=norm['Cell_type'].str.replace('Epithelial cells','Epithelial-cell')
norm['Cell_type']=norm['Cell_type'].str.replace('B cells','B-cell')
x=norm[['Index','Cell_type','Immunoproteasome','Const_proteasome']][norm['Class']=='Tumor']
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Colon1'
combined_st.append(x)

#### Study 2
https://doi.org/10.1038/s41588-020-0636-z<br>

In [23]:
colon2=sc.read_csv('GSE144735_processed_KUL3_CRC_10X_raw_UMI_count_matrix.txt.gz',delimiter="\t")
colon2=colon2.transpose()

In [24]:
## creating raw and normalized data for plotting
colon2_pd,colon2_raw=filt(colon2, 'csv')

In [117]:
## Meta Data
meta=pd.read_csv(pth+"/GSE144735_processed_KUL3_CRC_10X_annotation.txt.gz",sep='\t')
#meta['region']=meta['Sample'].str.slice(-1)
meta.head()

Unnamed: 0,Index,Patient,Class,Sample,Cell_type,Cell_subtype
0,KUL01-T_AAACCTGGTCTTTCAT,KUL01,Tumor,KUL01-T,Epithelial cells,CMS1
1,KUL01-T_AAACGGGTCGGTTAAC,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3
2,KUL01-T_AAAGATGGTATAGGGC,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3
3,KUL01-T_AAAGATGGTGGCCCTA,KUL01,Tumor,KUL01-T,Epithelial cells,CMS1
4,KUL01-T_AAAGCAAGTAAACACA,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3


In [118]:
## Calculate average proteasme score
colon2_pd['Index']=colon2_pd.index
norm=meta.merge(colon2_pd,on='Index')
norm=avg_score(norm)

colon2_raw['Index']=colon2_raw.index
raw=meta.merge(colon2_raw,on='Index')
raw=avg_score(raw)

In [119]:
temp2=pd.melt(raw[(raw['Class']!='Normal')&(raw['Cell_type']=='Epithelial cells')], id_vars=[ 'Class', 'Cell_type'], value_vars=['Const_proteasome','Immunoproteasome'])
###selecting cells from 25% percentile for Figure 3
colon_all.append(norm[norm['Const_proteasome']>norm['Const_proteasome'].quantile(0.5)])

In [39]:
norm.Cell_type.unique()
norm['Cell_type']=norm['Cell_type'].str.replace('T cells','T-cell')
norm['Cell_type']=norm['Cell_type'].str.replace('Epithelial cells','Epithelial-cell')
norm['Cell_type']=norm['Cell_type'].str.replace('B cells','B-cell')
x=norm[['Index','Cell_type','Immunoproteasome','Const_proteasome']][norm['Class']=='Tumor']
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Colon2'
combined_st.append(x)

In [122]:
temp2.to_csv(results+'colon2_border.csv')
pd.concat(colon_all).to_csv(results+'colon_all.csv')

### Prostate Cancer 

In [40]:
pros = sc.read_10x_mtx(
    'Prostate/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [41]:
pros_pd,pros_raw=filt(pros,'sparse')

In [42]:
meta= pd.read_csv(pth+"Prostate_meta.csv")

In [43]:
pros_pd['cell_name']=pros_pd.index
norm=meta.merge(pros_pd,on='cell_name')
norm=avg_score(norm)

pros_raw['cell_name']=pros_raw.index
raw=meta.merge(pros_raw,on='cell_name')
raw=avg_score(raw)

In [45]:
norm['cell_type'].unique()

array(['Epithelial', 'Malignant', nan, 'Fibroblast', 'Endothelial',
       'Lymphoid', 'Myeloid'], dtype=object)

In [46]:
norm['cell_type']=norm['cell_type'].str.replace('Malignant','Epithelial-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Prostate'
combined_st.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Liver Cancer 

Data already normalized by log2 (TPM + 1) <br>
https://doi.org/10.1016/j.cell.2020.11.041

In [47]:
liver = sc.read_10x_mtx(
    'LIver/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [48]:
liver_pd,liver_raw=filt(liver,'sparse')

In [49]:
meta=pd.read_csv(pth+"Liver_meta.txt",sep=" ")

 Liver data is already in Log2(TPM+1). So the raw dataframe is converted to antilog for further calculations

In [50]:
liver_counts=(2**liver_raw)-1

In [51]:
liver_counts['cell_name']=liver_counts.index
norm=meta.merge(liver_counts,on='cell_name')
norm=avg_score(norm)

In [52]:
norm.to_csv(results+"liver_score.csv")

In [53]:
norm.head()

Unnamed: 0,index,cell_name,sample,source,disease,cell_type,malignant,cell_subtype_clusters,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC,Immunoproteasome,Const_proteasome
0,1,P01_T_0001,P01,Tumor,Relapsed_HCC,Malignant,yes,C10_Tumor,42.411339,0.205808,0.672493,0.027544,0.193336,0.041238,0.048045,0.1175,2.150667
1,2,P01_T_0003,P01,Tumor,Relapsed_HCC,Malignant,yes,C10_Tumor,0.068287,70.012451,61.24992,0.041238,0.095052,0.068287,0.020729,0.094867,4.068433
2,3,P01_T_0004,P01,Tumor,Relapsed_HCC,NK_cell,no,C4_NK,0.068287,0.244012,139.069611,78.893166,0.269271,74.583519,40.642944,4.301333,2.513433
3,4,P01_T_0007,P01,Tumor,Relapsed_HCC,Malignant,yes,C10_Tumor,0.167158,0.034404,0.027544,0.0,0.020729,0.034404,0.0,0.026133,0.103667
4,5,P01_T_0008,P01,Tumor,Relapsed_HCC,Endothelial,no,C17_Endo.,0.034404,0.379361,0.173648,14.032365,0.212513,0.0,0.054822,1.396,0.247933


In [54]:
norm['cell_type'].unique()

array(['Malignant', 'NK_cell', 'Endothelial', 'Myeloid', 'T_cell', 'HSC',
       'Plasma', 'B_cell', 'Dendritic', 'Epithelial'], dtype=object)

In [55]:
norm['cell_type']=norm['cell_type'].str.replace('Malignant','Epithelial-cell')
norm['cell_type']=norm['cell_type'].str.replace('T_cell','T-cell')
norm['cell_type']=norm['cell_type'].str.replace('B_cell','B-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Liver'
combined_st.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Pancreas Cancer 

In [56]:
pnc = sc.read_10x_mtx(
    'Pancreas/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [57]:
pnc_pd,pnc_raw=filt(pnc,'sparse')

In [58]:
meta=pd.read_csv(pth+"Pancreas_meta.txt",sep=" ")
meta['Type']='Metastasis'
meta['Type'][meta['sample'].str.contains("P0")]='Primary'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
 ## Calculate average proteasme score
pnc_pd['cell_name']=pnc_pd.index
norm=meta.merge(pnc_pd,on='cell_name')
norm=avg_score(norm)

pnc_raw['cell_name']=pnc_raw.index
raw=meta.merge(pnc_raw,on='cell_name')
raw=avg_score(raw)

In [61]:
 ## Calculate average proteasme score
pnc_pd['cell_name']=pnc_pd.index
norm=meta.merge(pnc_pd,on='cell_name')
norm=avg_score(norm)

pnc_raw['cell_name']=pnc_raw.index
raw=meta.merge(pnc_raw,on='cell_name')
raw=avg_score(raw)

In [62]:
norm['cell_type'].unique()

array(['EMT', 'Fibroblast', 'Macrophage', 'Endothelial', 'ETC', 'TIL'],
      dtype=object)

In [63]:
norm.head()

Unnamed: 0,index,cell_name,sample,cell_type,malignant,Type,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC,Immunoproteasome,Const_proteasome
0,1,P03:1,P03,EMT,yes,Primary,2.644803,2.644803,2.644803,2.644803,0.0,0.0,0.0,0.621947,1.865841
1,2,P03:2,P03,EMT,yes,Primary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,P03:3,P03,EMT,yes,Primary,0.0,0.0,0.0,9.212345,0.0,0.0,0.0,1.117414,0.0
3,4,P03:4,P03,Fibroblast,no,Primary,2.883506,2.883506,0.0,2.883506,2.883506,2.883506,0.0,1.95736,1.304907
4,5,P03:5,P03,EMT,yes,Primary,0.0,0.0,0.0,1.631321,1.631321,1.631321,0.0,1.395787,0.0


In [64]:
norm['cell_type']=norm['cell_type'].str.replace('ETC','Epithelial-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']][(norm['malignant']=='yes')&(norm['Type']=='Primary')]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Pancreas'
combined_st.append(x)

### Lung Cancer 

In [65]:
lcs = sc.read_10x_mtx(
    'LC_counts/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True) 

In [66]:
lc_pd,lc_raw=filt(lcs,'sparse')

In [67]:
meta=pd.read_csv(pth+"Lung_metadata.csv.gz")

In [68]:
lc_pd['Cell']=lc_pd.index
norm=meta.merge(lc_pd,on='Cell')
norm=avg_score(norm)

lc_raw['Cell']=lc_pd.index
raw=meta.merge(lc_raw,on='Cell')
raw=avg_score(raw)

In [70]:
## Consistency across cell annotation 
norm.CellType.unique()
norm['CellType']=norm['CellType'].str.replace('T_cell','T-cell')
norm['CellType']=norm['CellType'].str.replace('Cancer','Epithelial-cell')
norm['CellType']=norm['CellType'].str.replace('B_cell','B-cell')
x=norm[['Cell','CellType','Immunoproteasome','Const_proteasome']][norm['CellFromTumor']==True] ## Only tumour population selected
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Lung'
combined_st.append(x)

### Kidney Cancer 

In [71]:
kid = sc.read_10x_mtx(
    'KIdney/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [72]:
kid_pd,kid_raw=filt(kid,'sparse')

In [74]:
meta=pd.read_csv(pth+"Kidney_meta.txt",sep=" ")

In [75]:
meta.head()

Unnamed: 0,cell_name,patient,sample,disease,cell_type,malignant,cell_subtype
1,SI_18854_AAACCTGCAAGTAGTA-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant
2,SI_18854_AAACCTGTCCACTGGG-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant
3,SI_18854_AAACCTGTCCTTTCTC-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant
4,SI_18854_AAACGGGCAAACTGCT-1,SS_2005,SI_18854,Clear_Cell_RCC,Macrophage,no,Macrophage
5,SI_18854_AAACGGGCAAGGTTTC-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant


In [76]:
kid_raw['cell_name']=kid_raw.index
raw=meta.merge(kid_raw,on='cell_name')
raw=avg_score(raw)

kid_pd['cell_name']=kid_pd.index
norm=meta.merge(kid_pd,on='cell_name')
norm=avg_score(norm)

In [78]:
norm.head()

Unnamed: 0,cell_name,patient,sample,disease,cell_type,malignant,cell_subtype,PSMB5,PSMB6,PSMB7,PSMB8,PSMB9,PSMB10,PTPRC,Immunoproteasome,Const_proteasome
0,SI_18854_AAACCTGCAAGTAGTA-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant,1.726817,2.590226,3.453635,6.043861,5.180452,0.0,0.0,1.814693,1.815425
1,SI_18854_AAACCTGTCCACTGGG-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant,2.238263,0.559566,1.119132,6.714789,6.714789,1.119132,0.0,2.326242,1.139946
2,SI_18854_AAACCTGTCCTTTCTC-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant,2.879632,1.919754,0.0,8.638894,8.638894,0.959877,0.0,2.502833,1.167256
3,SI_18854_AAACGGGCAAACTGCT-1,SS_2005,SI_18854,Clear_Cell_RCC,Macrophage,no,Macrophage,0.0,0.0,0.804311,0.804311,1.608622,0.804311,0.804311,1.028728,0.283816
4,SI_18854_AAACGGGCAAGGTTTC-1,SS_2005,SI_18854,Clear_Cell_RCC,Malignant,yes,Malignant,1.742009,3.484017,0.871004,1.742009,5.226026,0.0,0.0,1.364515,1.507946


In [79]:
norm['cell_type'].unique()

array(['Malignant', 'Macrophage', 'T_cell', 'Endothelial', 'vSMC', 'ua',
       'Mast', 'Plasma', 'Epithelial', 'B_cell', 'Monocyte', 'NK_cell',
       'Unassigned', 'Pericyte', 'Mesangial'], dtype=object)

In [80]:
norm['cell_type']=norm['cell_type'].str.replace('Malignant','Epithelial-cell')
norm['cell_type']=norm['cell_type'].str.replace('T_cell','T-cell')
norm['cell_type']=norm['cell_type'].str.replace('B_cell','B-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Kidney'
combined_st.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Ovarian Cancer 

In [81]:
oc = sc.read_10x_mtx(
    'Ovarian/Group1/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [82]:
meta=pd.read_csv(pth+"ovarian_meta.txt",sep=",")
meta.head()

Unnamed: 0,cell_name,sample,patient,cell_type,cluster_old,cluster_new,tsne_x,tsne_y
0,10x_3288_t1_AAACATACCTTCCG-1,3288.1,5,Macrophage,1,13,45.33631,46.93348
1,10x_3288_t1_AAACATACTCCTAT-1,3288.1,5,Macrophage,1,13,35.07609,-20.10105
2,10x_3288_t1_AAACATTGAACTGC-1,3288.1,5,Macrophage,1,13,27.41272,-1.876431
3,10x_3288_t1_AAACATTGCTGACA-1,3288.1,5,Fibroblast,2,8,-12.22481,-86.46082
4,10x_3288_t1_AAACCGTGACAGTC-1,3288.1,5,Fibroblast,2,8,-1.349342,-65.22515


In [83]:
oc_pd,oc_raw=filt(oc,'sparse')

In [84]:
oc_raw['cell_name']=oc_raw.index
norm=meta.merge(oc_raw,on='cell_name')
norm=avg_score(norm)

In [86]:
meta['cell_type'].unique()

array(['Macrophage', 'Fibroblast', 'Malignant', nan, 'B_cell', 'T_cell',
       'Erythrocyte', 'Dendritic'], dtype=object)

In [87]:
norm['cell_type']=norm['cell_type'].str.replace('Malignant','Epithelial-cell')
norm['cell_type']=norm['cell_type'].str.replace('T_cell','T-cell')
norm['cell_type']=norm['cell_type'].str.replace('B_cell','B-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Ovarian'
combined_st.append(x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Average Score: Immunoproteasome and Constitutive (SMRT-seq)

### Oral Cancer 

Primary tumors from 18 treatment-naive patients and matching lymph node metastasis samples
from five of these patients. Cells with less than 2000 counts were excluded.

In [88]:
hnsc = sc.read_10x_mtx(
    'HNSC/',  
    var_names='gene_symbols', # the directory with the `.mtx` file                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

In [89]:
hnsc_pd,hnsc_raw=filt(hnsc,'sparse')

In [90]:
meta=pd.read_csv(pth+"oral_meta.txt",sep=" ")
meta.head()

Unnamed: 0,cell_name,sample,cell_type,subclone,source,malignant
1,HN28_P15_D06_S330_comb,28,Fibroblast,0,Lymph_node,no
2,HN28_P6_G05_S173_comb,28,Fibroblast,0,Primary,no
3,HN26_P14_D11_S239_comb,26,Malignant,1,Lymph_node,yes
4,HN26_P14_H05_S281_comb,26,Fibroblast,0,Lymph_node,no
5,HN26_P25_H09_S189_comb,26,Malignant,1,Lymph_node,yes


In [91]:
meta['cell_type'].unique()

array(['Fibroblast', 'Malignant', 'B_cell', 'Myocyte', 'Macrophage',
       'Endothelial', 'T_cell', 'Dendritic', 'Mast'], dtype=object)

In [92]:
hnsc_raw['cell_name']=hnsc_raw.index
norm=meta.merge(hnsc_raw,on='cell_name')
norm=avg_score(norm)

In [240]:
norm['cell_type']=norm['cell_type'].str.replace('Malignant','Epithelial-cell')
norm['cell_type']=norm['cell_type'].str.replace('T_cell','T-cell')
norm['cell_type']=norm['cell_type'].str.replace('B_cell','B-cell')
x=norm[['cell_name','cell_type','Immunoproteasome','Const_proteasome']][(norm['malignant']=='yes')]
x.columns=['Sample','cell-type','Immunoproteasome','Const_proteasome']
x['Tumour']='Oral'
combined_st.append(x)

Combining all the studies for futher plots

In [364]:
combined=pd.concat(combined_st)
combined.head()

Unnamed: 0,Sample,cell-type,Immunoproteasome,Const_proteasome,Tumour
0,CID3586_AAGACCTCAGCATGAG,Endothelial,1.615873,0.556785,Breast
1,CID3586_AAGGTTCGTAGTACCT,Endothelial,0.921402,0.0,Breast
2,CID3586_ACCAGTAGTTGTGGCC,Endothelial,0.0,0.0,Breast
3,CID3586_ACCCACTAGATGTCGG,Endothelial,0.0,0.0,Breast
4,CID3586_ACTGATGGTCAACTGT,Endothelial,0.924984,0.924984,Breast


In [365]:
combined.to_csv(results+"All_studies.csv",index=None)