# Extract data from Supplementary Excel files

The purpose of this notebook is to extract data from the Excel files, clean them, and write them to CSV.


In [1]:
import os
import common

# Assign notebook and folder names. Figures and data produced by this notebook
# go into per-notebook subfolders of the project-wide figure/data folders.
notebook_name = '04_extract_data_from_supplementary_excel_files'
figure_folder = os.path.join(common.FIGURE_FOLDER, notebook_name)
data_folder = os.path.join(common.DATA_FOLDER, notebook_name)
print('Figure folder:', figure_folder)
print('Data folder:', data_folder)

# Make the folders. os.makedirs(..., exist_ok=True) is the pure-Python
# counterpart of `mkdir -p`: it creates missing parent directories and does
# not fail when the folder already exists. Unlike the shell escape it also
# works on platforms without `mkdir -p` and with paths containing spaces.
os.makedirs(figure_folder, exist_ok=True)
os.makedirs(data_folder, exist_ok=True)

Figure folder: ../figures/04_extract_data_from_excel
Data folder: ../data/04_extract_data_from_excel


In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [6]:
# Folder that the supplementary Excel files are read from in the cells below
input_folder = os.path.join(
    common.DATA_FOLDER,
    '00_original',
)

## Cluster markers

These are the genes that were preferentially expressed in individual clusters, according to the original authors' analysis.


The original supplementary file was corrupted and had many problems with the column values, so I fixed them by hand. That is why we use the corrected copy, `mmc4_v2.xlsx`.

In [10]:
filename = os.path.join(input_folder, 'mmc4_v2.xlsx')

# Read the per-cluster marker sheet, skipping the first three rows of the sheet.
# NOTE: the keyword is `sheet_name`; the older `sheetname` spelling was
# deprecated in pandas 0.21 and removed in later releases, where it raises
# a TypeError.
cluster_markers = pd.read_excel(filename,
                                sheet_name='FINAL_MARKERS_FOR_EACH_CLUSTER.',
                                skiprows=3)

# The gene symbols arrive in an unlabeled first column, which pandas names
# "Unnamed: 0"; give it a meaningful name.
cluster_markers = cluster_markers.rename(columns={"Unnamed: 0": 'gene_symbol'})

# Remove any rows with NA because those are all header rows.
# The shapes before and after are printed to show how many rows were dropped.
print(cluster_markers.shape)
cluster_markers = cluster_markers.dropna()
print(cluster_markers.shape)

# Store the cluster label as an integer now that the NA rows are gone.
# astype() with a mapping returns a new frame, which avoids assigning into
# the result of dropna() (a potential SettingWithCopyWarning).
cluster_markers = cluster_markers.astype({'cluster #': int})
cluster_markers.head()

(4292, 5)
(3994, 5)


Unnamed: 0,gene_symbol,myAUC,myDiff,power,cluster #
0,CALB1,0.966,3.615047,0.466,1
1,SLC4A3,0.963,3.448571,0.463,1
2,TPM3,0.965,3.151521,0.465,1
3,SEPT4,0.964,2.939258,0.464,1
4,VIM,0.944,2.937992,0.444,1


In [11]:
# Count how many marker rows are listed for each cluster number
markers_per_cluster = cluster_markers.groupby('cluster #').size()
markers_per_cluster

cluster #
1     190
2     174
3     162
4      84
5     159
6     156
7     164
8     145
9     145
10    120
11    111
12     68
13    163
14    127
15     69
16     97
17     99
18     76
19    115
22     51
23     67
24     49
25     14
26     87
27     27
28     48
29     39
30     60
32     81
33     47
34    147
35    164
36    153
37    236
38    147
39    153
dtype: int64

## Pairwise markers

These are the genes that distinguish sets of cells from each other.



In [9]:
filename = os.path.join(input_folder, 'mmc4_v2.xlsx')

# Read the pairwise marker sheet, skipping the first three rows of the sheet.
# NOTE: the keyword is `sheet_name`; the older `sheetname` spelling was
# deprecated in pandas 0.21 and removed in later releases, where it raises
# a TypeError.
pairwise_markers = pd.read_excel(filename,
                                 sheet_name='PAIRWISE_MARKERS_FINAL.txt',
                                 skiprows=3)
# NOTE(review): the first column is left with pandas' default name
# "Unnamed: 0"; it looks like it holds gene symbols, as in the cluster marker
# sheet -- consider renaming it to 'gene_symbol' once downstream uses are checked.
print(pairwise_markers.shape)
pairwise_markers.head()

(193618, 3)


Unnamed: 0.1,Unnamed: 0,myP,myDiff
0,VIM,0.0,3.399495
1,NDRG1,0.0,3.258577
2,SEPT4,0.0,2.984931
3,CALB1,0.0,2.978919
4,TFAP2B,0.0,2.829322
