# Generating setList for Pathway Analysis
- Using the MSigDB C8 cell-type gene sets
- Downloaded Oct 5 2021 (v7.4)
- For PD Burdens project

In [10]:
import pandas as pd
import csv 
import numpy as np

In [23]:
## Make list of all possible genes in incoming file
# Each .gmt line holds: set name, URL, then the gene symbols of that set.
# Splitting on whitespace and dropping the first two fields yields the same tokens
# as the previous awk/tr/sed pipeline, without temp files and without relying on
# the shell (or on IPython leaving awk's `{...}` / `$1` untouched in a `!` line).
with open('c8.all.v7.4.symbols.gmt') as f:
    all_genes = [symbol for line in f for symbol in line.split()[2:]]

# An empty list would make the missing-gene comparison pass vacuously, so fail loudly here.
assert all_genes, "No gene symbols were read from c8.all.v7.4.symbols.gmt"

# Keep producing the one-gene-per-line file the shell pipeline used to leave on disk
with open('file_list_of_genes_c8.txt', 'w') as f:
    f.writelines(symbol + '\n' for symbol in all_genes)

In [24]:
## Load the gene coordinates table (tab-separated)
coordinates_file = pd.read_csv(
    "hg38_refFlat_annotables_BioMart_genes_coordinates_OCT2021.txt",
    sep="\t",
)

# Lookup table: gene_name column -> Coordinates column.
# If a gene_name occurs more than once, the last row wins.
coordinates_dict = {
    symbol: region
    for symbol, region in zip(coordinates_file["gene_name"], coordinates_file["Coordinates"])
}

In [25]:
## Find gene symbols from the gene sets that have no entry in the coordinates dictionary
missing_genes = [symbol for symbol in all_genes if symbol not in coordinates_dict]

# Collapse repeats (a gene can belong to many sets) into a unique list
unique_missing_genes = list(set(missing_genes))
print(unique_missing_genes)
len(unique_missing_genes)

[]


0

In [26]:
## Helper that translates items through a dictionary
def replace(list, dictionary):
    """Return a new list with each item swapped for its dictionary value.

    Items that are not keys of `dictionary` are passed through unchanged, so the
    result always has the same length and order as the input.
    """
    translated = []
    for entry in list:
        translated.append(dictionary.get(entry, entry))
    return translated

# Coordinate prefixes to drop (if there are any). Kept as a real tuple so further
# prefixes can be added; a bare ('CHR_') is just a string.
# NOTE(review): 'CHR_' entries are presumably patch/alternate scaffolds - confirm against the coordinates file.
prefixes = ('CHR_',)

# Converted pathways: each entry is [set name, region, region, ...]
list_of_pathways = []
# Pathways whose number of kept regions differs from the number of genes in the input
list_of_error_pathways = []

# Read in the MSigDB .gmt file (tab-delimited: set name, website link, gene symbols...)
# Loop through the C8 file
with open('c8.all.v7.4.symbols.gmt', 'r') as f:
    reader = csv.reader(f, dialect='excel', delimiter='\t')
    for row in reader:
        if len(row) < 2:
            continue  # Blank or malformed line: nothing to convert

        # Keep the set name apart so it is never run through the symbol mapping or
        # the prefix filter; row[1] (the website link) is discarded.
        set_name, genes = row[0], row[2:]
        number_of_genes = len(genes)

        row2 = replace(genes, coordinates_dict)  # Map the coordinates to the gene symbols
        kept_regions = [element for element in row2 if not element.startswith(prefixes)]  # Remove prefixed entries
        number_of_found_genes = len(kept_regions)

        row3 = [set_name] + kept_regions
        # Compare against the FILTERED list: the mapping itself is 1:1, so only the
        # prefix filter can change the count.
        if number_of_genes != number_of_found_genes:
            list_of_error_pathways.append(row3)
        list_of_pathways.append(row3)  # Add to the list of lists

In [27]:
## Sanity check, see if any of the now converted pathways have more or fewer genes than the input file
# list_of_error_pathways is filled by the conversion loop; 0 means no pathway was flagged
len(list_of_error_pathways)

0

In [28]:
# Save out the file
# One pathway per line, comma-separated: the set name first, then its regions.
# csv.writer ends rows with '\r\n' by default; force '\n' so the last region on each
# line does not carry a stray carriage return into downstream tools.
with open('c8.all.v7.4.symbols.setFile.Oct52021.txt', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows(list_of_pathways)

In [29]:
# No idea how to do this in Python but use bash to replace the FIRST comma with a tab and remove the temp file
! sed 's/,/'$'\t''/' c8.all.v7.4.symbols.setFile.Oct52021.txt > c8.all.v7.4.symbols.setFile.Oct52021.final.txt
! rm c8.all.v7.4.symbols.setFile.Oct52021.txt
! wc -l c8.all.v7.4.symbols.setFile.Oct52021.final.txt

     671 c8.all.v7.4.symbols.setFile.Oct52021.final.txt


In [31]:
# Whole-word (-w) search of the raw .gmt for the symbol GBA; no output means no line matched
! grep -w "GBA" c8.all.v7.4.symbols.gmt # GBA not in the C8 files 

In [34]:
# Whole-word (-w) search of the raw .gmt for LRRK2, showing at most the first 3 matching gene sets
! grep -w "LRRK2" c8.all.v7.4.symbols.gmt | head -3 # LRRK2 is in C8 files lets check if its in the setFile

FAN_EMBRYONIC_CTX_OPC	http://www.gsea-msigdb.org/gsea/msigdb/cards/FAN_EMBRYONIC_CTX_OPC	PDGFRA	EPN2	SCRG1	PLPPR1	APOD	ARL4A	COL9A1	PCDH15	BEX1	MEG3	GRIA2	THY1	PLAT	LINC00643	PLLP	BCHE	LHFPL3	LRRC4C	GPR17	LIMA1	COL20A1	CA10	CNTN1	PHLDA1	SLC35F1	SEMA5A	SNAP25	KLRC2	PCSK1N	IL1RAP	SCN1A	SPRY4	SCG3	MPZL1	PDE4B	BRINP1	VXN	TAOK3	OLIG1	NXPH1	ARL2BP	TSPAN13	UCHL1	SLC1A1	ATCAY	C2orf80	CXADR	LRRK2	SNTG1	CCND1	TM4SF1	FIP1L1	PCDH20	BRINP3	CNPY2
MANNO_MIDBRAIN_NEUROTYPES_HMGL	http://www.gsea-msigdb.org/gsea/msigdb/cards/MANNO_MIDBRAIN_NEUROTYPES_HMGL	A2M	ABCG1	ABI3	ACSF2	ACSL5	ACY3	ADAM28	ADAMTSL4-AS1	ADAP2	ADAT2	ADCK2	ADCY7	ADORA3	ADPGK	AFF1	AGR2	ALOX5	ALOX5AP	ANKRD44	ANXA11	ANXA4	ANXA5	AP1B1	APBB1IP	APOBEC3C	APOBEC3D	APOBR	APOC1	APOC2	APOE	ARHGAP25	ARHGAP26	ARHGAP4	ARHGDIB	ARHGEF6	ARPC1B	ARRB2	ARRDC2	ASAH1	ATF3	PGGHG	ATP8B4	AXL	B2M	B3GNT5	B4GALT1	BAG3	BCAP29	BCL2A1	BCL3	BDNF	BHLHE41	BIN1	BIN2	BIRC3	BLNK	BLVRB	BMP2K	BRI3	BTK	C12orf75	C17orf67	PITHD1	C1orf54	C1QA	C1QB	C1QC	C3	C3AR1	CAPG	CAPZB	CARD1

In [36]:
# Search the final setFile for the region string 12:40196743-40369285, showing at most 2 matching lines
# NOTE(review): presumably this is the LRRK2 entry from the coordinates file - confirm
! grep -w "12:40196743-40369285" c8.all.v7.4.symbols.setFile.Oct52021.final.txt | head -2 # Right where you expect them

FAN_EMBRYONIC_CTX_OPC	4:54229279-54298245,17:19215614-19336715,4:173384700-173406380,9:101028726-101325135,3:195568704-195584033,7:12686855-12690958,6:70215060-70303083,10:53802770-55627942,X:103062650-103064171,14:100779409-100861031,4:157204181-157366075,11:119415475-119424985,8:42174717-42207676,14:62114352-62130962,16:57248546-57284672,3:165772903-165837462,7:104328699-104907232,11:40114202-41459773,2:127645863-127652639,12:50175787-50283546,20:63293185-63334851,17:51630312-52160017,12:40692438-41072415,12:76025446-76033932,6:117907263-118317676,5:9035032-9546075,20:10218829-10307418,12:10426853-10442300,X:48831095-48835610,3:190514050-190659750,2:165984640-166149214,5:142310426-142326455,15:51681491-51721026,1:167721191-167791919,1:65792513-66374579,9:119153457-119369435,8:66493519-66518524,12:118149800-118372907,21:33070140-33072413,7:8433608-8752961,16:57245258-57253635,7:16753754-16784536,4:41256412-41268455,9:4490467-4587469,19:3879863-3928082,2:208165342-208190030,21:17513042

In [37]:
# Look for any "_alt" text in the final setFile; no output means no line contains it
# NOTE(review): presumably guards against alternate-contig coordinates - confirm
! grep "_alt" c8.all.v7.4.symbols.setFile.Oct52021.final.txt

In [38]:
# Confirm no "CHR_" entries survived the prefix filter in the conversion loop; no output means none remain
! grep "CHR_" c8.all.v7.4.symbols.setFile.Oct52021.final.txt