# Create H5 Structure for Gasperini Data

This notebook creates a h5 data structure that contains all the information from the Gasperini et al. dataset, including the gene counts matrix, target site-cell matrix, covariates, and guide efficiencies. The h5 structure allows for faster reading of matrices, which improves runtime when training models at scale.

Author: Karthik Guruvayurappan

In [41]:
library(Matrix)
library(rhdf5)

In [2]:
# create matrix to hold Gasperini data
h5createFile('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5')

## Create dataframe of cell covariates

In [3]:
# read in covariates dataframe from paper
covariates <- read.table('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.phenoData.txt.gz')
covariate.names <- read.table('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale.phenoData.colnames.txt')$V1
colnames(covariates) <- covariate.names
head(covariates)

Unnamed: 0_level_0,sample,cell,total_umis,Size_Factor,gene,all_gene,barcode,read_count,umi_count,proportion,guide_count,sample_directory,ko_barcode_file,id,prep_batch,within_batch_chip,within_chip_lane,percent.mito
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,1A_1_SI-GA-E2,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,17572,1.0096818,chr10.845_top_two_chr1.11183_top_two_chr1.11293_top_two_chr11.1791_top_two_chr1.12598_top_two_chr1.12806_top_two_chr11.5089_top_two_chr12.283_top_two_chr12.3086_top_two_chr12.4956_top_two_chr13.188_top_two_chr14.1937_top_two_chr14.986_top_two_chr1.5035_top_two_chr15.3069_top_two_chr15.3317_top_two_chr15.3544_top_two_chr16.1866_top_two_chr16.2150_top_two_chr17.3444_top_two_chr17.6206_top_two_chr18.1149_top_two_chr18.252_top_two_chr18.398_second_two_chr18.575_top_two_chr1.8819_top_two_chr18.960_top_two_chr19.2258_top_two_chr19.2871_top_two_chr20.2100_top_two_chr20.282_top_two_chr21.339_top_two_chr2.1667_top_two_chr22.1035_top_two_chr2.2107_top_two_chr22.1271_top_two_chr22.844_top_two_chr2.5796_top_two_chr3.4238_top_two_chr3.767_second_two_chr4.1285_top_two_chr4.1381_top_two_chr4.1723_top_two_chr4.2949_top_two_chr4.3273_top_two_chr5.1426_top_two_chr5.2703_top_two_chr5.4895_top_two_chr6.1373_top_two_chr6.239_top_two_chr6.4987_top_two_chr6.5076_top_two_chr6.5348_top_two_chr7.1421_top_two_chr7.2879_top_two_chr7.4321_top_two_chr7.922_top_two_chr8.1463_top_two_chr8.2152_top_two_chr8.2650_top_two_chr9.1077_top_two_chr9.2443_top_two_chrX.1277_top_two_chrX.600_top_two_EEF2_TSS_MRPL9_TSS_PARK7_TSS,chr10.845_top_two_chr1.11183_top_two_chr1.11293_top_two_chr11.1791_top_two_chr1.12598_top_two_chr1.12806_top_two_chr11.5089_top_two_chr12.283_top_two_chr12.3086_top_two_chr12.4956_top_two_chr13.188_top_two_chr14.1937_top_two_chr14.986_top_two_chr1.5035_top_two_chr15.3069_top_two_chr15.3317_top_two_chr15.3544_top_two_chr16.1866_top_two_chr16.2150_top_two_chr17.3444_top_two_chr17.6206_top_two_chr18.1149_top_two_chr18.252_top_two_chr18.398_second_two_chr18.575_top_two_chr1.8819_top_two_chr18.960_top_two_chr19.2258_top_two_chr19.2871_top_two_chr20.2100_top_two_chr20.282_top_two_chr21.339_top_two_chr2.1667_top_two_chr22.1035_top_two_chr2.2107_top_two_chr22.1271_top_two_chr22.844_top_two_chr2.5796_top_two_chr3.4238_top_two_chr3.767_second_two_chr4.1285_top_two_chr4.1381_top_two_chr4.1723_top_two_chr4.2949_top_two_chr4.3273_top_two_chr5.1426_top_two_chr5.2703_top_two_chr5.4895_top_two_chr6.1373_top_two_chr6.239_top_two_chr6.4987_top_two_chr6.5076_top_two_chr6.5348_top_two_chr7.1421_top_two_chr7.2879_top_two_chr7.4321_top_two_chr7.922_top_two_chr8.1463_top_two_chr8.2152_top_two_chr8.2650_top_two_chr9.1077_top_two_chr9.2443_top_two_chrX.1277_top_two_chrX.600_top_two_EEF2_TSS_MRPL9_TSS_PARK7_TSS,AGAAAGCTCCTCCAGTTCAC_TGATCGCTTTGACTGTGACA_ACAATAAAGAACAGAACACA_GTAAATTGAGACCTCAGGAG_TCTTCCCCCCACCAATAACA_GAGAAAAAAACAATTCAGGC_TCTTAGAGTTCACAGAAGAA_GCTGGGAATTTCTCTCCTGG_AGTGTAACAGAATATCAAAT_ACCCACTGTGACTAGACAAA_AGAAGGATAGAGACTGCTGG_CCAGGCACTTGTGAGAACAA_TGATGGTGTCCCCACCCAAA_GCAGGCCCCATGGATACCCG_AAGGAGTGTGTTCCACACCA_GTACCCTCCCTACCCCCGAG_TGCCTGCTAGAGTCAATAGG_AATGCCAGTTTCCCCCACAG_GCCATTGCTGTAGAGACACT_GCCCAGTCAGAACCCAGGAA_GCACAGATTTACACGCCCGT_GAGAGGCTGCCAGCCCACAG_TGTTCTATATTGCCACCTAG_AGCATCAAATTGCAGAGCAG_ACGAAGAATGAATTGAAGGG_CTGTTTCAGAAAGCTCCCAA_ACTTTGAGCTGCTTCAAGGG_GAGACCTTCCCCCTACCCAG_TTTCCCCGCTGACAGACTGA_ATGACTGCCCCCAGCAGCAA_GGCGCAAAGACAGTGCCAGA_CTCTGACTCACACAACAGGA_GCAAGTTTGCTTTCTCCTGG_TTGAAAGACACATAGCACGA_TCACAGATATTGACTGCCCT_ACAACCCCAAGAACTAGCGG_GCAGCGAAGCTGTTCCACCA_TGCTGCATCCAGATGTTACG_AGACACAGTCAAATGAGGCA_AGCTCTGTCAACCTGCCATG_TCTGCCTGAATGTTTCTCAG_CTGTTTATACCGAGCAGTAG_TGAGCTCCGCCTACACACGG_TTAGTAGAGTGTAGACTGGG_CTCCTTACCCCAGCCAATCG_GGGCCATTACCTTTGCAGAG_GGAGCCCACGACGCTCAAGG_GTTTCCTTATGTATACGTGG_AGAACTCTCATGCGGTCAGG_GAGCTTACTCATGTCCCCCA_CTGACTCTGAGACAGCCAAG_ACCAGAGGAGAGATCAAAGG_GATAAGCAGGAATAAAATCA_AATCACAAGGAAGCAAAGAG_GCAGCTAGATAGATAAGGGG_TTCCATCCTGGGCATGACCA_GGTCGCAGATAGGTGACGGA_GCTCCTGCTTCACTAACCCT_TCAGGGAGCACCATACTGCG_AGAATAAGAGAAGTTCCTGC_ATAAGCTCTTGAGACCTGGG_ACCTCAATACTGCTTACAGA_ATCGGTGGCAGCCAGCTCCA_AGGCTGGTCTGTCTTCCCGA_CGAGTCGCGCCGAGGATGGG_CGGGCGCCGCCATGTTCACG_GTACCACTCACCCCACACCG,14135,964,0.9698189,67,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.058786706
2,1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,8923,0.9396766,chr1.12695_top_two_chr11.3294_top_two_chr1.6799_second_two_chr17.695_top_two_chr20.2033_top_two_chr20.314_top_two_chr2.403_second_two_chr2.403_top_two_chr2.4480_top_two_chr2.5077_top_two_chr3.2983_top_two_chr3.5568_top_two_chr5.2905_top_two_chr6.1406_top_two_chr6.1523_second_two_chr6.1669_top_two_chr6.2579_top_two_chr6.4327_top_two_chr6.4750_top_two_chr7.4162_top_two_chr7.4555_top_two_ID2_TSS_MYC_TSS_NXT1_TSS_TMEM14A_TSS_TXN2_TSS,chr1.12695_top_two_chr11.3294_top_two_chr1.6799_second_two_chr17.695_top_two_chr20.2033_top_two_chr20.314_top_two_chr2.403_second_two_chr2.403_top_two_chr2.4480_top_two_chr2.5077_top_two_chr3.2983_top_two_chr3.5568_top_two_chr5.2905_top_two_chr6.1406_top_two_chr6.1523_second_two_chr6.1669_top_two_chr6.2579_top_two_chr6.4327_top_two_chr6.4750_top_two_chr7.4162_top_two_chr7.4555_top_two_ID2_TSS_MYC_TSS_NXT1_TSS_TMEM14A_TSS_TXN2_TSS,GTAGAGCCTCCAGAACTGTG_AGGTTTATCCAGATGAACTG_CATCTGGTAGCCTCACACCG_GCAGGCTCCCTTAATGCAGG_TTTCTGGCCAATTAGAACCG_GGTCCTGTGCTGTCCCCAAG_GGTCAGCTTGTTTATCCCTG_GCAGCTGCTGGGAATAGCAT_AGCTGGCACCCAGACACTGA_TTAACCACAAAGTAGCATGG_GGAGCAGGTGGGAAACTGGA_TGGCCCCACCAAATGCTGCA_GTGTTTCTGAGCTGTCACCA_AGAAATCTGGGAGAAAGGCG_CATGCCATGGGACTTCCCCA_GGGTGTTGACGACAAACAGA_GTTGCCGTGGTTATATACCG_GCTGATCTGAAACTAAACCG_CTAGCAGTAGTCAGTGGAGG_CAGAACCCATCCCCCACGGA_GGATTCCTCCTGCTAAACGT_CTCACTCACTCCGCCTCTAG_TAATTCCAGCGAGAGGCAGG_CTCGCGTCTGCGTGGAGACG_AGACGGCTGGGCGCCGAGTG_CGACAGGCGTGCCCTTGACG,4329,293,0.8443804,26,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.036086518
3,1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,14637,0.9908029,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10.350_top_two_chr10.4094_top_two_chr10.454_top_two_chr1.11338_top_two_chr11.2186_top_two_chr11.3163_top_two_chr11.3769_top_two_chr11.5908_second_two_chr12.966_top_two_chr1.3124_top_two_chr15.3008_top_two_chr16.1496_top_two_chr16.2611_top_two_chr16.5173_top_two_chr17.2188_top_two_chr1.7246_top_two_chr17.3741_top_two_chr17.5668_top_two_chr1.7903_top_two_chr18.249_top_two_chr1.8657_top_two_chr19.2423_top_two_chr19.309_top_two_chr19.367_top_two_chr19.5650_top_two_chr21.584_top_two_chr2.1716_second_two_chr2.1833_top_two_chr2.5001_top_two_chr2.5074_second_two_chr2.6078_top_two_chr2.6480_top_two_chr4.3242_top_two_chr4.52_top_two_chr5.1197_top_two_chr5.2062_top_two_chr5.5177_top_two_chr6.1077_top_two_chr6.2141_top_two_chr6.2186_top_two_chr6.2485_top_two_chr6.2726_top_two_chr6.4052_top_two_chr6.704_top_two_chr6.970_top_two_chr7.5935_top_two_chr7.851_top_two_chr8.3569_top_two_chr8.476_top_two_chr9.794_top_two_chrX.2199_top_two_chrX.752_top_two_CYB5B_TSS_DCTPP1_TSS_HDDC3_TSS_NDUFC1_TSS_TAF12_TSS_TIPRL_TSS,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10.350_top_two_chr10.4094_top_two_chr10.454_top_two_chr1.11338_top_two_chr11.2186_top_two_chr11.3163_top_two_chr11.3769_top_two_chr11.5908_second_two_chr12.966_top_two_chr1.3124_top_two_chr15.3008_top_two_chr16.1496_top_two_chr16.2611_top_two_chr16.5173_top_two_chr17.2188_top_two_chr1.7246_top_two_chr17.3741_top_two_chr17.5668_top_two_chr1.7903_top_two_chr18.249_top_two_chr1.8657_top_two_chr19.2423_top_two_chr19.309_top_two_chr19.367_top_two_chr19.5650_top_two_chr21.584_top_two_chr2.1716_second_two_chr2.1833_top_two_chr2.5001_top_two_chr2.5074_second_two_chr2.6078_top_two_chr2.6480_top_two_chr4.3242_top_two_chr4.52_top_two_chr5.1197_top_two_chr5.2062_top_two_chr5.5177_top_two_chr6.1077_top_two_chr6.2141_top_two_chr6.2186_top_two_chr6.2485_top_two_chr6.2726_top_two_chr6.4052_top_two_chr6.704_top_two_chr6.970_top_two_chr7.5935_top_two_chr7.851_top_two_chr8.3569_top_two_chr8.476_top_two_chr9.794_top_two_chrX.2199_top_two_chrX.752_top_two_CYB5B_TSS_DCTPP1_TSS_HDDC3_TSS_NDUFC1_TSS_TAF12_TSS_TIPRL_TSS,CCAAGGCGTCCTCAGACCAG_AGCTCCAGGAAGGACCCCCG_TCACTCTCAGAGATAGTCTG_GGTAATCCACCCCACCCTAT_GGTTATCTGACTCACTGCAA_ACTCTGGGAACCTTAGATAA_GTTGTCCTTCAGGGACAGTG_GGAGACTTGCCTGGAATGAA_GTGCCTGCAGATCCTCCCAG_GGTCTGGTTGAAGCACACAG_GCCAAACCAACTTTGGGAGG_ACAGGCATGCGGCAACTGGA_ACTTCCTAGAAAACACCTGG_GCAGTCTCCCTTCTTCCACC_GAAAACGGCTGAGCAACGTT_CCGGATGGATGGGACGAGAC_ATTCACAGTGAGGGTCCTGG_ATAATTAACGTTAAACCAAT_AACATGGAAATGTCAACCAA_TGTCCTGCATCACTCAACAA_GTTCAATTCAGTCCCCACAG_TTCTTAGGTTCATCTCTGGC_GGTCCTAAAAAGAAGTAACG_CAGAGCAGCCACATGAGGAA_GCTGGACGTCAGCCCTGCCA_GGATTATTCCGGCCTGTCCA_ATTTCGACCCCACAGCCGCC_GGACAGTAATAGAAACAGGG_GCTTAGGGATCCTTGTAGGA_CGTTGTTGAGAACAGGACAG_ACTATGTGAATCTCTCCACA_TTAGATACCTGGCAGCAAAG_GCTACTGTCCAGCCCGCAGG_GCTGAAGGACTCTTCGAGCG_CATCTGTAATCTGACCTGTA_TATAGCTAAGAACTAAGCAT_GCATTTATCCTTCAGCCCAG_CTATTCCACGACTTGAACAA_GCCGGAGTCCCACTCCACGT_AGACTGAACCCAGCTTCACG_GCCAACTCCTTTCATTACTA_AGGCAGTTGAAGCCTGCAGG_GCAGCACTGCCACTGACGAG_GCATCAAGACTTGAACACAG_TCACTTGAAGAAAACAGCAG_CCATATCTCTGACTGCCCAC_TCTGCACCGTACTCTTTGGA_GCATGTTTAGTTGCTAGGAA_GCGCGCACACACAGTCCCCC_GTTGTATGACCAGCCCGTAT_GGAAAACCTGAGTAAGCAGG_AGTGGCGAGTGAAGACCTGA_CCTGGGCTGGTTGAGTTCCA_TTGGGATTAGTCATTTGTGG_AGTTAAGGAAGAGAAAAGGG_TGGCGACTGCGGAAGCTAGG_GCGGTGGGCGGCATGTCTGG_CGCGCGATGGGCTCTGAGGG_GAAAGGGGACGCAGCAAGGG_TGAGACGAACGCTTCACTGG_AACGGCTCGGAAGCCTAGGG,12362,884,0.9505376,61,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.069823051
4,1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,22798,1.0365782,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_two_chr14.2480_second_two_chr15.1531_top_two_chr15.3325_top_two_chr16.1863_top_two_chr16.1989_top_two_chr16.4195_top_two_chr1.6717_top_two_chr1.7355_top_two_chr18.1316_top_two_chr1.9549_top_two_chr20.2456_top_two_chr20.392_top_two_chr22.1453_top_two_chr3.3716_top_two_chr3.4901_top_two_chr3.5014_top_two_chr3.5543_top_two_chr4.1771_top_two_chr4.2248_top_two_chr4.260_top_two_chr4.2656_top_two_chr5.1201_second_two_chr5.2703_top_two_chr5.2943_top_two_chr6.1291_top_two_chr6.3912_top_two_chr6.4985_top_two_chr7.2653_top_two_chr7.2716_top_two_chr8.1450_second_two_chr8.2681_top_two_chr9.1167_top_two_chrX.2073_top_two_DHX29_TSS_FAM96A_TSS_HDDC3_TSS,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_two_chr14.2480_second_two_chr15.1531_top_two_chr15.3325_top_two_chr16.1863_top_two_chr16.1989_top_two_chr16.4195_top_two_chr1.6717_top_two_chr1.7355_top_two_chr18.1316_top_two_chr1.9549_top_two_chr20.2456_top_two_chr20.392_top_two_chr22.1453_top_two_chr3.3716_top_two_chr3.4901_top_two_chr3.5014_top_two_chr3.5543_top_two_chr4.1771_top_two_chr4.2248_top_two_chr4.260_top_two_chr4.2656_top_two_chr5.1201_second_two_chr5.2703_top_two_chr5.2943_top_two_chr6.1291_top_two_chr6.3912_top_two_chr6.4985_top_two_chr7.2653_top_two_chr7.2716_top_two_chr8.1450_second_two_chr8.2681_top_two_chr9.1167_top_two_chrX.2073_top_two_DHX29_TSS_FAM96A_TSS_HDDC3_TSS,GGCGTCAGTCGAGGAGTCAG_GCCAGCACTTCAGCTCACCG_GCTGAAGTCGAGGTGTGGCA_GTGACAGTGGGCATGAACCT_CCCACTTACCATGTGCCGGG_GAGACCCAAGCCAGCAGCCA_GCTACTTCAGAGTTAGCCAA_GGCTGCAACAATGAACCCCA_CTACTGAGCAGGGCAGGCCA_TTTGTTCCAGAACTTTCCAG_AGTACTACAGAAGACTGCTG_GGAGGCCTAACGAAGTGAGA_ATTTAGAACTAGGATGCGGG_GAGATCCCACCAAGTGCCTA_TTGCCACTCATCATCCAGCA_TTATTTGCAGAATCAAGGGC_TCTCCTTCCCTGTTGCTGCA_CTGGATGGCTAACCTGTTGA_CAGTTTCCATTAACTTTAAA_GTGGCTCTCCAGCTCCCCCA_AGCAATGAGGAAGTTCGAAC_TGCCTGTGTGCTAAGATAAA_TTATCAGGCCAGCCAGGCCA_GGAGAGATAACAATTCTGCT_GCACCGTTATGGCGCCTAAA_GCAGGCCTCCTTGAGCGTCG_TCAGCCTGTGGGTCTGCCCA_GGATGAGAACCAGATCACCA_GTGCTTGTACTGCCATCTAG_ACTAACGCAGACCCTTAGCA_GTGTGTGGTGAAGATCTAGG_TATCTCTAAACAGCACACTC_CATCAGTTCTGGTTACCCAC_GCAGCCTGCACACTACTGAG_GGACGCTAAAAATAGCCCAG_ATTGATGTCAAGGCATTGAG_AGAGCTCTCGGCTGTGCAGG_AGGAAGTTGATCAATCCCGG_TCGCAAGCACCGGCAGCAGG,7459,544,0.9395509,39,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.026186508
5,1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,10136,0.9528436,chr10.185_top_two_chr10.484_top_two_chr11.4167_top_two_chr12.528_top_two_chr13.465_top_two_chr14.2471_top_two_chr15.3685_top_two_chr15.36_top_two_chr1.640_top_two_chr18.1335_top_two_chr18.1476_top_two_chr18.300_second_two_chr1.8584_top_two_chr18.696_top_two_chr20.2244_top_two_chr20.314_top_two_chr21.140_top_two_chr2.3257_top_two_chr3.1708_top_two_chr3.1759_top_two_chr3.3203_top_two_chr3.389_top_two_chr3.5562_top_two_chr3.5668_top_two_chr3.945_top_two_chr4.3624_top_two_chr5.135_top_two_chr6.3123_second_two_chr6.477_top_two_chr6.503_top_two_chr7.3057_top_two_chr7.4042_top_two_chr7.5369_top_two_chr8.1657_top_two_chrX.276_top_two_LMO2_TSS_TIMM17B_TSS,chr10.185_top_two_chr10.484_top_two_chr11.4167_top_two_chr12.528_top_two_chr13.465_top_two_chr14.2471_top_two_chr15.3685_top_two_chr15.36_top_two_chr1.640_top_two_chr18.1335_top_two_chr18.1476_top_two_chr18.300_second_two_chr1.8584_top_two_chr18.696_top_two_chr20.2244_top_two_chr20.314_top_two_chr21.140_top_two_chr2.3257_top_two_chr3.1708_top_two_chr3.1759_top_two_chr3.3203_top_two_chr3.389_top_two_chr3.5562_top_two_chr3.5668_top_two_chr3.945_top_two_chr4.3624_top_two_chr5.135_top_two_chr6.3123_second_two_chr6.477_top_two_chr6.503_top_two_chr7.3057_top_two_chr7.4042_top_two_chr7.5369_top_two_chr8.1657_top_two_chrX.276_top_two_LMO2_TSS_TIMM17B_TSS,ATAAGGCACTCACATCCACC_GCTTGTCCCTAACACTCAGA_GGGCGAAAAGATGATGAAAG_GACCTTATATGTAGAATCCA_GAGTGGGGACTCAGGATGCA_GGGCTGTTGAGCTGCTGCTC_ACACGGCCCTGGACACAGCG_CTGCCATCATAGATAAGGTG_CCACCTTGCCCCTCAAGCAG_AGCTCGATGAGTGAAAACAG_GTCCTGTTCTGAAATCCAGG_GTTGCGTGTATAAATTTCAG_AAATTCCCAGAAGTGCTGGG_GGTCCATGAGCCACCCAGCG_TCAAGCTGAAACAGAAACTG_GTAACCGCCTTCTCCCCACG_GGGGGTGTGGATTTGAAGTG_AAGATAACACCTTCTGTGTT_AGATGAGTGAGGCATTCACC_GCAAGTAGGGAAATCCCCCG_GCACATTGCAGCGGAAACTA_CAGATGGCCGACTTGTACCA_TGCTCCCCACAAACAATGCG_TGTATTTGACTCTAGAACCA_AAATAAATATCACCCCTAGG_GGTCCTGGTTCCTAGGGTAA_AGGCAGCAGACAGAGCCCCG_GTGGGGACCCACATGTACAG_GGGAAGGTAGTGGGATTACA_GGCCTAAGTTAATTGGGCCG_GTTCATGTGCCAATTCCCTA_TTATTCCCACATGTCCACAG_GGAGGAATCATCCTACCGGG_GCTGCAACTGGATGACACAG_GCTAGGCCTCCATATTCCTA_CCCGGGCCGAAGGTGCGAGG_GGAGTACGCTCGGGAGCCCG,14831,1054,0.9599271,37,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.007991318
6,1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,9751,0.9488434,ACYP1_TSS_chr10.3313_top_two_chr10.3358_top_two_chr10.3437_top_two_chr10.3635_top_two_chr10.483_second_two_chr10.791_top_two_chr1.11140_top_two_chr1.11689_top_two_chr11.4348_second_two_chr12.1439_top_two_chr12.1559_top_two_chr12.1615_top_two_chr12.4908_top_two_chr1.2734_top_two_chr16.4012_top_two_chr1.663_top_two_chr1.7021_top_two_chr17.2361_top_two_chr17.5256_top_two_chr18.1266_top_two_chr18.246_top_two_chr18.261_top_two_chr19.2424_top_two_chr20.2590_top_two_chr20.652_top_two_chr2.2482_top_two_chr2.3688_top_two_chr2.4230_top_two_chr3.1355_top_two_chr3.1708_top_two_chr3.2715_top_two_chr3.3796_top_two_chr3.389_top_two_chr3.4555_top_two_chr4.2649_top_two_chr5.1388_top_two_chr5.1884_second_two_chr5.5186_top_two_chr6.1180_top_two_chr6.4102_top_two_chr6.5154_top_two_chr7.1855_top_two_chr7.4045_second_two_chr7.4787_top_two_chr7.4904_top_two_chr7.628_top_two_chr8.114_top_two_chr8.2377_top_two_chr9.1301_top_two_chr9.1736_top_two_chrX.2204_top_two_chrX.633_top_two_chrX.952_top_two_NARS_TSS_scrambled_7,ACYP1_TSS_chr10.3313_top_two_chr10.3358_top_two_chr10.3437_top_two_chr10.3635_top_two_chr10.483_second_two_chr10.791_top_two_chr1.11140_top_two_chr1.11689_top_two_chr11.4348_second_two_chr12.1439_top_two_chr12.1559_top_two_chr12.1615_top_two_chr12.4908_top_two_chr1.2734_top_two_chr16.4012_top_two_chr1.663_top_two_chr1.7021_top_two_chr17.2361_top_two_chr17.5256_top_two_chr18.1266_top_two_chr18.246_top_two_chr18.261_top_two_chr19.2424_top_two_chr20.2590_top_two_chr20.652_top_two_chr2.2482_top_two_chr2.3688_top_two_chr2.4230_top_two_chr3.1355_top_two_chr3.1708_top_two_chr3.2715_top_two_chr3.3796_top_two_chr3.389_top_two_chr3.4555_top_two_chr4.2649_top_two_chr5.1388_top_two_chr5.1884_second_two_chr5.5186_top_two_chr6.1180_top_two_chr6.4102_top_two_chr6.5154_top_two_chr7.1855_top_two_chr7.4045_second_two_chr7.4787_top_two_chr7.4904_top_two_chr7.628_top_two_chr8.114_top_two_chr8.2377_top_two_chr9.1301_top_two_chr9.1736_top_two_chrX.2204_top_two_chrX.633_top_two_chrX.952_top_two_NARS_TSS_scrambled_7,TCCGGGACCACCACGCCAAG_CCTCAGTTCCTCATTACGCG_CCATGCTTGTATCCACAACG_GTGCTTCTCAAATGCCCACG_GCTGGGGTAAGGCCACCGTG_TCTTAGAGAAAGCTAGCATA_AAAGTTTGTTGAAAGTGGCG_CTAGGTGGTGTGGGGAGTGA_GCTCCCACACGCAAACCGAA_GACACCCTCTCAGAACAGAG_TAACCTTGCACCACTCACTG_GCATCCCTAGACCCTAGCGG_CTGGGTCTACAGTTTCTCCG_AAAAGACGTCAAGCAGTCGG_ATAAAGGCGGAGGCTCTGGG_GATAGAGACCCTATACCCGG_ATATCAAAGCGATTACCCGG_CTGCAGCTATCCAAAGAAGT_GCAGTCGTGACACAAAGTGA_CTTATCTCAGAGCCACAGCT_GCTGCTGGAATAGGGCCAGA_CTAGCAAGCTCCCATGTCCC_TGCAGACAACATGGCGCATC_ACAGAGGGAGGACCACAGAG_TTAAAAAGAGAAACAGGCTG_TGAGTCACTGAACTGGGCCA_TGCCACCCTGAGCTTCTGAG_ACAGATAAGTGAAAATGCAG_GGGCCACCACAGTAGCAGAG_CCTGGAACTCAGGCTGCAAC_AGATGAGTGAGGCATTCACC_GGTGAGAGCCAAGCGTCGGA_AAGGACTTATAATTACAAAG_GCAGGGATCCTGCAGAGAGG_AAGTTCCTGCTTGACACCCA_GGCAGTTCAGTCTCACAAAG_AGCTGGAACACAGCACCCAA_TGAAAGCCAACAGCGCAGAG_GGAATCCAGATCCCGAGCAA_TCTTGCTATATCTTGCCGGA_ACAGTGAGTCAGCTTCTGAG_TTTAGAGTCCGACAGCTTGG_AGTCAGACCAGTACACCCAG_CTCTCCTGCTGGTAGAGCCA_AATGGGTCTGAAAAATAGAG_CTAGTACAGATAACAACCAA_ACTCTGCCACAACAAAACAG_GATCTAACACGGAGATCCGG_GAGATAATGGGGAATCCAAG_TTTCTCCCAGACCGGTACCG_AATGCTCTCAGGCAGACACG_TCAGTCTTTGAAAAACAGGA_CTACTGCAGTAACAAAACCA_ACGTGCTGACGGACAGGCAT_AGCAAGAGCTGGAAACCCCG_GTGGCCCTGGTCACCTCCAG_ACCGCTGCCCTAACCACTGG,17018,1241,0.9725705,57,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.022356681


In [4]:
# save covariates that were used in previously published model
covariates <- covariates[, c('cell', 'guide_count', 'prep_batch', 'percent.mito')]
head(covariates)

Unnamed: 0_level_0,cell,guide_count,prep_batch,percent.mito
Unnamed: 0_level_1,<chr>,<int>,<chr>,<dbl>
1,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,67,prep_batch_1,0.058786706
2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,26,prep_batch_1,0.036086518
3,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,61,prep_batch_1,0.069823051
4,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,39,prep_batch_1,0.026186508
5,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,37,prep_batch_1,0.007991318
6,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,57,prep_batch_1,0.022356681


In [5]:
# read in cell cycle scores
s.scores <- read.csv('/iblm/netapp/home/karthik/crisprQTL/gasperini_data/s_scores.csv')
g2m.scores <- read.csv('/iblm/netapp/home/karthik/crisprQTL/gasperini_data/g2m_scores.csv')
colnames(s.scores) <- c('cell', 's.score')
colnames(g2m.scores) <- c('cell', 'g2m.score')
print(head(s.scores))
print(head(g2m.scores))

                              cell      s.score
1 AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2  0.110732311
2 AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2 -0.010290919
3 AAACCTGCAAACAACA-1_1A_1_SI-GA-E2 -0.175860130
4 AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2  0.003057281
5 AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2 -0.144480961
6 AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2  0.026418076
                              cell  g2m.score
1 AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2 -0.1319208
2 AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2 -0.1535426
3 AAACCTGCAAACAACA-1_1A_1_SI-GA-E2 -0.3084879
4 AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2 -0.1574859
5 AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2 -0.2362154
6 AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2 -0.1462899


In [6]:
covariates <- merge(covariates, s.scores, by = 'cell')
covariates <- merge(covariates, g2m.scores, by = 'cell')
head(covariates)

Unnamed: 0_level_0,cell,guide_count,prep_batch,percent.mito,s.score,g2m.score
Unnamed: 0_level_1,<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>
1,AAACCTGAGAACTGTA-1_2A_4_SI-GA-G5,24.0,prep_batch_2,0.05442845,0.0570467,0.002865079
2,AAACCTGAGAAGAAGC-1_2B_5_SI-GA-H6,30.0,prep_batch_2,0.06170906,-0.05632492,0.225004169
3,AAACCTGAGAAGGTTT-1_2A_7_SI-GA-G8,18.0,prep_batch_2,0.06625193,-0.17070798,0.167475723
4,AAACCTGAGAATAGGG-1_2B_6_SI-GA-H7,34.0,prep_batch_2,0.01928651,-0.10711479,1.108284663
5,AAACCTGAGAATTCCC-1_1B_2_SI-GA-F3,13.0,prep_batch_1,0.03646176,-0.15393309,-0.145533466
6,AAACCTGAGACACTAA-1_1B_5_SI-GA-F6,,prep_batch_1,0.03699082,-0.05339071,-0.188043139


In [7]:
h5write(covariates, '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'covariates')

In [8]:
h5ls('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5')

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,covariates,H5I_DATASET,COMPOUND,207324


In [9]:
covariates <- h5read('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'covariates')
head(covariates)

Unnamed: 0_level_0,cell,guide_count,prep_batch,percent.mito,s.score,g2m.score
Unnamed: 0_level_1,<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>
1,AAACCTGAGAACTGTA-1_2A_4_SI-GA-G5,24.0,prep_batch_2,0.05442845,0.0570467,0.002865079
2,AAACCTGAGAAGAAGC-1_2B_5_SI-GA-H6,30.0,prep_batch_2,0.06170906,-0.05632492,0.225004169
3,AAACCTGAGAAGGTTT-1_2A_7_SI-GA-G8,18.0,prep_batch_2,0.06625193,-0.17070798,0.167475723
4,AAACCTGAGAATAGGG-1_2B_6_SI-GA-H7,34.0,prep_batch_2,0.01928651,-0.10711479,1.108284663
5,AAACCTGAGAATTCCC-1_1B_2_SI-GA-F3,13.0,prep_batch_1,0.03646176,-0.15393309,-0.145533466
6,AAACCTGAGACACTAA-1_1B_5_SI-GA-F6,,prep_batch_1,0.03699082,-0.05339071,-0.188043139


## Read in gene counts matrix and write to h5

In [11]:
gene.counts <- readMM('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.exprs.mtx')

In [12]:
# read in gene names
genes <- read.table('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.genes.txt')$V1
rownames(gene.counts) <- genes

In [13]:
# read in cell names
cells <- read.table('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.cells.txt')$V1
colnames(gene.counts) <- cells

In [14]:
gene.counts <- as.matrix(gene.counts)

In [15]:
h5write(gene.counts, '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'gene.counts')

Current chunk settings will exceed HDF5's 4GB limit.
Automatically adjusting them to: 13135 x 23170
You may wish to set these to more appropriate values using the 'chunk' argument.



In [16]:
# write cell and gene names to h5 (since they are not automatically written)
h5write(cells, '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'cell.barcodes')
h5write(genes, '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'gene.names')

In [17]:
h5ls('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5')

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,cell.barcodes,H5I_DATASET,STRING,207324
1,/,covariates,H5I_DATASET,COMPOUND,207324
2,/,gene.counts,H5I_DATASET,FLOAT,13135 x 207324
3,/,gene.names,H5I_DATASET,STRING,13135


In [31]:
counts <- h5read('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'gene.counts')

In [32]:
genes <- h5read('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'gene.names')
rownames(counts) <- genes

In [39]:
test.count <- counts['ENSG00000237094', ]
test.count

In [40]:
cells <- h5read('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'cell.barcodes')
test.df <- cbind(cells, test.count)
head(test.df)

cells,test.count
AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,0
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,0
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,0
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,1
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,0
AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,0


## Read in guide efficiency values and write to h5

In [46]:
# read in GuideScan output
guidescan.guide.info <- read.csv('/iblm/netapp/home/karthik/GuideScan/Gasperini2019/guidescan_output.csv')
head(guidescan.guide.info)

Unnamed: 0_level_0,Index,gRNA,Chromosome,Start,End,Strand,Num.Off.Targets,Off.Target.Summary,Specificity,Cutting.Efficiency
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>
1,0,CTAAAGCATTGGCTGAGAAGNGG,chr8,23911081,23911103,-,41,2:2 | 3:39,0.136228,0.56501
2,1,GTAGTTCACATAATCCCTGTNGG,chr4,25698193,25698215,-,55,2:0 | 3:55,0.165929,0.572492
3,2,AAGTTGACTCTACATAGCAGNGG,chr8,23912565,23912587,+,22,2:1 | 3:21,0.341067,0.636691
4,3,AATATTCTCCCTCATTCTGGNGG,chr5,12539360,12539382,-,803,2:26 | 3:777,0.00274364,0.6198
5,4,AATCCTCTAATGGACGAAGANGG,chr8,23913057,23913079,-,24,2:0 | 3:24,0.334415,0.602272
6,5,AGATACCTATGGCCATATAGNGG,chr5,12540099,12540121,+,14,2:0 | 3:14,0.351723,0.531946


In [47]:
h5write(guidescan.guide.info, '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'guidescan.output')

In [48]:
h5ls('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5')

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,cell.barcodes,H5I_DATASET,STRING,207324
1,/,covariates,H5I_DATASET,COMPOUND,207324
2,/,gene.counts,H5I_DATASET,FLOAT,13135 x 207324
3,/,gene.names,H5I_DATASET,STRING,13135
4,/,guidescan.output,H5I_DATASET,COMPOUND,13189


In [51]:
guidescan.output <- h5read('/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5', 'guidescan.output')
head(guidescan.output)

Unnamed: 0_level_0,Index,gRNA,Chromosome,Start,End,Strand,Num.Off.Targets,Off.Target.Summary,Specificity,Cutting.Efficiency
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>
1,0,CTAAAGCATTGGCTGAGAAGNGG,chr8,23911081,23911103,-,41,2:2 | 3:39,0.136228,0.56501
2,1,GTAGTTCACATAATCCCTGTNGG,chr4,25698193,25698215,-,55,2:0 | 3:55,0.165929,0.572492
3,2,AAGTTGACTCTACATAGCAGNGG,chr8,23912565,23912587,+,22,2:1 | 3:21,0.341067,0.636691
4,3,AATATTCTCCCTCATTCTGGNGG,chr5,12539360,12539382,-,803,2:26 | 3:777,0.00274364,0.6198
5,4,AATCCTCTAATGGACGAAGANGG,chr8,23913057,23913079,-,24,2:0 | 3:24,0.334415,0.602272
6,5,AGATACCTATGGCCATATAGNGG,chr5,12540099,12540121,+,14,2:0 | 3:14,0.351723,0.531946


## Read in cell-target site matrix and write to h5