# Explorando datasets ANNOVAR

In [1]:
import pyspark

In [2]:
# Reuse the SparkContext that is already running when this cell is executed
# again; constructing a second SparkContext in the same kernel raises an error.
# On a fresh kernel this still starts a local context using all cores.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [3]:
# Obtain the session through the class-level builder. getOrCreate() attaches to
# the SparkContext that is already active, so there is no need to instantiate
# SparkSession(sc) only to reach `.builder`.
spark = pyspark.sql.SparkSession.builder.appName("Explore ANNOVAR datasets").getOrCreate()

In [4]:
# Load the raw hg19_mcap text file as an RDD with one element per line.
hg19_mcap = sc.textFile(name='../data/hg19_mcap.txt')

In [5]:
def _mcap_line_to_csv(line):
    """Convert one tab-separated hg19_mcap line into a ';'-separated line.

    A line starting with '#' is replaced by the fixed column header below.
    """
    if line.startswith("#"):
        return ";".join(["chromosome", "start", "end", "ref", "alt", "mcap"])
    return ";".join(line.split("\t"))


hg19_mcap_csv = hg19_mcap.map(_mcap_line_to_csv)

In [6]:
# Preview the first ten converted lines (header first, then data rows).
hg19_mcap_csv.take(num=10)

['chromosome;start;end;ref;alt;mcap',
 '1;69091;69091;A;T;0.00708247797993',
 '1;69091;69091;A;C;0.00708247797993',
 '1;69091;69091;A;G;0.00996425211598',
 '1;69092;69092;T;A;0.00369310043823',
 '1;69092;69092;T;C;0.00326415452909',
 '1;69092;69092;T;G;0.00369358636892',
 '1;69093;69093;G;A;0.00145301983733',
 '1;69093;69093;G;T;0.00145301983733',
 '1;69093;69093;G;C;0.00144694747062']

In [7]:
# Write the converted lines as a single part file, matching the coalesce(1)
# step used for the other datasets in this notebook. The output is read back
# with header=True, and Spark's CSV reader skips the first line of every part
# file, so a multi-part output would silently lose one data row per extra part.
# NOTE(review): saveAsTextFile fails if the target directory already exists;
# remove '../data/hg19_mcap.csv' before re-running this cell.
hg19_mcap_csv.coalesce(1).saveAsTextFile('../data/hg19_mcap.csv')

In [8]:
# Read the ';'-separated output back as a DataFrame, taking column names from
# the first line.
# NOTE(review): the variable is spelled `hg19_map_df` (not `mcap`); kept as-is
# because the next cell refers to it by this name.
hg19_map_df = spark.read.csv(
    path='../data/hg19_mcap.csv',
    sep=";",
    header=True,
)

In [9]:
# Print the leading rows of the mcap DataFrame as a text table.
hg19_map_df.show(n=20)

+----------+-----+-----+---+---+----------------+
|chromosome|start|  end|ref|alt|            mcap|
+----------+-----+-----+---+---+----------------+
|         1|69091|69091|  A|  T|0.00708247797993|
|         1|69091|69091|  A|  C|0.00708247797993|
|         1|69091|69091|  A|  G|0.00996425211598|
|         1|69092|69092|  T|  A|0.00369310043823|
|         1|69092|69092|  T|  C|0.00326415452909|
|         1|69092|69092|  T|  G|0.00369358636892|
|         1|69093|69093|  G|  A|0.00145301983733|
|         1|69093|69093|  G|  T|0.00145301983733|
|         1|69093|69093|  G|  C|0.00144694747062|
|         1|69094|69094|  G|  A|0.00504677681045|
|         1|69094|69094|  G|  T|0.00512526482336|
|         1|69094|69094|  G|  C|0.00512526482336|
|         1|69095|69095|  T|  A|0.00600421272717|
|         1|69095|69095|  T|  C| 0.0047045259721|
|         1|69095|69095|  T|  G| 0.0063892150463|
|         1|69097|69097|  A|  T|0.00301640918768|
|         1|69097|69097|  A|  C|0.00640998582146|


In [10]:
# Load the raw hg19_abraom text file as an RDD with one element per line.
hg19_abraom = sc.textFile(name="../data/hg19_abraom.txt")

In [11]:
def _abraom_line_to_csv(line):
    """Convert one tab-separated hg19_abraom line into a ';'-separated line.

    A line starting with '#' is replaced by the fixed column header below.
    """
    if line.startswith("#"):
        return ";".join(["chromosome", "start", "end", "ref", "alt", "abraom_freq",
                         "abraom_filter", "abraom_cegh_filter"])
    return ";".join(line.split("\t"))


hg19_abraom_csv = hg19_abraom.map(_abraom_line_to_csv)

In [12]:
# Preview the first ten converted lines (header first, then data rows).
hg19_abraom_csv.take(num=10)

['chromosome;start;end;ref;alt;abraom_freq;abraom_filter;abraom_cegh_filter',
 '1;13116;13116;T;G;0.010033;LowQual;FDP',
 '1;13244;13244;G;A;0.002193;LowQual;FAB',
 '1;13248;13248;C;G;0.004149;VQSRTrancheSNP99.00to99.90;FAB',
 '1;13273;13273;G;C;0.113333;VQSRTrancheSNP99.00to99.90;WK-LowCall',
 '1;13302;13302;C;T;0.017143;VQSRTrancheSNP99.00to99.90;WK-LowCall',
 '1;13380;13380;C;G;0.007782;VQSRTrancheSNP99.00to99.90;FAB',
 '1;13417;13417;-;GAGA;0.075145;PASS;WK-LowCall',
 '1;13418;13418;G;A;0.088398;VQSRTrancheSNP99.00to99.90;FAB',
 '1;13479;13479;A;T;0.003968;LowQual;FAB']

In [13]:
# Reduce the converted RDD to one partition before the next cell writes it out.
hg19_abraom_csv_ordered = hg19_abraom_csv.coalesce(numPartitions=1)

In [14]:
# Persist the single-partition RDD as text under ../data/hg19_abraom.csv.
hg19_abraom_csv_ordered.saveAsTextFile(path="../data/hg19_abraom.csv")

In [15]:
# Read the ';'-separated output back as a DataFrame, taking column names from
# the first line.
hg19_abraom_df = spark.read.csv(
    path="../data/hg19_abraom.csv",
    sep=";",
    header=True,
)

In [16]:
# Print the leading rows of the abraom DataFrame as a text table.
hg19_abraom_df.show(n=20)

+----------+-----+-----+---+----+-----------+--------------------+------------------+
|chromosome|start|  end|ref| alt|abraom_freq|       abraom_filter|abraom_cegh_filter|
+----------+-----+-----+---+----+-----------+--------------------+------------------+
|         1|13116|13116|  T|   G|   0.010033|             LowQual|               FDP|
|         1|13244|13244|  G|   A|   0.002193|             LowQual|               FAB|
|         1|13248|13248|  C|   G|   0.004149|VQSRTrancheSNP99....|               FAB|
|         1|13273|13273|  G|   C|   0.113333|VQSRTrancheSNP99....|        WK-LowCall|
|         1|13302|13302|  C|   T|   0.017143|VQSRTrancheSNP99....|        WK-LowCall|
|         1|13380|13380|  C|   G|   0.007782|VQSRTrancheSNP99....|               FAB|
|         1|13417|13417|  -|GAGA|   0.075145|                PASS|        WK-LowCall|
|         1|13418|13418|  G|   A|   0.088398|VQSRTrancheSNP99....|               FAB|
|         1|13479|13479|  A|   T|   0.003968|         

In [17]:
# Load the raw hg19_clinvar_20170905 text file as an RDD with one element per line.
hg19_clinvar = sc.textFile(name="../data/hg19_clinvar_20170905.txt")

In [20]:
def _clinvar_line_to_csv(line):
    """Convert one tab-separated hg19_clinvar line into a ';'-separated line.

    A line starting with '#' is replaced by the fixed column header below.
    """
    if line.startswith("#"):
        return ";".join(["chromosome", "start", "end", "ref", "alt", "clinsig",
                         "clndbn", "clnacc", "clndsdb", "clndsdbid"])
    return ";".join(line.split("\t"))


hg19_clinvar_csv = hg19_clinvar.map(_clinvar_line_to_csv)

In [21]:
# Preview the first ten converted lines (header first, then data rows).
hg19_clinvar_csv.take(num=10)

['chromosome;start;end;ref;alt;clinsig;clndbn;clnacc;clndsdb;clndsdbid',
 '1;1;1;0;0;Pathogenic;Hereditary_cancer-predisposing_syndrome;RCV000492594.1;MedGen:SNOMED_CT;C0027672:699346009',
 '1;1;1;0;0;Pathogenic;not_provided;RCV000490008.1;MedGen;CN221809',
 '1;1;1;0;0;Uncertain significance;not_specified;RCV000489770.1;MedGen;CN169374',
 '1;949523;949523;C;T;Pathogenic;Immunodeficiency_38_with_basal_ganglia_calcification;RCV000162196.3;MedGen:OMIM:Orphanet;C4015293:616126:ORPHA319563',
 '1;949608;949608;G;A;Benign;not_specified;RCV000455759.1;MedGen;CN169374',
 '1;949696;949696;-;G;Pathogenic;Immunodeficiency_38_with_basal_ganglia_calcification;RCV000148989.5;MedGen:OMIM:Orphanet;C4015293:616126:ORPHA319563',
 '1;949739;949739;G;T;Pathogenic;Immunodeficiency_38_with_basal_ganglia_calcification;RCV000148988.5;MedGen:OMIM:Orphanet;C4015293:616126:ORPHA319563',
 '1;955563;955563;G;C;Likely benign;not_specified;RCV000424799.1;MedGen;CN169374',
 '1;955596;955596;C;G;Uncertain significance;

In [22]:
# Write the converted lines as a single part file, matching the coalesce(1)
# step used for the other datasets in this notebook. The output is read back
# with header=True, and Spark's CSV reader skips the first line of every part
# file, so a multi-part output would silently lose one data row per extra part.
# NOTE(review): saveAsTextFile fails if the target directory already exists;
# remove "../data/hg19_clinvar.csv" before re-running this cell.
hg19_clinvar_csv.coalesce(1).saveAsTextFile("../data/hg19_clinvar.csv")

In [23]:
# Read the ';'-separated output back as a DataFrame, taking column names from
# the first line.
hg19_clinvar_df = spark.read.csv(
    path="../data/hg19_clinvar.csv",
    sep=";",
    header=True,
)

In [24]:
# Print the leading rows of the clinvar DataFrame as a text table.
hg19_clinvar_df.show(n=20)

+----------+------+------+---+---+--------------------+--------------------+--------------+--------------------+--------------------+
|chromosome| start|   end|ref|alt|             clinsig|              clndbn|        clnacc|             clndsdb|           clndsdbid|
+----------+------+------+---+---+--------------------+--------------------+--------------+--------------------+--------------------+
|         1|     1|     1|  0|  0|          Pathogenic|Hereditary_cancer...|RCV000492594.1|    MedGen:SNOMED_CT|  C0027672:699346009|
|         1|     1|     1|  0|  0|          Pathogenic|        not_provided|RCV000490008.1|              MedGen|            CN221809|
|         1|     1|     1|  0|  0|Uncertain signifi...|       not_specified|RCV000489770.1|              MedGen|            CN169374|
|         1|949523|949523|  C|  T|          Pathogenic|Immunodeficiency_...|RCV000162196.3|MedGen:OMIM:Orphanet|C4015293:616126:O...|
|         1|949608|949608|  G|  A|              Benign|       

In [25]:
# Load the raw hg19_gnomad_exome text file as an RDD with one element per line.
hg19_gnomad_exome = sc.textFile(name="../data/hg19_gnomad_exome.txt")

In [33]:
# Inspect the first ten raw (still tab-separated) lines, including the '#Chr' header.
hg19_gnomad_exome.take(num=10)

['#Chr\tStart\tEnd\tRef\tAlt\tgnomAD_exome_ALL\tgnomAD_exome_AFR\tgnomAD_exome_AMR\tgnomAD_exome_ASJ\tgnomAD_exome_EAS\tgnomAD_exome_FIN\tgnomAD_exome_NFE\tgnomAD_exome_OTH\tgnomAD_exome_SAS',
 '1\t12198\t12198\tG\tC\t.\t.\t.\t.\t.\t.\t.\t.\t.',
 '1\t12237\t12237\tG\tA\t.\t.\t.\t.\t.\t.\t.\t.\t.',
 '1\t12259\t12259\tG\tC\t0\t.\t.\t.\t.\t.\t.\t.\t0',
 '1\t12266\t12266\tG\tA\t.\t.\t.\t.\t.\t.\t.\t.\t.',
 '1\t12272\t12272\tG\tA\t0\t.\t0\t.\t.\t.\t.\t.\t.',
 '1\t12554\t12554\tA\tG\t0\t0\t0\t0\t0\t0\t0\t0\t0',
 '1\t12559\t12559\tG\tA\t0.0051\t0\t0.0054\t0\t0.0027\t0\t0.0018\t0.0122\t0.0099',
 '1\t12573\t12573\tT\tC\t0.0005\t0\t0.0028\t0\t0\t0\t0\t0\t0',
 '1\t12586\t12586\tC\tT\t0.0006\t0\t0\t0\t0.0023\t0\t0\t0\t0.0009']

In [36]:
def _gnomad_exome_line_to_csv(line):
    """Convert one tab-separated hg19_gnomad_exome line into a ';'-separated line.

    A line starting with '#' (the original '#Chr ...' header) is replaced by
    the fixed column header below.
    """
    if line.startswith("#"):
        return ";".join(["chromosome", "start", "end", "ref", "alt",
                         "gnomad_exome_all", "gnomad_exome_afr",
                         "gnomad_exome_amr", "gnomad_exome_asj",
                         "gnomad_exome_eas", "gnomad_exome_fin",
                         "gnomad_exome_nfe", "gnomad_exome_oth",
                         "gnomad_exome_sas"])
    return ";".join(line.split("\t"))


hg19_gnomad_exome_csv = hg19_gnomad_exome.map(_gnomad_exome_line_to_csv)

In [37]:
# Preview the first ten converted lines (header first, then data rows).
hg19_gnomad_exome_csv.take(num=10)

['chromosome;start;end;ref;alt;gnomad_exome_all;gnomad_exome_afr;gnomad_exome_amr;gnomad_exome_asj;gnomad_exome_eas;gnomad_exome_fin;gnomad_exome_nfe;gnomad_exome_oth;gnomad_exome_sas',
 '1;12198;12198;G;C;.;.;.;.;.;.;.;.;.',
 '1;12237;12237;G;A;.;.;.;.;.;.;.;.;.',
 '1;12259;12259;G;C;0;.;.;.;.;.;.;.;0',
 '1;12266;12266;G;A;.;.;.;.;.;.;.;.;.',
 '1;12272;12272;G;A;0;.;0;.;.;.;.;.;.',
 '1;12554;12554;A;G;0;0;0;0;0;0;0;0;0',
 '1;12559;12559;G;A;0.0051;0;0.0054;0;0.0027;0;0.0018;0.0122;0.0099',
 '1;12573;12573;T;C;0.0005;0;0.0028;0;0;0;0;0;0',
 '1;12586;12586;C;T;0.0006;0;0;0;0.0023;0;0;0;0.0009']

In [44]:
# Reduce the converted RDD to one partition before the next cell writes it out.
# The name is rebound to the coalesced RDD because the save cell uses it.
hg19_gnomad_exome_csv = hg19_gnomad_exome_csv.coalesce(numPartitions=1)

In [45]:
# Persist the converted RDD as text under ../data/hg19_gnomad_exome.csv.
hg19_gnomad_exome_csv.saveAsTextFile(path="../data/hg19_gnomad_exome.csv")

In [46]:
# Read the ';'-separated output back as a DataFrame, taking column names from
# the first line.
hg19_gnomad_exome_df = spark.read.csv(
    path="../data/hg19_gnomad_exome.csv",
    sep=";",
    header=True,
)

In [47]:
# Print the leading rows of the gnomAD exome DataFrame as a text table.
hg19_gnomad_exome_df.show(n=20)

+----------+-----+-----+---+---+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|chromosome|start|  end|ref|alt|gnomad_exome_all|gnomad_exome_afr|gnomad_exome_amr|gnomad_exome_asj|gnomad_exome_eas|gnomad_exome_fin|gnomad_exome_nfe|gnomad_exome_oth|gnomad_exome_sas|
+----------+-----+-----+---+---+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|         1|12198|12198|  G|  C|               .|               .|               .|               .|               .|               .|               .|               .|               .|
|         1|12237|12237|  G|  A|               .|               .|               .|               .|               .|               .|               .|               .|               .|
|         1|12259|12259|  G|  C|               0|               .|    

In [48]:
# Load the raw hg19_revel text file as an RDD with one element per line.
hg19_revel = sc.textFile(name="../data/hg19_revel.txt")

In [49]:
def _revel_line_to_csv(line):
    """Convert one tab-separated hg19_revel line into a ';'-separated line.

    A line starting with '#' is replaced by the fixed column header below.
    """
    if line.startswith("#"):
        return ";".join(["chromosome", "start", "end", "ref", "alt", "revel"])
    return ";".join(line.split("\t"))


hg19_revel_csv = hg19_revel.map(_revel_line_to_csv)

In [50]:
# Preview the first ten converted lines (header first, then data rows).
hg19_revel_csv.take(num=10)

['chromosome;start;end;ref;alt;revel',
 '1;35142;35142;G;A;0.027',
 '1;35142;35142;G;C;0.035',
 '1;35142;35142;G;T;0.043',
 '1;35143;35143;T;A;0.018',
 '1;35143;35143;T;C;0.034',
 '1;35143;35143;T;G;0.039',
 '1;35144;35144;A;C;0.012',
 '1;35145;35145;C;A;0.023',
 '1;35145;35145;C;G;0.029']

In [51]:
# Reduce the converted RDD to one partition before the next cell writes it out.
# The name is rebound to the coalesced RDD because the save cell uses it.
hg19_revel_csv = hg19_revel_csv.coalesce(numPartitions=1)

In [52]:
# Persist the converted RDD as text under ../data/hg19_revel.csv.
hg19_revel_csv.saveAsTextFile(path="../data/hg19_revel.csv")

In [53]:
# Read the ';'-separated output back as a DataFrame, taking column names from
# the first line.
hg19_revel_df = spark.read.csv(
    path="../data/hg19_revel.csv",
    sep=";",
    header=True,
)

In [54]:
# Print the top 20 rows of the revel DataFrame as a text table.
hg19_revel_df.show(n=20)

+----------+-----+-----+---+---+-----+
|chromosome|start|  end|ref|alt|revel|
+----------+-----+-----+---+---+-----+
|         1|35142|35142|  G|  A|0.027|
|         1|35142|35142|  G|  C|0.035|
|         1|35142|35142|  G|  T|0.043|
|         1|35143|35143|  T|  A|0.018|
|         1|35143|35143|  T|  C|0.034|
|         1|35143|35143|  T|  G|0.039|
|         1|35144|35144|  A|  C|0.012|
|         1|35145|35145|  C|  A|0.023|
|         1|35145|35145|  C|  G|0.029|
|         1|35145|35145|  C|  T|0.016|
|         1|35146|35146|  A|  C|0.031|
|         1|35146|35146|  A|  G|0.016|
|         1|35146|35146|  A|  T|0.025|
|         1|35147|35147|  T|  A|0.004|
|         1|35147|35147|  T|  G|0.004|
|         1|35148|35148|  A|  G|0.010|
|         1|35149|35149|  A|  C|0.029|
|         1|35149|35149|  A|  T|0.022|
|         1|35150|35150|  T|  A|0.038|
|         1|35150|35150|  T|  G|0.055|
+----------+-----+-----+---+---+-----+
only showing top 20 rows

