In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

## Set up Spark
# SparkContext.getOrCreate() returns the already-running context if there is
# one, so re-running this cell (or running it in a kernel that already has a
# context) does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

## Data paths
# Root directory of the Open Targets platform datasets; the reads in later
# cells append a dataset name to this prefix, so keep the trailing slash.
ot_platform = "/usr/local/hdd3/mikg/opentargets/"


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/05 09:34:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
## Read Open Targets platform data
# Each table is read from its own Parquet directory under ot_platform and a
# few columns are renamed so the names are unambiguous across tables.
# NOTE: `header` is a CSV reader option and has no meaning for Parquet input,
# so it is not passed to read.parquet().
diseases = (spark.read.parquet(ot_platform+"diseases/")
            .withColumnRenamed("id","diseaseId")
            .withColumnRenamed("name","diseaseName")
            )
targets = (spark.read.parquet(ot_platform+"targets/")
           .withColumnRenamed("id","targetId")
           .withColumnRenamed("approvedSymbol", "targetSymbol")
           .withColumnRenamed("approvedName","targetName")
           )
evidence = spark.read.parquet(ot_platform+"evidence")
knowndrugs = (spark.read.parquet(ot_platform+"knownDrugsAggregated")
              .withColumnRenamed("approvedSymbol", "targetGeneSymbol")
              .withColumnRenamed("approvedName","targetGeneName")
              .withColumnRenamed("label","diseaseName")
              )
overall_associations = spark.read.parquet(ot_platform+"associationByOverallIndirect")

In [3]:
## Load the associationByDatatypeDirect table and inspect its columns
associations = spark.read.parquet(
    f"{ot_platform}associationByDatatypeDirect/"
)
associations.printSchema()

root
 |-- targetId: string (nullable = true)
 |-- diseaseId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- datatypeHarmonicScore: double (nullable = true)
 |-- datatypeEvidenceCount: long (nullable = true)
 |-- diseaseLabel: string (nullable = true)
 |-- targetName: string (nullable = true)
 |-- targetSymbol: string (nullable = true)



In [4]:
# Preview the first five association rows.
associations.show(n=5)

+---------------+-----------+--------------+---------------------+---------------------+----------------+--------------------+------------+
|       targetId|  diseaseId|    datatypeId|datatypeHarmonicScore|datatypeEvidenceCount|    diseaseLabel|          targetName|targetSymbol|
+---------------+-----------+--------------+---------------------+---------------------+----------------+--------------------+------------+
|ENSG00000000003|EFO_0000305|rna_expression| 0.017416968888271284|                    1|breast carcinoma|       tetraspanin 6|      TSPAN6|
|ENSG00000000003|EFO_0000311|    literature| 0.020000000000000004|                    1|          cancer|       tetraspanin 6|      TSPAN6|
|ENSG00000000005|EFO_0000305|rna_expression| 0.025026262158649122|                    2|breast carcinoma|         tenomodulin|        TNMD|
|ENSG00000000005|EFO_0000311|    literature|                0.004|                    1|          cancer|         tenomodulin|        TNMD|
|ENSG00000000419|EFO

In [5]:
# Preview the first five known-drug rows (all columns).
knowndrugs.show(n=5)

+-------------+---------------+-----------+-----+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|       drugId|       targetId|  diseaseId|phase|              status|                urls|           ancestors|         diseaseName|targetGeneSymbol|      targetGeneName|targetClass|            prefName|          tradeNames|            synonyms|      drugType|   mechanismOfAction|          references|          targetName|
+-------------+---------------+-----------+-----+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|    CHEMBL628|ENSG000001

In [6]:
# Preview only the known-drug columns that are exported later in the notebook.
knowndrugs.select(
    "drugId",
    "targetId",
    "diseaseId",
    "status",
    "diseaseName",
    "targetGeneSymbol",
    "targetGeneName",
    "prefName",
    "drugType",
).show(n=5)

+-------------+---------------+-----------+--------------------+--------------------+----------------+--------------------+--------------------+--------------+
|       drugId|       targetId|  diseaseId|              status|         diseaseName|targetGeneSymbol|      targetGeneName|            prefName|      drugType|
+-------------+---------------+-----------+--------------------+--------------------+----------------+--------------------+--------------------+--------------+
|    CHEMBL628|ENSG00000113231| DOID_13406|           Completed|pulmonary sarcoid...|           PDE8B|phosphodiesterase 8B|      PENTOXIFYLLINE|Small molecule|
|CHEMBL1096882|ENSG00000175482|EFO_0000095|Active, not recru...|chronic lymphocyt...|           POLD4|DNA polymerase de...|FLUDARABINE PHOSP...|Small molecule|
|CHEMBL1096882|ENSG00000177084|EFO_0000095|           Completed|chronic lymphocyt...|            POLE|DNA polymerase ep...|FLUDARABINE PHOSP...|Small molecule|
|CHEMBL1096882|ENSG00000100479|EFO_00000

In [10]:
! rm -rf ../data/opentargets_disease_associations
! rm -rf ../data/opentargets_knowndrugs

In [11]:

## Export both tables as tab-separated part files (one directory per table).
# mode("overwrite") replaces an existing output directory, so this cell can be
# re-run on its own instead of failing because the path already exists.
(associations.write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("../data/opentargets_disease_associations"))

# Only a subset of the known-drug columns is exported.
(knowndrugs
    .select(
        "drugId",
        "targetId",
        "diseaseId",
        "status",
        "diseaseName",
        "targetGeneSymbol",
        "targetGeneName",
        "prefName",
        "drugType",
    )
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("../data/opentargets_knowndrugs"))

                                                                                

In [12]:
%%sh
# Merge the part files in ../data/opentargets_disease_associations/ into one
# TSV, keeping the header line from the first part file only.
out=../data/opentargets_disease_associations.tsv

# Start with an empty flag so a same-named variable inherited from the
# environment cannot make the loop skip the header-preserving branch.
first_file_processed=

# Iterate over the glob directly instead of parsing `ls` output, so a path
# containing whitespace is not split into several words.
for file in ../data/opentargets_disease_associations/*.csv; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$file" ] || continue

    if [ -z "$first_file_processed" ]; then
        # First file: copy it whole (header included), truncating any old output.
        cat "$file" > "$out"
        first_file_processed=true
    else
        # Subsequent files: append everything except the first line (header).
        tail -n +2 "$file" >> "$out"
    fi
done

# Make the "nothing to merge" case visible rather than silently doing nothing.
if [ -z "$first_file_processed" ]; then
    echo "warning: no part files found in ../data/opentargets_disease_associations/" >&2
fi

In [13]:
%%sh
# Merge the part files in ../data/opentargets_knowndrugs/ into one TSV,
# keeping the header line from the first part file only.
out=../data/opentargets_knowndrugs.tsv

# Start with an empty flag so a same-named variable inherited from the
# environment cannot make the loop skip the header-preserving branch.
first_file_processed=

# Iterate over the glob directly instead of parsing `ls` output, so a path
# containing whitespace is not split into several words.
for file in ../data/opentargets_knowndrugs/*.csv; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$file" ] || continue

    if [ -z "$first_file_processed" ]; then
        # First file: copy it whole (header included), truncating any old output.
        cat "$file" > "$out"
        first_file_processed=true
    else
        # Subsequent files: append everything except the first line (header).
        tail -n +2 "$file" >> "$out"
    fi
done

# Make the "nothing to merge" case visible rather than silently doing nothing.
if [ -z "$first_file_processed" ]; then
    echo "warning: no part files found in ../data/opentargets_knowndrugs/" >&2
fi