In [1]:
# Print the kernel's working directory; the relative ../data paths used by
# later cells are resolved against it.
! pwd

/mnt/data/git/mikg/scripts


In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

## Set up Spark
# Build (or reuse) the session first and take the context from it. Calling
# pyspark.SparkContext() directly raises "Cannot run multiple SparkContexts
# at once" when this cell is re-run in a kernel that already has a context.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## Data paths
# Root directory of the Open Targets platform Parquet datasets.
ot_platform = "/mnt/data/git/mikg/opentargets/"


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/25 08:33:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
## Read Open Targets platform data
# Parquet files carry their own schema, so CSV-style options such as
# header=True do not apply to these reads and are not passed.

# Diseases: rename the generic id/name columns so they stay unambiguous
# alongside the target columns.
diseases = (
    spark.read.parquet(ot_platform + "diseases/")
    .withColumnRenamed("id", "diseaseId")
    .withColumnRenamed("name", "diseaseName")
)

# Targets: same idea, with target-prefixed column names.
targets = (
    spark.read.parquet(ot_platform + "targets/")
    .withColumnRenamed("id", "targetId")
    .withColumnRenamed("approvedSymbol", "targetSymbol")
    .withColumnRenamed("approvedName", "targetName")
)

evidence = spark.read.parquet(ot_platform + "evidence")

# Known drugs: the approved symbol/name columns are renamed to target gene
# symbol/name, and "label" becomes diseaseName.
knowndrugs = (
    spark.read.parquet(ot_platform + "knownDrugsAggregated")
    .withColumnRenamed("approvedSymbol", "targetGeneSymbol")
    .withColumnRenamed("approvedName", "targetGeneName")
    .withColumnRenamed("label", "diseaseName")
)

overall_associations = spark.read.parquet(ot_platform + "associationByOverallIndirect")

                                                                                

In [7]:
# Inspect the columns of the table read from associationByOverallIndirect.
overall_associations.printSchema()

root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- diseaseLabel: string (nullable = true)
 |-- targetName: string (nullable = true)
 |-- targetSymbol: string (nullable = true)
 |-- overallDatasourceHarmonicScore: double (nullable = true)
 |-- overallDatatypeHarmonicScore: double (nullable = true)
 |-- overallDatasourceHarmonicVector: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- datasourceId: string (nullable = true)
 |    |    |-- datasourceHarmonicScore: double (nullable = true)
 |    |    |-- datasourceEvidenceCount: long (nullable = true)
 |    |    |-- weight: double (nullable = true)
 |-- overallDatatypeHarmonicVector: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- datatypeId: string (nullable = true)
 |    |    |-- datatypeHarmonicScore: double (nullable = true)
 |    |    |-- datatypeEvidenceCount: long (nullable = true)
 |    |    |-- weight: double (nullable = true)

In [5]:
# Load the per-datatype direct target-disease associations and list their columns.
associations = spark.read.parquet(
    ot_platform + "associationByDatatypeDirect/"
)
associations.printSchema()

root
 |-- targetId: string (nullable = true)
 |-- diseaseId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- datatypeHarmonicScore: double (nullable = true)
 |-- datatypeEvidenceCount: long (nullable = true)
 |-- diseaseLabel: string (nullable = true)
 |-- targetName: string (nullable = true)
 |-- targetSymbol: string (nullable = true)



In [None]:
# Preview the first five association rows.
associations.show(n=5)

In [None]:
# Preview the first five known-drug rows (all columns).
knowndrugs.show(n=5)

In [None]:
# Preview only the known-drug columns that are exported further down.
knowndrugs.select(
    "drugId",
    "targetId",
    "diseaseId",
    "status",
    "diseaseName",
    "targetGeneSymbol",
    "targetGeneName",
    "prefName",
    "drugType",
).show(n=5)

In [None]:
! rm -rf ../data/opentargets_disease_associations
! rm -rf ../data/opentargets_knowndrugs

In [None]:

# Export both tables as tab-separated part files with a header row.
# mode("overwrite") keeps this cell re-runnable: without it Spark refuses to
# write into an output directory that already exists.
(
    associations.write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("../data/opentargets_disease_associations")
)

# Only the identifying and descriptive drug columns are exported.
(
    knowndrugs.select(
        col("drugId"),
        col("targetId"),
        col("diseaseId"),
        col("status"),
        col("diseaseName"),
        col("targetGeneSymbol"),
        col("targetGeneName"),
        col("prefName"),
        col("drugType"),
    )
    .write
    .mode("overwrite")
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("../data/opentargets_knowndrugs")
)

In [None]:
%%sh
# Merge the Spark part files of the disease-association export into a single
# TSV, keeping the header row from the first part only.
src_dir=../data/opentargets_disease_associations
out=../data/opentargets_disease_associations.tsv

# Explicit initial state, so a same-named variable inherited from the
# environment cannot suppress the header.
header_written=false

# Iterate over the glob directly instead of parsing `ls` output, which would
# word-split any path containing whitespace.
for part in "$src_dir"/*.csv; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$part" ] || continue

    if [ "$header_written" = false ]; then
        # First part: copy it whole, header included (truncates old output).
        cat "$part" > "$out"
        header_written=true
    else
        # Later parts: drop the first line (the repeated header).
        tail -n +2 "$part" >> "$out"
    fi
done

# Fail loudly rather than silently leaving a stale or missing output file.
if [ "$header_written" = false ]; then
    echo "No .csv part files found in $src_dir" >&2
    exit 1
fi

In [None]:
%%sh
# Merge the Spark part files of the known-drugs export into a single TSV,
# keeping the header row from the first part only.
src_dir=../data/opentargets_knowndrugs
out=../data/opentargets_knowndrugs.tsv

# Explicit initial state, so a same-named variable inherited from the
# environment cannot suppress the header.
header_written=false

# Iterate over the glob directly instead of parsing `ls` output, which would
# word-split any path containing whitespace.
for part in "$src_dir"/*.csv; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$part" ] || continue

    if [ "$header_written" = false ]; then
        # First part: copy it whole, header included (truncates old output).
        cat "$part" > "$out"
        header_written=true
    else
        # Later parts: drop the first line (the repeated header).
        tail -n +2 "$part" >> "$out"
    fi
done

# Fail loudly rather than silently leaving a stale or missing output file.
if [ "$header_written" = false ]; then
    echo "No .csv part files found in $src_dir" >&2
    exit 1
fi