## Colchain

Using the DrugBank logs, we do the following:

1. Preprocess the format of the DrugBank logs

2. parse the logs

3. Calculate C and I (complex queries)

4. Find the queries with multiple prefixes (federation)

5. Find the queries having a predicate of `{ type: 'path' }` with pathType `'+'` or `'*'` (star and property path)

Query selection
https://docs.google.com/document/d/1I8XCZ-wnG1L89lGQxxbLUShUHPIBe9gOHu58aoWy1rc/edit

Complexity (# of triple patterns)

Path joins (e.g., the subject of one triple pattern is the object of another) and star joins

Linkedness (use of multiple datasets, # of different namespaces, ignoring rdf: and rdfs:)

Use of Expensive SPARQL operations (Aggregate functions, Group By (Not an issue), ORDER BY (Easy), FILTER (Low), FILTER with regex or substring (High), OPTIONAL (High))


In [None]:
sparqlQuery = '''select * where{
  ?x <example:www/mbox> <mailto:alice@example> .
  ?x <foaf:knows>+/<foaf:name> ?name .
 }'''


In [None]:
{
  queryType: 'SELECT',
  variables: '*',
  where: [
    {
      type: 'bgp',
      triples: [
        {
          subject: Variable { termType: 'Variable', value: 'x' },
          predicate: NamedNode {
            termType: 'NamedNode',
            value: 'example:www/mbox'
          },
          object: NamedNode {
            termType: 'NamedNode',
            value: 'mailto:alice@example'
          }
        },
        {
          subject: Variable { termType: 'Variable', value: 'x' },
          predicate: {
            type: 'path',
            pathType: '/',
            items: [
              {
                type: 'path',
                pathType: '+',
                items: [
                  NamedNode {
                    termType: 'NamedNode',
                    value: 'foaf:knows'
                  }
                ]
              },
              NamedNode { termType: 'NamedNode', value: 'foaf:name' }
            ]
          },
          object: Variable { termType: 'Variable', value: 'name' }
        }
      ]
    }
  ],
  type: 'query',
  prefixes: {}
}

# Parse queries

In [None]:
const fs = require('fs');
const SparqlParser = require('sparqljs').Parser;
const csvParser = require('csv-parser');
const { unparse } = require('papaparse');
const { Worker, isMainThread, parentPort } = require('worker_threads');
const pMap = require('p-map');

// Single SPARQL parser instance, created once at module load; executeQuery
// calls parser.parse() on it for every query.
const parser = new SparqlParser();

/**
 * Parses a single SPARQL query and returns its parse tree serialised as JSON.
 *
 * Despite the name, nothing is sent to a SPARQL endpoint: the query is only
 * handed to the shared `parser` and the resulting object is stringified.
 *
 * @param {string} query - SPARQL query text.
 * @returns {Promise<string>} JSON text of the parsed query, or the fixed
 *   string 'Error parsing the query.' when parsing or serialisation throws.
 */
async function executeQuery(query) {
  try {
    return JSON.stringify(parser.parse(query));
  } catch (error) {
    // Log the reason together with the offending query; without it the log
    // cannot tell a syntax error from an unsupported construct.
    const reason = error instanceof Error ? error.message : error;
    console.error('Error parsing query:', query, '-', reason);
    return 'Error parsing the query.';
  }
}

/**
 * Reads every row of a CSV file into memory.
 *
 * @param {string} csvPath - Path of the CSV file to read.
 * @returns {Promise<object[]>} Resolves with the rows emitted by csv-parser,
 *   in file order; rejects if the file stream or the CSV parser emits 'error'.
 */
function readCsvRows(csvPath) {
  return new Promise((resolve, reject) => {
    const rows = [];
    fs.createReadStream(csvPath)
      // pipe() does not forward source errors, so listen on both streams.
      .on('error', reject)
      .pipe(csvParser())
      .on('error', reject)
      .on('data', (row) => rows.push(row))
      .on('end', () => resolve(rows));
  });
}

/**
 * Parses every SPARQL query listed in the input CSV and writes the original
 * query text next to its parsed JSON form to the output CSV.
 *
 * The input is read once and fully before any query is processed, so the
 * total used for the progress percentage is known up front.
 *
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
async function main() {
  const inputCsvFile = 'prefix_unique_bio2rdf_sparql_logs.csv';
  const outputCsvFile = 'translated_prefix_unique_bio2rdf_sparql_logs.csv';

  // Step 1: load all rows; the row count doubles as the total query count.
  const rows = await readCsvRows(inputCsvFile);
  const totalQueries = rows.length;
  console.log(`Total SPARQL queries to process: ${totalQueries}`);

  let completedQueries = 0;

  const workerFunction = async (row) => {
    const sparqlQuery = row['query']; // Adjust the column name as per your input CSV format
    const parsedQuery = await executeQuery(sparqlQuery);

    // Only invoked when there is at least one row, so totalQueries is never 0 here.
    completedQueries += 1;
    const completionPercentage = (completedQueries / totalQueries) * 100;
    console.log(`Progress: ${completionPercentage.toFixed(2)}%`);

    return { Query: sparqlQuery, Parsed_Query: parsedQuery };
  };

  // Step 2: map over the rows. pMap caps how many workerFunction calls are in
  // flight at once and resolves with the results in input order.
  const workerCount = 4;
  const csvData = await pMap(rows, workerFunction, { concurrency: workerCount });

  // Step 3: write the results to the output CSV file.
  const csv = unparse(csvData, { header: true, delimiter: ',' });
  fs.writeFileSync(outputCsvFile, csv);

  console.log('All SPARQL queries executed and results written to output CSV file.');
}

if (isMainThread) {
  // Report failures (e.g. unreadable input, write errors) and set a non-zero
  // exit code instead of leaving an unhandled promise rejection.
  main().catch((error) => {
    console.error('Failed to process SPARQL queries:', error);
    process.exitCode = 1;
  });
} else {
  // Worker-thread entry point. Nothing in this file spawns a Worker, so this
  // branch only runs if another script loads this file as one.
  // NOTE(review): assumes the message is a CSV row object with a `query`
  // field, matching what main() passes to its per-row callback — confirm.
  parentPort.once('message', async (row) => {
    const sparqlQuery = row['query'];
    // Await before posting: a pending Promise cannot be structured-cloned.
    const parsedQuery = await executeQuery(sparqlQuery);
    parentPort.postMessage({ Query: sparqlQuery, Parsed_Query: parsedQuery });
  });
}
