Skip to content

Commit

Permalink
refactored InchiBag classes added more tests and interfaces and javadoc
Browse files Browse the repository at this point in the history
  • Loading branch information
dkatzel-ncats committed Feb 2, 2022
1 parent abb26e9 commit be5e503
Show file tree
Hide file tree
Showing 13 changed files with 771 additions and 372 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Molwitch API Changelog
## 0.6.3
1. refactored InchiKeyBag into an interface and made Default and Binary implementations

## 0.6.2
1. added new CtTable Clean Rule: `M SAL` lines with atom positions < 1 are removed from the atom list
1. MolSearcherFactory now returns an `Optional<MolSearcher>` to support not finding a searcher. Previously it would throw `NoSuchElementException`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ <h2 title="Enum BinaryInchiKey.BinaryInchiKeyBag.ResultType" class="title">Enum
<li><a href="https://docs.oracle.com/javase/8/docs/api/java/lang/Enum.html?is-external=true" title="class or interface in java.lang">java.lang.Enum</a>&lt;<a href="../../../../../gov/nih/ncats/molwitch/inchi/BinaryInchiKey.BinaryInchiKeyBag.ResultType.html" title="enum in gov.nih.ncats.molwitch.inchi">BinaryInchiKey.BinaryInchiKeyBag.ResultType</a>&gt;</li>
<li>
<ul class="inheritance">
<li>gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.ResultType</li>
<li>gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.InchiKeySearchResultType</li>
</ul>
</li>
</ul>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<head>
<!-- Generated by javadoc (1.8.0_292) on Tue May 25 11:14:17 EDT 2021 -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Uses of Class gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.ResultType (ncats-molwitch 0.6.1-SNAPSHOT API)</title>
<title>Uses of Class gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.InchiKeySearchResultType (ncats-molwitch 0.6.1-SNAPSHOT API)</title>
<meta name="date" content="2021-05-25">
<link rel="stylesheet" type="text/css" href="../../../../../../stylesheet.css" title="Style">
<script type="text/javascript" src="../../../../../../script.js"></script>
Expand Down Expand Up @@ -71,7 +71,7 @@
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
<div class="header">
<h2 title="Uses of Class gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.ResultType" class="title">Uses of Class<br>gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.ResultType</h2>
<h2 title="Uses of Class gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.InchiKeySearchResultType" class="title">Uses of Class<br>gov.nih.ncats.molwitch.inchi.BinaryInchiKey.BinaryInchiKeyBag.InchiKeySearchResultType</h2>
</div>
<div class="classUseContainer">
<ul class="blockList">
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

<groupId>gov.nih.ncats</groupId>
<artifactId>molwitch</artifactId>
<version>0.6.2</version>
<version>0.6.3-SNAPSHOT</version>

<name>ncats-molwitch</name>

Expand Down
201 changes: 1 addition & 200 deletions src/main/java/gov/nih/ncats/molwitch/inchi/BinaryInchiKey.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,214 +24,15 @@
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ShortBuffer;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Objects;
import java.util.SortedSet;
import java.util.*;

/**
 * A binary encoded InchiKey.
 * <p>
 * A key is packed into a 16 byte big-endian value that is wrapped in a {@link BigInteger}:
 * </p>
 * <ul>
 *   <li>characters 0-13 (four triplets and one doublet) fill the first five shorts (80 bits);</li>
 *   <li>characters 15-22, the "stereo part" (two triplets and one doublet), fill the last
 *       three shorts (48 bits);</li>
 *   <li>the final character of the key, shifted left by two, is OR-ed into the
 *       second-to-last byte.</li>
 * </ul>
 * Clearing the low 48 bits of an encoded key therefore leaves only the part that is
 * compared by the stereo insensitive searches.
 */
public class BinaryInchiKey {

    /** {@code 2^48}: one unit in the lowest bit above the 48 bit stereo part. */
    private static final BigInteger SHIFT = BigInteger.ONE.shiftLeft(48);
    /** Keeps the upper 80 bits of an encoded key and clears the 48 bit stereo part. */
    private static final BigInteger MASK = new BigInteger("FFFFFFFFFFFFFFFFFFFF000000000000", 16);

    /** Pre-computed doublet values indexed by {@code [first - 'A'][second - 'A']}. */
    private static final short[][] LOOKUP_2 = new short[26][26];
    /** Pre-computed triplet values indexed by {@code [first - 'A'][second - 'A'][third - 'A']}. */
    private static final short[][][] LOOKUP_3 = new short[26][26][26];

    static {
        for (int i = 0; i < 26; i++) {
            for (int j = 0; j < 26; j++) {
                LOOKUP_2[i][j] = (short) ((26 * i) + j);
                for (int k = 0; k < 26; k++) {
                    LOOKUP_3[i][j][k] = (short) pack3(i, j, k);
                }
            }
        }
    }

    /**
     * A searchable, in-memory collection of binary encoded InchiKeys.
     */
    public static class BinaryInchiKeyBag {

        /** Encoded keys in ascending numeric order; every lookup is a binary search over it. */
        private final BigInteger[] array;

        /**
         * Builds a bag from a file that holds one InchiKey per line.
         * The file is read twice: once to count the records so the backing array can be
         * sized exactly, and once to encode them.
         *
         * @param sortedInchikeyFile the file to read; input that is already in order is the
         *                           cheapest to load.
         * @return a new bag containing every key in the file.
         * @throws IOException if the file can not be read.
         */
        public static BinaryInchiKeyBag fromSortedFile(File sortedInchikeyFile) throws IOException {
            InputStreamSupplier supplier = InputStreamSupplier.forFile(sortedInchikeyFile);
            // first pass: only count the records
            int count = 0;
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(supplier.get(), java.nio.charset.StandardCharsets.UTF_8))) {
                while (in.readLine() != null) {
                    count++;
                }
            }
            // second pass: encode each record into its slot
            BigInteger[] encoded = new BigInteger[count];
            int next = 0;
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(supplier.get(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    encoded[next++] = encode(line);
                }
            }
            return new BinaryInchiKeyBag(encoded);
        }

        /**
         * Builds a bag from a set of InchiKey Strings.
         *
         * @param inchiKeys the keys to store; can not be null.
         * @return a new bag containing every key in the set.
         */
        public static BinaryInchiKeyBag fromSortedSet(SortedSet<String> inchiKeys) {
            BigInteger[] encoded = new BigInteger[inchiKeys.size()];
            int next = 0;
            for (String inchiKey : inchiKeys) {
                encoded[next++] = encode(inchiKey);
            }
            return new BinaryInchiKeyBag(encoded);
        }

        private BinaryInchiKeyBag(BigInteger[] array) {
            // Arrays.binarySearch needs ascending *numeric* order.  The order of the input
            // Strings does not guarantee that: the last character of a key is encoded into
            // more significant bits than the final stereo doublet, and a SortedSet may use
            // any comparator.  Sorting input that is already in order is cheap.
            Arrays.sort(array);
            this.array = array;
        }

        /**
         * Looks the given key up, trying an exact match first and a stereo insensitive
         * match second.
         *
         * @param inchiKey the key to look for.
         * @return a result holding the given key and the kind of match that was found.
         */
        public InchiKeySearchResult contains(String inchiKey) {
            BigInteger exact = encode(inchiKey);
            if (Arrays.binarySearch(array, exact) >= 0) {
                return new InchiKeySearchResult(inchiKey, ResultType.EXACT);
            }
            BigInteger lowInsensitive = exact.and(MASK);
            BigInteger highInsensitive = lowInsensitive.add(SHIFT);
            if (containsInsensitive(lowInsensitive, highInsensitive)) {
                return new InchiKeySearchResult(inchiKey, ResultType.STEREO_INSENSITIVE);
            }
            return new InchiKeySearchResult(inchiKey, ResultType.NO_MATCH);
        }

        /**
         * Checks for an exact match of the full encoded key.
         *
         * @param inchiKey the key to look for.
         * @return {@code true} if the encoded key is in this bag.
         */
        public boolean containsStereo(String inchiKey) {
            return Arrays.binarySearch(array, encode(inchiKey)) >= 0;
        }

        /**
         * Checks for a stereo insensitive match: only characters 0-13 of the key are used.
         *
         * @param inchiKey the key to look for.
         * @return {@code true} if any key in this bag starts with the same 14 characters.
         */
        public boolean containsInsensitive(String inchiKey) {
            BigInteger[] pair = encodeInsensitive(inchiKey);
            return containsInsensitive(pair[0], pair[1]);
        }

        /**
         * Checks if any stored value lies in the half open range {@code [low, high)}.
         */
        private boolean containsInsensitive(BigInteger low, BigInteger high) {
            // Compare insertion points, not raw binarySearch results: a miss is reported as
            // -(insertionPoint) - 1 while a hit is reported as the index itself, so their
            // absolute values are off by one from each other and can not be compared directly.
            return insertionPoint(low) < insertionPoint(high);
        }

        /**
         * Gets the index of the given value, or the index it would be inserted at if absent.
         */
        private int insertionPoint(BigInteger value) {
            int result = Arrays.binarySearch(array, value);
            return result >= 0 ? result : -(result + 1);
        }

        /**
         * The outcome of a {@link BinaryInchiKeyBag#contains(String)} search.
         */
        public static class InchiKeySearchResult {
            private final String inchiKey;
            private final ResultType resultType;

            /**
             * @return the key that was searched for.
             */
            public String getInchiKey() {
                return inchiKey;
            }

            /**
             * @return the kind of match that was found.
             */
            public ResultType getResultType() {
                return resultType;
            }

            /**
             * @param inchiKey the key that was searched for.
             * @param resultType the kind of match that was found.
             */
            public InchiKeySearchResult(String inchiKey, ResultType resultType) {
                this.inchiKey = inchiKey;
                this.resultType = resultType;
            }

            @Override
            public boolean equals(Object o) {
                if (this == o) {
                    return true;
                }
                if (!(o instanceof InchiKeySearchResult)) {
                    return false;
                }
                InchiKeySearchResult that = (InchiKeySearchResult) o;
                // null safe, to agree with hashCode() which also tolerates null
                return Objects.equals(inchiKey, that.inchiKey)
                        && resultType == that.resultType;
            }

            @Override
            public int hashCode() {
                return Objects.hash(inchiKey, resultType);
            }

            @Override
            public String toString() {
                return "InchiKeySearchResult{inchiKey=" + inchiKey + ", resultType=" + resultType + "}";
            }
        }

        /**
         * The kinds of match a search can report.
         */
        public enum ResultType {
            /** The full encoded key is in the bag. */
            EXACT,
            /** Only a key with the same first 14 characters is in the bag. */
            STEREO_INSENSITIVE,
            /** Neither an exact nor a stereo insensitive match was found. */
            NO_MATCH
        }
    }

    /**
     * Encodes only characters 0-13 of the key, leaving the low 48 bits zero.
     *
     * @return a two element array: the encoded value and that value plus {@code 2^48},
     *         which together bound every full key sharing those 14 characters.
     */
    private static BigInteger[] encodeInsensitive(String inchiKey) {
        ByteBuffer byteBuffer = ByteBuffer.allocate(16);
        ShortBuffer buf = byteBuffer.asShortBuffer();
        char[] chars = inchiKey.toCharArray();
        buf.put((short) encode3(chars, 0));
        buf.put((short) encode3(chars, 3));
        buf.put((short) encode3(chars, 6));
        buf.put((short) encode3(chars, 9));
        buf.put((short) encode2(chars, 12));

        BigInteger low = new BigInteger(byteBuffer.array());
        return new BigInteger[]{low, low.add(SHIFT)};
    }

    /**
     * Encodes the full key: characters 0-13, the stereo part (characters 15-22) and the
     * last character of the key.
     */
    private static BigInteger encode(String inchiKey) {
        ByteBuffer byteBuffer = ByteBuffer.allocate(16);
        ShortBuffer buf = byteBuffer.asShortBuffer();
        char[] chars = inchiKey.toCharArray();

        buf.put(encode3b(chars, 0));
        buf.put(encode3b(chars, 3));
        buf.put(encode3b(chars, 6));
        buf.put(encode3b(chars, 9));
        buf.put(encode2b(chars, 12));

        //stereo part
        buf.put(encode3b(chars, 15));
        buf.put(encode3b(chars, 18));
        buf.put(encode2b(chars, 21));

        // The rest of the key is usually "SA-N": S for the standard flag, A for version 1
        // and N for no protonation.  We assume every inchi we see is "SA" and only keep the
        // last character.
        // do we care about protonation?
        byte[] array = byteBuffer.array();
        array[array.length - 2] |= ((byte) chars[chars.length - 1]) << 2;
        return new BigInteger(array);
    }

    /** Table lookup of the triplet that starts at {@code offset}; letters must be 'A'-'Z'. */
    private static short encode3b(char[] chars, int offset) {
        return LOOKUP_3[chars[offset] - 'A'][chars[offset + 1] - 'A'][chars[offset + 2] - 'A'];
    }

    /** Computes the value of the triplet that starts at {@code offset} without the table. */
    private static int encode3(char[] chars, int offset) {
        return pack3(chars[offset] - 'A', chars[offset + 1] - 'A', chars[offset + 2] - 'A');
    }

    /**
     * Turns three zero based letter indexes into a single number.
     */
    private static int pack3(int first, int second, int third) {
        int value = (676 * first) + (26 * second) + third;
        // NOTE(review): the two adjustments presumably close the gaps left by triplets that
        // never occur in a real InchiKey - confirm against the InChI technical manual.
        if (value >= 12844) {
            value -= 516;
        }
        if (value > 2704) {
            value -= 676;
        }
        return value;
    }

    /** Table lookup of the doublet that starts at {@code offset}; letters must be 'A'-'Z'. */
    private static short encode2b(char[] chars, int offset) {
        return LOOKUP_2[chars[offset] - 'A'][chars[offset + 1] - 'A'];
    }

    /** Computes the value of the doublet that starts at {@code offset} without the table. */
    private static int encode2(char[] chars, int offset) {
        return 26 * (chars[offset] - 'A') + (chars[offset + 1] - 'A');
    }
}
Loading

0 comments on commit be5e503

Please sign in to comment.