-
Notifications
You must be signed in to change notification settings - Fork 90
/
schema-1.0.0.json
150 lines (133 loc) · 3.59 KB
/
schema-1.0.0.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
{
// 'meta-schema' describing this JSON as a schema
//
"$schema" : "http://json-schema.org/schema#",
"type" : "object",
"required" :
[
"schema",
"kmerSize",
"alphabet",
"preserveCase",
"canonical",
"sketchSize",
"hashType",
"hashBits",
"hashSeed"
],
"properties" :
{
// URI of the appropriate version of this schema
//
"schema" : {"type" : "string"},
// the number of characters in each overlapping 'shingle'
//
"kmerSize" : {"type" : "number"},
// all letters a k-mer can have (others will be skipped) (see also
// 'preserveCase')
//
"alphabet" : {"type" : "string"},
// if true, letters in a k-mer must match the case of letters in
// 'alphabet'
//
"preserveCase" : {"type" : "boolean"},
// if true, use alphabetical minima of k-mers and their reverse
// complements (only makes sense for nucleotide alphabet)
//
"canonical" : {"type" : "boolean"},
// the (maximum) number of min-hashes each sketch can have (there can be
// fewer if a sequence does not have enough valid k-mers)
//
"sketchSize" : {"type" : "number"},
// the hashing function used to hash each k-mer
//
"hashType" : {"type" : "string"},
// the number of (least significant) bits taken from the result of the
// hash function
//
"hashBits" : {"type" : "number"},
// the seed of the hash function, if applicable
//
"hashSeed" : {"type" : ["number", "null"]},
// a collection of sketches that share the above sketching parameters
//
"sketches" :
{
"type" : "array",
"items" :
{
"type" : "object",
"required" :
[
"name",
"hashes"
],
"properties" :
{
// a sequence identifier, expected to be unique in the
// collection
//
"name" : {"type" : "string"},
// the length of the source sequence that was sketched,
// which could be a sum of lengths for concatenated sequences or an
// estimate from k-mer content of reads
//
"seqLength" : {"type" : "number"},
// an additional description of the source sequence not
// captured by 'name' (e.g. after whitespace in a fasta tag)
//
"comment" : {"type" : "string"},
// the number of k-mers from the source sequence that
// conformed to the letters in 'alphabet' and were thus
// considered for min-hashing (including repeated k-mers)
//
"numValidKmers" : {"type" : "number"},
// optional k-mer filters used when choosing which k-mers are
// considered for min-hashing
//
"filters" :
{
"type" : "object",
"properties" :
{
// the minimum number of times a k-mer must appear
// in the source sequence to be considered for min-
// hashing (for filtering out erroneous k-mers in
// read sets)
//
"minCopies" : {"type" : "number"}
}
},
// the min-hashes of this sketch, represented as unsigned
// integers but quoted as strings to avoid overflow
//
"hashes" :
{
"type" : "array",
"uniqueItems" : true,
"items" :
{
"type" : "string",
"pattern" : "^[0-9]+$"
}
},
// the k-mers that correspond to the hashes in 'hashes' (in
// the same order), used mainly for confirming the hash
// function and not necessarily valid for Jaccard estimates
// due to potential hash collisions
//
"kmers" :
{
"type" : "array",
"items" : {"type" : "string"}
},
// the number of times each hash in 'hashes' was derived from
// k-mers in the source sequence (in the same order)
//
"counts" :
{
"type" : "array",
"items" : {"type" : "number"}
}
}
}
}
}
}