-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredictor.py
executable file
·104 lines (82 loc) · 2.92 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python3
import argparse
import json
import sys
from pathlib import Path
from typing import Literal, TypedDict, List, Union
import numpy as np
import xgboost as xgb
# sequential features
class SeqInputDict(TypedDict):
    """Input record carrying the sequence-level features of one mutation.

    `predict` reads from a record only the keys named in the column list of
    the model it selects; the exact set of keys consumed therefore depends on
    the column files shipped next to the script.
    """
    # Record identifier; `predict` copies it unchanged into the output.
    id: str
    # Model selector: truthy -> structural model, falsy -> sequence-only model.
    structure: bool
    # `predict` turns this into a boolean feature (value == 'PROTO_ONCOGENE').
    protein_type: Literal['PROTO_ONCOGENE', 'TUMOR_SUPPRESSOR']
    essential: bool
    # In the sequence-only branch this is compared against the column names
    # from index 6 onwards, yielding one boolean feature per such column.
    domain: Literal['cytoplasmic', 'extracellular', 'transmembrane', 'other']
    # NOTE(review): presumably a PredictSNP score -- confirm range/units.
    predictsnp: float
    essential_residues_all: int
    conservation: int
    msa_data: float
# structural features (extend sequential features)
class InputDict(SeqInputDict):
    """Input record with structural features on top of the sequence ones.

    Inherits every key of `SeqInputDict`. `predict` reads the keys listed in
    the structural column list when `structure` is truthy.
    NOTE(review): the field meanings below are inferred from their names only;
    confirm units and ranges against the training data.
    """
    pocket: bool
    # Presumably FoldX and Rosetta stability estimates -- TODO confirm.
    foldx: float
    rosetta: float
    # Presumably count / minimum / maximum of pKa-related values -- TODO confirm.
    pka_num: int
    pka_min: float
    pka_max: float
class OutputDict(TypedDict):
    """Result record produced by `predict` for a single input record."""
    # Identifier copied from the input record.
    id: str
    # 'DELETERIOUS' when the model score (in percent) is >= 50, else 'BENIGN'.
    decision: Literal['DELETERIOUS', 'BENIGN']
    # Percentage attached to the reported decision: the score itself for
    # 'DELETERIOUS', 100 minus the score for 'BENIGN'. For model outputs in
    # [0, 1] this therefore lies in [50, 100].
    confidence: float
def predict(data: Union[SeqInputDict, InputDict], xgb_seq: xgb.XGBClassifier, xgb_struc: xgb.XGBClassifier,
            col_seq: List[str], col_struc: List[str]) -> OutputDict:
    """Score one input record and convert the model output into a verdict.

    The record's ``structure`` flag picks the model: the structural model
    (features taken in ``col_struc`` order) when truthy, otherwise the
    sequence-only model (features derived from ``col_seq``).

    Args:
        data: Input record; must contain ``id``, ``structure`` and every key
            the selected branch reads.
        xgb_seq: Classifier used for records without structure.
        xgb_struc: Classifier used for records with structure.
        col_seq: Column names for the sequence model. The first six are read
            from ``data`` directly; each remaining name becomes a boolean
            feature that is true when it equals ``data['domain']``.
        col_struc: Column names for the structural model, all read from
            ``data`` directly.

    Returns:
        An ``OutputDict`` with the record id, the decision and a confidence
        percentage that refers to the reported decision.

    Raises:
        KeyError: If ``data`` lacks a key the selected branch reads.
        IndexError: If the selected column list is too short for the
            hard-coded protein-type position.
    """
    oncogene_label = 'PROTO_ONCOGENE'

    if data['structure']:
        classifier = xgb_struc
        features = [data[column] for column in col_struc]
        # Position 6 is expected to hold the protein type; encode it as bool.
        features[6] = features[6] == oncogene_label
    else:
        classifier = xgb_seq
        direct = [data[column] for column in col_seq[:6]]
        domain_flags = [data['domain'] == column for column in col_seq[6:]]
        # Position 0 is expected to hold the protein type; encode it as bool.
        direct[0] = direct[0] == oncogene_label
        features = direct + domain_flags

    # Single-row matrix in, second probability column out, scaled to percent.
    probabilities = classifier.predict_proba(np.array([features]))
    scaled = 100 * probabilities[:, 1]
    score = float(scaled[0])

    threshold = 50
    if score >= threshold:
        return OutputDict(
            id=data['id'],
            decision='DELETERIOUS',
            confidence=score,
        )

    # Below the threshold the confidence is reported for the BENIGN call.
    return OutputDict(
        id=data['id'],
        decision='BENIGN',
        confidence=100 - score,
    )
def _read_columns(path: Path) -> List[str]:
    """Return the stripped, non-empty lines of *path* as column names."""
    with open(path) as handle:
        # A blank line would otherwise become an empty feature name, which can
        # never match a key of the input record or a domain value.
        return [name for name in (line.strip() for line in handle) if name]


def main() -> None:
    """Command-line entry point: read JSON records, predict, write JSON.

    Reads either a single record or a list of records from ``--input``
    (standard input when omitted), scores each with `predict` using the
    models and column lists stored next to this script, and writes the
    result (a single object or a list, mirroring the input) to ``--output``
    (standard output when omitted).
    """
    parser = argparse.ArgumentParser(description='PredictONCO Predictor')
    # Paths are kept as plain strings and opened only when needed. Opening the
    # output inside the argument parser would truncate it before any
    # prediction has succeeded, and neither handle would ever be closed.
    parser.add_argument('-i', '--input', default=None,
                        help='input JSON file (default: standard input)')
    parser.add_argument('-o', '--output', default=None,
                        help='output JSON file (default: standard output)')
    args = parser.parse_args()

    script_dir = Path(__file__).parent

    if args.input is None:
        content = json.load(sys.stdin)
    else:
        with open(args.input, 'r', encoding='utf-8') as source:
            content = json.load(source)

    xgb_seq = xgb.XGBClassifier()
    xgb_seq.load_model(script_dir / 'xgb_seq.json')
    col_seq = _read_columns(script_dir / 'cols_seq.txt')

    xgb_struc = xgb.XGBClassifier()
    xgb_struc.load_model(script_dir / 'xgb_struc.json')
    col_struc = _read_columns(script_dir / 'cols_struc.txt')

    if isinstance(content, list):
        # content: List[InputDict]
        result = [predict(record, xgb_seq, xgb_struc, col_seq, col_struc) for record in content]
    else:
        # content: InputDict
        result = predict(content, xgb_seq, xgb_struc, col_seq, col_struc)

    # The output file is created only now, once every prediction succeeded.
    if args.output is None:
        json.dump(result, sys.stdout)
    else:
        with open(args.output, 'w', encoding='utf-8') as sink:
            json.dump(result, sink)


if __name__ == '__main__':
    main()