-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
205 lines (151 loc) · 10.7 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import loinc as lc
# ... do not import loinc's classes from here
import os
##### Configuration settings ########
## This program assumes that raw laboratory data were collected via
## transformations.withMedivoTestResultType()
## See ai.prognos.samantha.clinical.transformations
##
## It also assumes a disease cohort of interest, with the set of target LOINC
## codes determined through that cohort (e.g. Hepatitis-C). Focusing on a
## disease-specific LOINC set ensures that the LOINC codes we try to
## predict/correct are clinically related.
##
## Data set configuration
domain = 'hepatitis-c'
cohort = domain  # alias: cohort and domain are used interchangeably downstream
data_source = "andromeda-pond"
project_path = os.getcwd()
analysis_path = os.path.join(project_path, 'analysis')
loinc_dir = os.path.join(project_path, 'LoincTable')
## Directory where intermediate output files are stored:
## Example: 'C:/Users/me/Documents/MyFiles/'
data_dir = os.path.join(project_path, 'data') # 'YOUR_DIRECTORY_HERE'
out_dir = data_dir
# ... alternative LoincTable
plot_dir = os.path.join(project_path, 'plot')
## Filepath where the raw source data file is located:
## Example: 'C:/Users/me/Documents/MyFiles/Data.txt'
input_file = os.path.join(data_dir, f'{data_source}-{cohort}.csv') # 'YOUR_FILE_LOCATION_HERE'
in_file = input_file  # e.g. andromeda-pond-hepatitis-c.csv
processed_input_file = os.path.join(data_dir, f'{data_source}-{cohort}-processed.csv')
processed_file = processed_input_file  # e.g. andromeda-pond-hepatitis-c-processed.csv
balanced_input_file = os.path.join(data_dir, f'{data_source}-{cohort}-balanced.csv')
balanced_file = balanced_input_file  # e.g. andromeda-pond-hepatitis-c-balanced.csv
backup_file = os.path.join(data_dir, f'{data_source}-{cohort}-bk.csv')
## ... other files: andromeda-pond-hepatitis-c.csv, andromeda-pond-hepatitis-c-balanced.csv
## ... note that "andromeda-pond-hepatitis-c-balanced.csv" may contain ill-formed meta_sender_name
## If your data file is delimited by character(s) other than a comma, indicate the delimiter:
## Example: delim = '|'
delim = ','
## Throughout this data transformation pipeline, intermediate files can be written to
## disk, both for manual inspection and for loading in subsequent model steps so they
## do not have to be recreated.
##
## Source-data text cleaning can write the following files:
##   1. Cleaned_Lab_Names.csv
##   2. By_Site_Lab_Word_Count.csv   (... this corresponds to meta_sender_name)
##   3. Discarded_Lab_Names.csv
##   4. Cleaned_Specimen_Names.csv
##   5. By_Site_Specimen_Word_Count.csv
## Currently enabled; set write_file_source_data_cleaning = False to skip writing them.
write_file_source_data_cleaning = True
## The LOINC table data parsing step can write the following files:
##   1. LOINC_Name_map.csv
##   2. LOINC_Parsed_Component_System_Longword.csv
## Currently enabled; set write_file_loinc_parsed = False to skip writing them.
write_file_loinc_parsed = True
## The UMLS CUI search can write the following files:
##   1. UMLS_Mapped_Specimen_Names.csv
##   2. UMLS_Mapped_Test_Names.csv
## Currently enabled; set write_file_umls_cuis = False to skip writing them.
write_file_umls_cuis = True
## Full filepath to the local loinc.csv installation (the standard LOINC release table):
## Example: 'C:/Users/me/Documents/MyFiles/loinc.csv'
loinc_table_path = os.path.join(loinc_dir, 'Loinc.csv') # LoincTable.input_path
loinc_file_path = loinc_table_path  # alias
# ... i.e. os.path.join('LoincTable', 'Loinc.csv')
loinc_to_mtrt_file_path = os.path.join(data_dir, 'loinc-leela.csv')
## Full filepath to the local R library location (where the stringdist package is installed)
## Example: 'C:/Program Files/R/R-3.4.1/library'
lib_loc = "/Library/Frameworks/R.framework/Versions/Current/Resources/library"
# "/Library/Frameworks/R.framework/Resources/library" # 'YOUR_R_LIBRARY_LOCATION'
###########################################################################################
## The program assumes that the raw source data file has a header with the features
## defined in loinc.FeatureSet.
## Column in the data source that contains the TEST NAME (i.e. Creatinine):
test_order_col = 'test_order_name'
test_col = test_order_col  # alias
test_result_col = 'test_result_name'
# ... alternative
# test_result_name
test_value_col = 'test_result_value'
test_comment_col = "test_result_comments"
## Column in the data source that contains the SPECIMEN TYPE (i.e. urine):
spec_col = 'test_specimen_type' # 'YOUR_SPECIMEN_COL_NAME'
# ... related
# ... test_specimen_source
## Column in the data source that contains the UNITS:
units = 'test_result_units_of_measure' # 'YOUR_UNITS_COL_NAME'
## Column in the data source that contains the LOINC CODE:
loinc_col = 'test_result_loinc_code' # 'YOUR_LOINC_COL_NAME'
tagged_col = 'medivo_test_result_type'
## Force these text columns to be read as strings when loading the data:
dtypes = {test_order_col: str, test_result_col: str, spec_col: str}
###########################################################################################
## Optional numeric summary-statistic columns in the data source. Replace each
## placeholder below with the matching column name from your dataset.
min_col = 'YOUR_MINIMUM_COL_NAME'         # numeric MINIMUM
max_col = 'YOUR_MAXIMUM_COL_NAME'         # numeric MAXIMUM
mean_col = 'YOUR_MEAN_COL_NAME'           # numeric MEAN
perc_5 = 'YOUR_5TH_PERCENTILE_COL_NAME'   # numeric 5th PERCENTILE
perc_25 = 'YOUR_25TH_PERCENTILE_COL_NAME' # numeric 25th PERCENTILE
median_col = 'YOUR_MEDIAN_COL_NAME'       # numeric MEDIAN
perc_75 = 'YOUR_75TH_PERCENTILE_COL_NAME' # numeric 75th PERCENTILE
perc_95 = 'YOUR_95TH_PERCENTILE_COL_NAME' # numeric 95th PERCENTILE
###########################################################################################
## Column in the data source that contains the COUNT:
## NOTE: this name shadows the builtin count-free functions only if imported with *;
## kept as-is for backward compatibility with existing callers.
count = 'YOUR_COUNT_COL_NAME'
## Column in the data source that contains the SITE IDENTIFIER:
site = 'meta_sender_name' # 'YOUR_SITE_IDENTIFIER_COL_NAME'
# ... related
#. meta_sender_source, meta_sender_type
## Strings (other than a NULL field) that denote missing data in the source.
## Example: missing = ["*MISSING", 'UNKNOWN', '-1']
missing = ['unknown'] # ["ENTER", "YOUR", "MISSING", "VALUES"]
## Numeric rejection threshold (example: 4.0) for eliminating high-frequency tokens
## from source data test names; None keeps every token during pre-processing.
rejection_threshold = None
## Status updates on code execution are printed by default; set print_status = 'N'
## to silence them.
print_status = 'Y'
## This program uses the UMLS API to generate features by obtaining CUIs for test names
## and specimen types; accessing the UMLS requires an API key. Create or log in to a
## UMLS account at https://uts.nlm.nih.gov/home.html, click 'My Profile', and paste the
## API key listed beneath the user into the field below:
api_key = "YOUR_UMLS_API_KEY"
## Maximum number of CUIs retained per UMLS search (up to this many CUIs are returned
## for each test name and each specimen type):
num_cuis = 3
## Minimum number of sites at which a LOINC key must be used for it to be retained in
## the labeled dataset (with the default 1, LOINC keys occurring at only one site are
## filtered out and combined with the unlabeled data for reclassification):
min_sites_per_loinc_key = 1
## Minimum cumulative number of test instances per LOINC group retained in the labeled
## training data:
min_tests_per_loinc_group = 9
## Minimum number of data instances allowed per LOINC key group in the labeled
## training data:
min_row_count_per_loinc_group = 2
## By default the program fits Random Forest and One-Versus-Rest models during
## cross-validation, obtains predicted labels from each model, and reports CV
## performance metrics; set run_cv = 'N' to skip cross-validation.
run_cv = 'Y'
## Number of cross-validation folds:
n_splits = 5
## Number of hyperopt trials for hyperparameter tuning:
tuning_evals = 200
## Hyperopt search spaces for random forest tuning. Each setting below may be
## customized as [MINIMUM, MAXIMUM, INCREMENT]; None keeps the built-in default space.
## Default: np.arange(2, (X0.shape[1] - 3), 2) where X0 is the training dataset.
## Example: max_features = [2, 24, 2]
max_features = None
## Default: np.arange(5, 35, 5). Example: max_depth = [5, 50, 5]
max_depth = None
## Default: np.arange(2, 20, 2). Example: min_samples_split = [2, 16, 1]
min_samples_split = None
## Default: np.arange(10, 250, 25). Example: n_estimators = [10, 250, 25]
n_estimators = None