/
cogat.py
executable file
·204 lines (173 loc) · 7.01 KB
/
cogat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
Automated annotation of Cognitive Atlas labels.
"""
import logging
import re
import numpy as np
import pandas as pd
from . import utils
from .. import references
from ..due import due
from ..extract import download_cognitive_atlas
from ..utils import uk_to_us
LGR = logging.getLogger(__name__)
@due.dcite(references.COGNITIVE_ATLAS, description="Introduces the Cognitive Atlas.")
class CogAtLemmatizer(object):
"""
Replace synonyms and abbreviations with Cognitive Atlas identifiers in text.
Parameters
----------
ontology_df : :obj:`pandas.DataFrame`, optional
DataFrame with three columns (id, name, alias) and one row for each
alias (e.g., synonym or abbreviation) for each term in the Cognitive
Atlas. If None, loads ontology file from resources folder.
Attributes
----------
ontology_ : :obj:`pandas.DataFrame`
Ontology in DataFrame form.
regex_ : :obj:`dict`
Dictionary linking aliases in ontology to regular expressions for
lemmatization.
References
----------
* Poldrack, Russell A., et al. "The cognitive atlas: toward a
knowledge foundation for cognitive neuroscience." Frontiers in
neuroinformatics 5 (2011): 17. https://doi.org/10.3389/fninf.2011.00017
"""
def __init__(self, ontology_df=None):
if ontology_df is None:
cogat = download_cognitive_atlas()
self.ontology_ = pd.read_csv(cogat["ids"])
else:
assert isinstance(ontology_df, pd.DataFrame)
self.ontology_ = ontology_df
assert "id" in self.ontology_.columns
assert "name" in self.ontology_.columns
assert "alias" in self.ontology_.columns
# Create regex dictionary
regex_dict = {}
for term in ontology_df["alias"].values:
term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
regex = "\\b" + term_for_regex + "\\b"
pattern = re.compile(regex, re.MULTILINE | re.IGNORECASE)
regex_dict[term] = pattern
self.regex_ = regex_dict
def transform(self, text, convert_uk=True):
"""
Replace terms in text with unique Cognitive Atlas identifiers.
Parameters
----------
text : :obj:`str`
Text to convert.
convert_uk : :obj:`bool`, optional
Convert British English words in text to American English versions.
Default is True.
Returns
-------
text : :obj:`str`
Text with Cognitive Atlas terms replaced with unique Cognitive
Atlas identifiers.
"""
if convert_uk:
text = uk_to_us(text)
for term_idx in self.ontology_.index:
term = self.ontology_["alias"].loc[term_idx]
term_id = self.ontology_["id"].loc[term_idx]
text = re.sub(self.regex_[term], term_id, text)
return text
@due.dcite(references.COGNITIVE_ATLAS, description="Introduces the Cognitive Atlas.")
def extract_cogat(text_df, id_df=None, text_column="abstract"):
"""
Extract Cognitive Atlas terms and count instances using regular
expressions.
Parameters
----------
text_df : (D x 2) :obj:`pandas.DataFrame`
Pandas dataframe with at least two columns: 'id' and the text.
D = document.
id_df : (T x 3) :obj:`pandas.DataFrame`
Cognitive Atlas ontology dataframe with at least three columns:
'id' (unique identifier for term), 'alias' (natural language expression
of term), and 'name' (preferred name of term; currently unused).
T = term.
text_column : :obj:`str`, optional
Name of column in text_df that contains text. Default is 'abstract'.
Returns
-------
counts_df : (D x T) :obj:`pandas.DataFrame`
Term counts for documents in the corpus.
rep_text_df : (D x 2) :obj:`pandas.DataFrame`
Text DataFrame with terms replaced with their CogAt IDs.
References
----------
* Poldrack, Russell A., et al. "The cognitive atlas: toward a
knowledge foundation for cognitive neuroscience." Frontiers in
neuroinformatics 5 (2011): 17. https://doi.org/10.3389/fninf.2011.00017
"""
text_df = text_df.copy()
if id_df is None:
cogat = download_cognitive_atlas()
id_df = pd.read_csv(cogat["ids"])
gazetteer = sorted(id_df["id"].unique().tolist())
if "id" in text_df.columns:
text_df.set_index("id", inplace=True)
text_df[text_column] = text_df[text_column].fillna("")
text_df[text_column] = text_df[text_column].apply(uk_to_us)
# Create regex dictionary
regex_dict = {}
for term in id_df["alias"].values:
term_for_regex = term.replace("(", r"\(").replace(")", r"\)")
regex = "\\b" + term_for_regex + "\\b"
pattern = re.compile(regex, re.MULTILINE | re.IGNORECASE)
regex_dict[term] = pattern
# Count
count_arr = np.zeros((text_df.shape[0], len(gazetteer)), int)
counts_df = pd.DataFrame(columns=gazetteer, index=text_df.index, data=count_arr)
for term_idx in id_df.index:
term = id_df["alias"].loc[term_idx]
term_id = id_df["id"].loc[term_idx]
pattern = regex_dict[term]
counts_df[term_id] += text_df[text_column].str.count(pattern).astype(int)
text_df[text_column] = text_df[text_column].str.replace(pattern, term_id)
return counts_df, text_df
def expand_counts(counts_df, rel_df=None, weights=None):
"""
Perform hierarchical expansion of counts across labels.
Parameters
----------
counts_df : (D x T) :obj:`pandas.DataFrame`
Term counts for a corpus. T = term, D = document.
rel_df : :obj:`pandas.DataFrame`
Long-form DataFrame of term-term relationships with at least three columns:
'input', 'output', and 'rel_type'.
weights : :obj:`dict`
Dictionary of weights per relationship type. E.g., {'isKind': 1}.
Unspecified relationship types default to 0.
Returns
-------
weighted_df : (D x T) :obj:`pandas.DataFrame`
Term counts for a corpus after hierarchical expansion.
"""
if rel_df is None:
cogat = download_cognitive_atlas()
rel_df = pd.read_csv(cogat["relationships"])
weights_df = utils._generate_weights(rel_df, weights=weights)
# First reorg counts_df so it has the same columns in the same order as
# weight_df
counts_columns = counts_df.columns.tolist()
weights_columns = weights_df.columns.tolist()
w_not_c = set(weights_columns) - set(counts_columns)
c_not_w = set(counts_columns) - set(weights_columns)
if c_not_w:
raise Exception(
"Columns found in counts but not weights: " "{0}".format(", ".join(c_not_w))
)
for col in w_not_c:
counts_df[col] = 0
counts_df = counts_df[weights_columns]
# Now matrix multiplication
counts = counts_df.values
weights = weights_df.values
weighted = np.dot(counts, weights)
weighted_df = pd.DataFrame(index=counts_df.index, columns=counts_df.columns, data=weighted)
return weighted_df