"""
Text extraction tools.
"""
import re
import logging
import os.path as op

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from ..dataset import Dataset
from ..utils import get_resource_path

LGR = logging.getLogger(__name__)

SPELL_DF = pd.read_csv(op.join(get_resource_path(), 'english_spellings.csv'),
                       index_col='UK')
SPELL_DICT = SPELL_DF['US'].to_dict()


def download_abstracts(dataset, email):
"""
Download the abstracts for a list of PubMed IDs. Uses the BioPython
package.
Parameters
----------
dataset : :obj:`nimare.dataset.Dataset` or :obj:`list` of :obj:`str`
A Dataset object where IDs are in the form PMID-EXPID or a list of
PubMed IDs
email : :obj:`str`
Email address to use to call the PubMed API
Returns
-------
dataset : :obj:`nimare.dataset.Dataset` or :obj:`list` of :obj:`str`
Dataset with abstracts added.
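
    Examples
    --------
    A usage sketch, not part of the original module: the PubMed IDs and the
    email address below are arbitrary placeholders, and the call requires
    network access to the PubMed API.

    >>> df = download_abstracts(['21908261', '23400352'],
    ...                         'user@example.com')  # doctest: +SKIP
    >>> df.head()  # doctest: +SKIP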
"""
try:
from Bio import Entrez, Medline
    except ImportError:
        raise ImportError(
            'The biopython package is required to download abstracts from '
            'PubMed.')
Entrez.email = email
if isinstance(dataset, Dataset):
pmids = dataset.coordinates['id'].astype(str).tolist()
pmids = [pmid.split('-')[0] for pmid in pmids]
pmids = sorted(list(set(pmids)))
elif isinstance(dataset, list):
pmids = [str(pmid) for pmid in dataset]
    else:
        raise ValueError(
            'Dataset type not recognized: {0}'.format(type(dataset)))
records = []
    # PubMed only allows requests for ~1000 records at a time, so fetch the
    # records in chunks of 900 to be safe.
chunks = [pmids[x: x + 900] for x in range(0, len(pmids), 900)]
for i, chunk in enumerate(chunks):
LGR.info('Downloading chunk {0} of {1}'.format(i + 1, len(chunks)))
h = Entrez.efetch(db='pubmed', id=chunk, rettype='medline',
retmode='text')
records += list(Medline.parse(h))
    # Pull data for studies with abstracts
    data = [[study['PMID'], study['AB']]
            for study in records if study.get('AB', None)]
    df = pd.DataFrame(columns=['id', 'text'], data=data)

    # If a list of PubMed IDs was provided, there is no Dataset to update,
    # so return the DataFrame of abstracts directly.
    if not isinstance(dataset, Dataset):
        return df

    # Attach each study's abstract to all of its contrasts.
    for pmid in dataset.data.keys():
        if pmid in df['id'].tolist():
            abstract = df.loc[df['id'] == pmid, 'text'].values[0]
        else:
            abstract = ""
        for expid in dataset.data[pmid]['contrasts'].keys():
            if 'texts' not in dataset.data[pmid]['contrasts'][expid].keys():
                dataset.data[pmid]['contrasts'][expid]['texts'] = {}
            dataset.data[pmid]['contrasts'][expid]['texts']['abstract'] = abstract
    dataset._load_texts()
    return dataset


def generate_counts(text_df, text_column='abstract', tfidf=True):
    """
    Generate tf-idf weights or raw counts for unigrams/bigrams derived from
    textual data.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with at least two columns: 'id' and the column named by
        ``text_column``. D = document.
    text_column : :obj:`str`, optional
        Name of the column in ``text_df`` containing the text to vectorize.
        Default is 'abstract'.
    tfidf : :obj:`bool`, optional
        If True, return tf-idf weights. If False, return raw term counts.
        Default is True.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is 'id' and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
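
    Examples
    --------
    A usage sketch, not part of the original module. Because ``min_df=50`` is
    hard-coded below, the corpus needs enough documents for each retained term
    to appear in at least 50 of them; ``dset`` is assumed to be a Dataset
    whose abstracts have already been downloaded (e.g., with
    :func:`download_abstracts`) and whose ``texts`` attribute holds an 'id'
    column and an 'abstract' column.

    >>> weights_df = generate_counts(dset.texts, text_column='abstract',
    ...                              tfidf=True)  # doctest: +SKIP
    >>> weights_df.shape  # doctest: +SKIP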
"""
if text_column not in text_df.columns:
raise ValueError('Column "{0}" not found in DataFrame'.format(text_column))
# Remove rows with empty text cells
orig_ids = text_df['id'].tolist()
text_df = text_df.fillna('')
keep_ids = text_df.loc[text_df[text_column] != '', 'id']
text_df = text_df.loc[text_df['id'].isin(keep_ids)]
if len(keep_ids) != len(orig_ids):
LGR.info('Retaining {0}/{1} studies'.format(len(keep_ids),
len(orig_ids)))
ids = text_df['id'].tolist()
text = text_df[text_column].tolist()
stoplist = op.join(get_resource_path(), 'neurosynth_stoplist.txt')
with open(stoplist, 'r') as fo:
stop_words = fo.read().splitlines()
if tfidf:
vectorizer = TfidfVectorizer(min_df=50, max_df=0.5,
ngram_range=(1, 2), vocabulary=None,
stop_words=stop_words)
else:
vectorizer = CountVectorizer(min_df=50, max_df=0.5,
ngram_range=(1, 2), vocabulary=None,
stop_words=stop_words)
weights = vectorizer.fit_transform(text).toarray()
names = vectorizer.get_feature_names()
names = [str(name) for name in names]
weights_df = pd.DataFrame(weights, columns=names, index=ids)
weights_df.index.name = 'id'
return weights_df


def generate_cooccurrence(text_df, text_column='abstract', vocabulary=None,
                          window=5):
    """
    Build a co-occurrence matrix from documents.

    This is not the same approach as used by the GloVe model.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with at least two columns: 'id' and the column named by
        ``text_column``. D = document.
    text_column : :obj:`str`, optional
        Name of the column in ``text_df`` containing the text to process.
        Default is 'abstract'.
    vocabulary : :obj:`list`, optional
        List of words in vocabulary to extract from text. If None, the
        vocabulary is built from all alphabetic tokens in the documents.
    window : :obj:`int`, optional
        Window size for co-occurrence. Words which appear within ``window``
        words of one another co-occur. Default is 5.

    Returns
    -------
    df : (D x V x V) :obj:`pandas.Panel`
        One co-occurrence matrix per document in ``text_df``. D = document.
        V = vocabulary term.
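
    Examples
    --------
    A small usage sketch, not part of the original module. It assumes the NLTK
    'punkt' tokenizer data are available (``nltk.download('punkt')``) and that
    the installed pandas version still provides ``pandas.Panel`` (removed in
    pandas 1.0). Indexing the returned Panel by a document ID gives that
    document's V x V co-occurrence matrix.

    >>> import pandas as pd
    >>> text_df = pd.DataFrame({
    ...     'id': ['pmid1', 'pmid2'],
    ...     'abstract': ['The cat sat on the mat.',
    ...                  'The dog sat near the cat.']})
    >>> cooc = generate_cooccurrence(text_df, vocabulary=['cat', 'dog', 'mat'],
    ...                              window=3)  # doctest: +SKIP
    >>> cooc['pmid1']  # doctest: +SKIP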
"""
if text_column not in text_df.columns:
raise ValueError('Column "{0}" not found in DataFrame'.format(text_column))
ids = text_df['id'].tolist()
text = text_df[text_column].tolist()
text = [nltk.word_tokenize(doc) for doc in text]
text = [[word.lower() for word in doc if word.isalpha()] for doc in text]
if vocabulary is None:
all_words = [word for doc in text for word in doc]
vocabulary = sorted(list(set(all_words)))
cooc_arr = np.zeros((len(text), len(vocabulary), len(vocabulary)))
for i, doc in enumerate(text):
for j, word1 in enumerate(vocabulary):
if word1 in doc:
idx1 = [jj for jj, x in enumerate(doc) if x == word1]
for k, word2 in enumerate(vocabulary):
if word2 in doc and k != j:
idx2 = [kk for kk, x in enumerate(doc) if x == word2]
distances = np.zeros((len(idx1), len(idx2)))
for m, idx1_ in enumerate(idx1):
for n, idx2_ in enumerate(idx2):
distances[m, n] = idx2_ - idx1_
cooc = np.sum(np.abs(distances) <= window)
cooc_arr[i, j, k] = cooc
df = pd.Panel(items=ids, major_axis=vocabulary, minor_axis=vocabulary,
data=cooc_arr)
return df


def uk_to_us(text):
    """
    Convert UK spellings to US spellings using a fixed conversion table.

    The table (english_spellings.csv) is from
    http://www.tysto.com/uk-us-spelling-list.html.

    Parameters
    ----------
    text : :obj:`str`
        Text to convert.

    Returns
    -------
    text : :obj:`str`
        Text with UK spellings replaced by their US equivalents.
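
    Examples
    --------
    An illustrative sketch, not part of the original module; it assumes the
    bundled spelling list includes the entries 'colour' and 'behaviour'.

    >>> uk_to_us('the colour of behaviour')  # doctest: +SKIP
    'the color of behavior'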
"""
if isinstance(text, str):
# Convert British to American English
pattern = re.compile(r'\b(' + '|'.join(SPELL_DICT.keys()) + r')\b')
text = pattern.sub(lambda x: SPELL_DICT[x.group()], text)
return text