-
Notifications
You must be signed in to change notification settings - Fork 0
/
LemmatizerSpacyGerman.py
40 lines (34 loc) · 1.57 KB
/
LemmatizerSpacyGerman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from Storage import Storage
from SessionLogger import SessionLogger
import spacy
# Name of the spaCy pipeline package passed to spacy.load below.
model_name = 'de_core_news_md'
# Pipeline loaded once at import time; LemmatizerSpacyGerman.process_text
# calls it to tokenize and lemmatize text.
model = spacy.load(model_name)
class LemmatizerSpacyGerman:
    """Lemmatize text with the module-level spaCy pipeline.

    The class is a plain namespace: both operations are static methods and
    no instance state is used.
    """

    # Default name of the column holding the text to lemmatize.
    col_name = 'text'
    # Name of the column that receives the lemmatized text.
    new_col_name = 'lemmatized'

    @staticmethod
    def process_text(text: str, nlp=None):
        """Return ``text`` with every token replaced by its lemma.

        Args:
            text: The string to lemmatize.
            nlp: Optional callable that maps a string to an iterable of
                tokens exposing a ``lemma_`` attribute. When omitted, the
                module-level ``model`` is used.

        Returns:
            The token lemmas joined by single spaces, or an empty string
            when the pipeline yields no tokens.
        """
        if nlp is None:
            nlp = model
        # A single join avoids repeated string concatenation and the
        # trailing-space cleanup that concatenation requires.
        return ' '.join(token.lemma_ for token in nlp(text))

    @staticmethod
    def normalize(data_frame, col_name=col_name, storage_level=0, storage_name='', log=1, nlp=None):
        """Add a column with the lemmatized text of ``col_name``.

        The input frame is not modified; a copy is extended and returned.

        Args:
            data_frame: pandas data frame containing the text column.
            col_name: Name of the column whose text is lemmatized.
            storage_level: The result is stored via
                ``Storage.store_pd_frame`` only when this is >= 1 and
                ``storage_name`` is not empty.
            storage_name: Name under which the result is stored.
            log: When truthy, a summary line is sent to
                ``SessionLogger.log``.
            nlp: Optional pipeline forwarded to ``process_text``; defaults
                to the module-level ``model``.

        Returns:
            A copy of ``data_frame`` with an additional column named
            ``LemmatizerSpacyGerman.new_col_name``.
        """
        df = data_frame.copy()
        target = LemmatizerSpacyGerman.new_col_name

        # Apply on the column itself rather than row-wise over the frame:
        # it avoids building a Series per row and always yields a Series,
        # so the assignment also works for a frame without rows.
        df[target] = df[col_name].apply(
            lambda text: LemmatizerSpacyGerman.process_text(text, nlp)
        )

        log_text = 'Documents lemmatized with spacy (' + str(len(df.index)) + ' entries).'
        if storage_level >= 1 and storage_name != '':
            Storage.store_pd_frame(df, storage_name)
            log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + target + '\').'
        if log:
            SessionLogger.log(log_text)
        return df