In [20]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from os import path
import json


In [21]:
# call nltk.help.upenn_tagset() to view all pos-tags
def translate_pos_tags(tag: str):
    """Map a Penn Treebank POS tag to a single-letter part-of-speech code.

    Tags beginning with ``NN``, ``VB``, ``JJ`` or ``RB`` map to ``'n'``,
    ``'v'``, ``'a'`` and ``'r'`` respectively. Every other tag falls back
    to ``'n'``.
    """
    prefix_to_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return 'n'

def get_pos_tags(tokens):
    """Tag ``tokens`` with ``pos_tag`` and return ``(word, pos)`` pairs.

    Each tag produced by ``pos_tag`` is converted to its single-letter code
    with ``translate_pos_tags``.
    """
    pairs = []
    for word, tag in pos_tag(tokens):
        pairs.append((word, translate_pos_tags(tag)))
    return pairs


In [22]:
def tokenize(text, language='english'):
    """Split ``text`` into alphanumeric tokens with stopwords removed.

    Args:
        text: The raw string to tokenize.
        language: Name of the NLTK stopword list to filter against.

    Returns:
        A list of tokens in their original casing. Tokens that are not
        purely alphanumeric (punctuation, hyphenated words, ...) and tokens
        that are stopwords, regardless of case, are dropped.
    """
    stop = set(stopwords.words(language))
    return [
        token
        for token in word_tokenize(text)
        # The stopword list is lowercase, so compare case-insensitively;
        # otherwise a sentence-initial "The" would be kept.
        if token.isalnum() and token.lower() not in stop
    ]


In [23]:
def stem(tokens, language='english'):
    """Return the Snowball stem of each token, preserving order.

    ``language`` selects which ``SnowballStemmer`` is built.
    """
    snowball = SnowballStemmer(language)
    return list(map(snowball.stem, tokens))


In [24]:
def lemmatize(tokens):
    """Lemmatize each token using the POS code from ``get_pos_tags``.

    Returns a list of lemmas in the same order as ``tokens``.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word, pos in get_pos_tags(tokens):
        lemmas.append(wordnet.lemmatize(word, pos))
    return lemmas


In [33]:
def normalize(text, language='english'):
    """Tokenize, lemmatize and stopword-filter ``text``.

    Pipeline: ``word_tokenize`` -> keep alphanumeric tokens -> ``lemmatize``
    -> drop stopwords.

    Args:
        text: The raw string to normalize.
        language: Name of the NLTK stopword list to filter against.

    Returns:
        A list of lemmas in their original casing, with stopwords removed
        regardless of case.
    """
    words = [token for token in word_tokenize(text) if token.isalnum()]
    # Stopwords are removed only after lemmatization, so the tagger still
    # sees them as context.
    lemmas = lemmatize(words)
    stop = set(stopwords.words(language))
    # The stopword list is lowercase; compare case-insensitively so that
    # capitalised stopwords (e.g. a sentence-initial "The") are removed too.
    return [lemma for lemma in lemmas if lemma.lower() not in stop]


['lemmatization', 'process', 'turn', 'word', 'thesaurus', 'representation']


In [11]:
def tokenizeFile(targetFile, language='english'):
    """Normalize the ``content`` field of every record in a JSON Lines file.

    Reads ``targetFile`` line by line, replaces each record's ``content``
    with its normalized tokens joined by single spaces, and writes the
    records, one JSON object per line, to ``tokenized.<basename>``. The
    destination is a bare file name, so it resolves relative to the current
    working directory.

    Args:
        targetFile: Path to a UTF-8 file with one JSON object per line; each
            object must have a ``content`` key.
        language: Stopword language forwarded to ``normalize``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``content`` key.
    """
    destinationFileName = "tokenized." + path.basename(targetFile)
    with open(targetFile, encoding='utf-8') as fin, \
            open(destinationFileName, 'w', encoding='utf-8', newline='') as fout:
        for row in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not row.strip():
                continue
            doc = json.loads(row)
            # Forward the caller's language instead of silently using the default.
            doc['content'] = " ".join(normalize(doc['content'], language))
            print(json.dumps(doc), file=fout)


In [12]:
# source = '..\\..\\datasets\\raw.dump.jsonl'
# tokenizeFile(source)