In [43]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from os import path


In [44]:
def tokenize(text, language='english'):
    """Split ``text`` into word tokens, dropping stop words and non-alphanumeric tokens.

    Args:
        text: The string to tokenize.
        language: Name of the NLTK stop-word list to filter against. It is
            only used for the stop-word lookup; ``word_tokenize`` is called
            with its own default settings.

    Returns:
        A list of the tokens produced by ``word_tokenize``, in their original
        order and casing, keeping only those for which ``str.isalnum()`` is
        true and whose lower-cased form is not in the stop-word list.
    """
    stop_words = set(stopwords.words(language))
    # NLTK's stop-word lists hold lower-case entries, so the membership test
    # must be done on the lower-cased token; otherwise capitalised stop words
    # (e.g. a sentence-initial "The") are never filtered out.
    return [
        token
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop_words
    ]

In [45]:
def stem(tokens, language='english'):
    """Reduce every token to its Snowball stem.

    Args:
        tokens: Iterable of string tokens.
        language: Language name handed to ``SnowballStemmer``.

    Returns:
        A new list with one stem per input token, in input order.
    """
    snowball = SnowballStemmer(language)
    return list(map(snowball.stem, tokens))

In [46]:
def lemmatize(tokens):
    """Map every token to its WordNet lemma.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` on its own, so the
    lemmatizer's default part-of-speech setting applies.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with one lemma per input token, in input order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

In [47]:
def transformLines(targetFile, destinationFile, transform, encoding=None):
    """Apply ``transform`` to every line of one file and write the results to another.

    Args:
        targetFile: Path of the file to read.
        destinationFile: Path of the file to write; it is opened in ``'w'``
            mode, so existing content is replaced.
        transform: Callable invoked once per input line. The line is passed
            exactly as read, i.e. including its trailing newline if present.
        encoding: Text encoding used for BOTH files. ``None`` (the default)
            keeps the previous behaviour of using the platform default, which
            differs between operating systems; pass e.g. ``'utf-8'`` to make
            the result reproducible.

    Returns:
        None. One output line is written per input line, consisting of
        ``str(transform(line))`` followed by a newline.
    """
    with open(targetFile, encoding=encoding) as fin, \
            open(destinationFile, 'w', encoding=encoding) as fout:
        for line in fin:
            # print() converts the result with str() and appends the newline.
            print(transform(line), file=fout)

In [48]:
def tokenizeAndLemmatize(text):
    """Tokenize ``text`` with ``tokenize`` and lemmatize the resulting tokens.

    ``tokenize`` is called with its default language.

    Args:
        text: The string to process.

    Returns:
        The list returned by ``lemmatize`` for the tokens of ``text``.
    """
    return lemmatize(tokenize(text))

In [49]:
def tokenizeAndLemmatizeFile(targetFile, destinationFile):
    """Run ``tokenizeAndLemmatize`` over every line of a file.

    Args:
        targetFile: Path of the file to read.
        destinationFile: Path of the file that receives one result per line,
            as written by ``transformLines``.
    """
    perLine = tokenizeAndLemmatize
    transformLines(targetFile, destinationFile, transform=perLine)

In [50]:
def tokenizeFile(targetFile, language='english', idSeparator = ":", encoding=None):
    """Tokenize and lemmatize the text part of every ``<id><sep><text>`` line of a file.

    The result is written to a file named ``"tokenized." + basename(targetFile)``
    in the current working directory (not next to ``targetFile``).

    Args:
        targetFile: Path of the input file. Every line must contain
            ``idSeparator`` at least once.
        language: Stop-word language forwarded to ``tokenize``.
        idSeparator: String separating the record id from the text. Only the
            first occurrence on a line is treated as the separator, so the
            text itself may contain it.
        encoding: Text encoding used for both files. ``None`` (the default)
            keeps the previous behaviour of using the platform default.

    Returns:
        None. Each output line is the id, the separator, and the lemmatized
        tokens joined by single spaces.

    Raises:
        ValueError: If a line does not contain ``idSeparator``.
    """
    destinationFileName = "tokenized." + path.basename(targetFile)
    with open(targetFile, encoding=encoding) as fin, \
            open(destinationFileName, 'w', encoding=encoding) as fout:
        for lineNumber, line in enumerate(fin, start=1):
            # Split on the first separator only: a plain split() unpacked into
            # two names fails as soon as the text contains the separator too.
            recordId, foundSeparator, text = line.partition(idSeparator)
            if not foundSeparator:
                raise ValueError(
                    "line %d of %s does not contain the id separator %r"
                    % (lineNumber, targetFile, idSeparator)
                )
            lemmas = lemmatize(tokenize(text, language))
            print(recordId + idSeparator + " ".join(lemmas), file=fout)


In [51]:
# Build the relative path with os.path.join instead of hard-coded backslashes,
# so the cell also runs on platforms whose path separator is not "\".
tokenizeFile(path.join('..', '..', 'small.dump.txt'))
# Disabled vocabulary-size check.
# NOTE(review): tokenizeFile writes 'tokenized.small.dump.txt' into the working
# directory; 'dump.tokenized.txt' below does not match that name - confirm which
# file is meant before re-enabling.
# with open ('dump.tokenized.txt') as fin:
#     lines = fin.read()
#     count = Counter(lines.split())
#     print(len(count))
