In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from os import path
from csv import DictReader, DictWriter


In [16]:
def normalize(text, language = 'english'):
    """Tokenize ``text``, filter it, and lemmatize what remains.

    Args:
        text: Raw text to process.
        language: Name of the NLTK stop-word list used for filtering.

    Returns:
        A list of lemmatized tokens, in their original order, containing no
        stop words and no tokens with non-alphanumeric characters.
    """
    stop = set(stopwords.words(language))
    # NLTK's English stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" slip through the filter.
    tokens = [
        token
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop
    ]
    return lemmatize(tokens)

In [17]:
def tokenize(text, language='english'):
    """Split ``text`` into tokens, dropping stop words and non-alphanumeric tokens.

    Args:
        text: Raw text to tokenize.
        language: Name of the NLTK stop-word list used for filtering.

    Returns:
        A list of the surviving tokens in their original order and casing.
    """
    stop = set(stopwords.words(language))
    # NLTK's English stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" slip through the filter.
    return [
        token
        for token in word_tokenize(text)
        if token.isalnum() and token.lower() not in stop
    ]

In [18]:
def stem(tokens, language='english'):
    """Return the Snowball stem of every token in ``tokens``.

    Args:
        tokens: Iterable of word strings.
        language: Language name passed to ``SnowballStemmer``.

    Returns:
        A list with one stem per input token, in input order.
    """
    snowball = SnowballStemmer(language)
    stems = []
    for word in tokens:
        stems.append(snowball.stem(word))
    return stems

In [19]:
def lemmatize(tokens):
    """Lemmatize every token with a ``WordNetLemmatizer`` using its defaults.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with one lemma per input token, in input order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

In [28]:
def tokenizeFile(targetFile, language='english', idSeparator = ":"):
    """Write a copy of a ``;``-delimited CSV with its ``content`` column normalized.

    The output is written to ``tokenized.<basename of targetFile>`` in the
    current working directory. It starts with the input's header row; each
    data row carries the original ``id`` and the space-joined tokens returned
    by ``normalize`` for ``content``. Any other input columns are left empty.

    Args:
        targetFile: Path to the input CSV; it must have ``id`` and ``content``
            columns.
        language: Stop-word language forwarded to ``normalize``.
        idSeparator: Currently unused; kept so existing callers keep working.
    """
    targetFileName = path.basename(targetFile)
    destinationFileName = "tokenized." + targetFileName
    # newline='' on both handles lets the csv module manage line endings itself.
    with open(targetFile, newline='') as fin, open(destinationFileName, 'w', newline='') as fout:
        csv_reader = DictReader(fin, delimiter=';')
        if csv_reader.fieldnames is None:
            # Empty input: there is no header to copy, so leave the output empty.
            return

        csv_writer = DictWriter(fout, fieldnames = csv_reader.fieldnames, delimiter=";", quotechar='"')
        # Must be *called*: a bare attribute reference writes nothing.
        csv_writer.writeheader()

        for row in csv_reader:
            normalizedContent = " ".join(normalize(row['content'], language))
            csv_writer.writerow({'id': row['id'], 'content': normalizedContent})

In [29]:
# Plain relative path: a '.\\' prefix only resolves on Windows, whereas this
# names the same file in the working directory on every platform.
tokenizeFile('small.dump.csv')

# with open ('dump.tokenized.txt') as fin:
#     lines = fin.read()
#     count = Counter(lines.split())
#     print(len(count))
