This Jupyter notebook contains code to reproduce the NYSK dataset preprocessing steps described in "A hypergeometric test interpretation of a common tf-idf variant" by Paul Sheridan and Mikael Onsjö. It is assumed that the file "nysk.xml" is present in the directory of execution. The file is made available for download by the UCI Machine Learning Repository at https://archive.ics.uci.edu/ml/datasets/NYSK. The code borrows heavily from the "Text Data Preprocessing: A Walkthrough in Python" tutorial <https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html>.

Author: Paul Sheridan
Email: paul.sheridan.stats@gmail.com

In [None]:
import json
import os
import re, string, unicodedata

import contractions
import inflect
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [None]:
# Read the whole corpus file into memory. The `with` block guarantees the
# file handle is closed as soon as the contents have been read.
with open("nysk.xml", "r") as infile:
    contents = infile.read()

# Parse the document as XML and collect every <title> and <text> element.
soup = BeautifulSoup(contents, 'xml')
titles = soup.find_all('title')
texts = soup.find_all('text')

In [None]:
# Peek at the first <title> element found in the parsed document.
title = titles[0]
print(title.get_text())

In [13]:
# Peek at the first <text> element. The name `text` is read again by the
# tokenization cell that follows.
text = texts[0]
print(text.get_text())

The New York Post has learned that the woman accusing IMF boss Dominique Strauss-Kahn of sexual assault lived in an apartment exclusively for patients with HIV and AIDS. From the Post: The hotel maid, a West African immigrant, has occupied the fourth-floor High Bridge pad with her 15-year-old daughter since January -- and before that, lived in another Bronx apartment set aside by Harlem Community AIDS United strictly for adults with the virus and their families. The paper was unable to confirm if the accuser has HIV or AIDS because of medical confidentiality laws, but the Post confirmed that the agency rents apartments only for adults with the disease. A Harlem United employee said at least one adult in the apartment must be HIV-positive or have AIDS to qualify to live in one of the residences. Sources told the Post that only the alleged victim and her child lived in the apartment. Strauss-Kahn is accused of forcing the woman to perform oral sex on him. She told police that after the f

In [28]:
# Tokenize the currently selected `text` element and display how many raw
# tokens it yields (before any normalization).
sample = text.get_text()
words = nltk.word_tokenize(sample)
len(words)

397

In [22]:
def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalized first, then encoded to ASCII with
    ``errors='ignore'`` so that anything outside the ASCII range is dropped.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list holding one (possibly empty) string per input token.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Lowercase every token.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with ``.lower()`` applied to each token, in order.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    A character is removed when it is neither a word character (``\\w``)
    nor whitespace (``\\s``).

    Args:
        words: iterable of str tokens.

    Returns:
        A new list of the cleaned tokens, omitting any that became ``''``.
    """
    cleaned = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in cleaned if token != '']

def replace_numbers(words):
    """Spell out digit-only tokens as words.

    A token is converted with ``inflect``'s ``number_to_words`` when
    ``str.isdigit()`` is true for it; every other token is passed through
    unchanged.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list of tokens with digit-only entries replaced by their
        textual representation.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    The stop-word list is loaded from the NLTK corpus once per call and held
    in a set, instead of being re-read and linearly scanned for every token.

    Args:
        words: iterable of str tokens. Matching is exact, so tokens are
            compared as given (no case folding is done here).

    Returns:
        A new list containing the tokens that are not in NLTK's
        ``'english'`` stop-word list, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Reduce each token to its Lancaster stem.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with ``LancasterStemmer().stem`` applied to each token.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize each token, treating it as a verb.

    Args:
        words: iterable of str tokens.

    Returns:
        A new list with ``WordNetLemmatizer().lemmatize(token, pos='v')``
        applied to each token.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token-cleaning pipeline over a list of tokenized words.

    The steps are applied in this fixed order, each one consuming the
    previous step's output: strip non-ASCII characters, lowercase, remove
    punctuation, spell out digit-only tokens, remove English stop words.

    Args:
        words: list of str tokens.

    Returns:
        The list of tokens produced by the final step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [30]:
# Normalize the sample tokens, then show the surviving token count followed
# by the tokens themselves (one per output line, as two prints would give).
words = normalize(words)
print(len(words), words, sep='\n')

219
['new', 'york', 'post', 'learned', 'woman', 'accusing', 'imf', 'boss', 'dominique', 'strausskahn', 'sexual', 'assault', 'lived', 'apartment', 'exclusively', 'patients', 'hiv', 'aids', 'post', 'hotel', 'maid', 'west', 'african', 'immigrant', 'occupied', 'fourthfloor', 'high', 'bridge', 'pad', '15yearold', 'daughter', 'since', 'january', 'lived', 'another', 'bronx', 'apartment', 'set', 'aside', 'harlem', 'community', 'aids', 'united', 'strictly', 'adults', 'virus', 'families', 'paper', 'unable', 'confirm', 'accuser', 'hiv', 'aids', 'medical', 'confidentiality', 'laws', 'post', 'confirmed', 'agency', 'rents', 'apartments', 'adults', 'disease', 'harlem', 'united', 'employee', 'said', 'least', 'one', 'adult', 'apartment', 'must', 'hivpositive', 'aids', 'qualify', 'live', 'one', 'residences', 'sources', 'told', 'post', 'alleged', 'victim', 'child', 'lived', 'apartment', 'strausskahn', 'accused', 'forcing', 'woman', 'perform', 'oral', 'sex', 'told', 'police', 'forced', 'act', 'spit', 'sem

In [37]:
def stem_and_lemmatize(words):
    """Compute both the stems and the verb lemmas of the same tokens.

    Args:
        words: list of str tokens.

    Returns:
        A ``(stems, lemmas)`` tuple: the result of ``stem_words(words)``
        and the result of ``lemmatize_verbs(words)``.
    """
    return stem_words(words), lemmatize_verbs(words)

# Apply both reductions to the current `words` list and print each result
# together with its length so the two can be compared.
stems, lemmas = stem_and_lemmatize(words)
print('Stemmed (', len(stems), '):\n', stems)
print('\nLemmatized (', len(lemmas), '):\n', lemmas)

Stemmed ( 219 ):
 ['new', 'york', 'post', 'learn', 'wom', 'accus', 'imf', 'boss', 'domin', 'strausskahn', 'sex', 'assault', 'liv', 'apart', 'exclud', 'paty', 'hiv', 'aid', 'post', 'hotel', 'maid', 'west', 'afr', 'immigr', 'occupy', 'fourthflo', 'high', 'bridg', 'pad', '15yearold', 'daught', 'sint', 'janu', 'liv', 'anoth', 'bronx', 'apart', 'set', 'asid', 'harlem', 'commun', 'aid', 'unit', 'strictly', 'adult', 'vir', 'famy', 'pap', 'un', 'confirm', 'accus', 'hiv', 'aid', 'med', 'confid', 'law', 'post', 'confirm', 'ag', 'rent', 'apart', 'adult', 'diseas', 'harlem', 'unit', 'employ', 'said', 'least', 'on', 'adult', 'apart', 'must', 'hivposit', 'aid', 'qual', 'liv', 'on', 'resid', 'sourc', 'told', 'post', 'alleg', 'victim', 'child', 'liv', 'apart', 'strausskahn', 'accus', 'forc', 'wom', 'perform', 'or', 'sex', 'told', 'pol', 'forc', 'act', 'spit', 'sem', 'onto', 'flo', 'accord', 'fed', 'cent', 'diseas', 'control', 'poss', 'eith', 'partn', 'becom', 'infect', 'hiv', 'perform', 'receiv', 'or'

In [38]:
# Display the container type of `lemmas` (it is passed to json.dumps in the
# next cell).
type(lemmas)

list

In [43]:
# Serialize the sample lemmas to JSON text once, show it, and write that text
# to disk verbatim. `A` is already JSON, so it must not be passed through
# json.dumps a second time: doing so would store a JSON *string literal*
# wrapping the array rather than the array itself.
A = json.dumps(lemmas)
print(A)
with open('list.json', 'w') as file:
    file.write(A)

["new", "york", "post", "learn", "woman", "accuse", "imf", "boss", "dominique", "strausskahn", "sexual", "assault", "live", "apartment", "exclusively", "patients", "hiv", "aid", "post", "hotel", "maid", "west", "african", "immigrant", "occupy", "fourthfloor", "high", "bridge", "pad", "15yearold", "daughter", "since", "january", "live", "another", "bronx", "apartment", "set", "aside", "harlem", "community", "aid", "unite", "strictly", "adults", "virus", "families", "paper", "unable", "confirm", "accuser", "hiv", "aid", "medical", "confidentiality", "laws", "post", "confirm", "agency", "rent", "apartments", "adults", "disease", "harlem", "unite", "employee", "say", "least", "one", "adult", "apartment", "must", "hivpositive", "aid", "qualify", "live", "one", "residences", "source", "tell", "post", "allege", "victim", "child", "live", "apartment", "strausskahn", "accuse", "force", "woman", "perform", "oral", "sex", "tell", "police", "force", "act", "spit", "semen", "onto", "floor", "accord

In [51]:
# Build the processed corpus: for every <text> element, tokenize, normalize,
# and keep the verb-lemmatized tokens.
mylist = []
for text in texts:
    sample = text.get_text()
    words = nltk.word_tokenize(sample)
    words = normalize(words)
    # Only the lemmas are stored, so call lemmatize_verbs directly rather
    # than stem_and_lemmatize, whose stems were computed and then discarded.
    lemmas = lemmatize_verbs(words)
    mylist.append(lemmas)

# Persist the result as a JSON array with one token list per document.
with open('nysk-processed.json', 'w') as outfile:
    json.dump(mylist, outfile)

In [52]:
# Number of token lists collected in `mylist` (one is appended per <text> element).
print(len(mylist))

10421
