# Cranfield Collection Preprocessing

Author: Paul Sheridan

Goal: Apply some basic text analysis preprocessing steps to the Cranfield collection documents/queries and output the results in JSON format.

Input files (available for download at https://github.com/oussbenk/cranfield-trec-dataset):
    
    * cran.all.1400.xml
    * cran.qry.xml

Output files:

    * cran-docs-preprocessed.json
    * cran-queries-preprocessed.json

## Load modules

In [27]:
import re, string, unicodedata, xml
import nltk
import contractions
import inflect
import json
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

## Define custom NLP functions

In [10]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list.

    Each token is NFKD-normalized before being encoded to ASCII with
    errors ignored, so any character that has no ASCII form is dropped.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with one ASCII-only string per input token. A token made
        up entirely of non-ASCII characters becomes an empty string; it is
        not removed from the list.
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Lowercase every token in a list.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list holding the lowercased form of each token, in order.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed. Underscores count as word characters and are
    therefore kept.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list of the cleaned tokens, omitting any that became empty.
    """
    cleaned = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in cleaned if token != '']

def replace_numbers(words):
    """Spell out every all-digit token using its textual representation.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list in which each token for which ``str.isdigit()`` is true
        has been replaced by the result of ``inflect``'s
        ``number_to_words``; all other tokens are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of string tokens. Matching is exact, so tokens
            should already be lowercased to match the NLTK stop word list.

    Returns:
        A new list containing only the tokens that are not in NLTK's
        English stop word list, in their original order.
    """
    # Load the stop word list once per call and keep it in a set: the
    # original re-evaluated stopwords.words('english') for every token and
    # then scanned the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Reduce each token to its Lancaster stem.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the ``LancasterStemmer`` stem of each token, in order.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize each token, treating it as a verb.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the ``WordNetLemmatizer`` lemma of each token,
        computed with ``pos='v'``, in order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def stem_and_lemmatize(words):
    """Produce both the stems and the verb lemmas of a token list.

    Both results are computed independently from the same input; the
    lemmas are not derived from the stems.

    Args:
        words: Iterable of string tokens.

    Returns:
        A ``(stems, lemmas)`` tuple of two lists.
    """
    return stem_words(words), lemmatize_verbs(words)

def normalize(words):
    """Apply the standard cleaning pipeline to a list of tokens.

    The steps run in this order: strip non-ASCII characters, lowercase,
    remove punctuation, spell out integers, remove English stop words.

    Args:
        words: List of string tokens.

    Returns:
        The list of tokens produced by the final pipeline step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

## Read Cranfield documents XML file and store its contents in a list

In [18]:
# Location of the Cranfield documents XML file; edit to point at the local copy.
cran_all_1400_xml_file_path = 'cran.all.1400.xml'
# Read the whole file inside a context manager so the handle is closed
# as soon as the contents are in memory (the original never closed it).
with open(cran_all_1400_xml_file_path, 'r') as infile:
    contents = infile.read()
soup = BeautifulSoup(contents, 'xml')
# Collect every <docno> and <text> element in document order; later cells
# pair them up by position.
doc_ids = soup.find_all('docno')
texts = soup.find_all('text')
full_texts = []
# Report how many <text> elements were found.
N = len(texts)
print('N = {0}'.format(N))

N = 1400


## Apply NLP preprocessing steps to Cranfield docs

In [21]:
# Build a mapping from integer document id to that document's list of
# normalized, verb-lemmatized tokens.
# The ids and texts are paired purely by position, so refuse to continue
# if the two lists differ in length rather than silently mispairing them.
if len(doc_ids) != len(texts):
    raise ValueError(
        'Found {0} <docno> elements but {1} <text> elements'.format(
            len(doc_ids), len(texts)))
doc_dict = {}
for doc_id_tag, text in zip(doc_ids, texts):
    doc_id = int(doc_id_tag.get_text())
    words = normalize(nltk.word_tokenize(text.get_text()))
    # Only the lemmas are stored, so call the lemmatizer directly instead of
    # also computing stems that would be thrown away.
    doc_dict[doc_id] = lemmatize_verbs(words)

## Write preprocessed Cranfield docs to JSON file

In [22]:
# Destination for the preprocessed documents; edit to the desired local path.
cran_all_1400_json_file_path = 'cran-docs-preprocessed.json'
# Serialize the id -> tokens mapping and write it out in one call.
# JSON object keys are strings, so the integer ids are written as strings.
with open(cran_all_1400_json_file_path, 'w') as outfile:
    outfile.write(json.dumps(doc_dict))

## Preprocess Cranfield queries

In [26]:
# Location of the Cranfield queries XML file; edit to point at the local copy.
cran_qry_xml_file_path = 'cran.qry.xml'
# Read the whole file inside a context manager so the handle is closed
# as soon as the contents are in memory (the original never closed it).
with open(cran_qry_xml_file_path, 'r') as infile:
    contents = infile.read()
soup = BeautifulSoup(contents, 'xml')
# Collect every <num> (query id) and <title> (query text) element in
# document order; they are paired by position below.
query_ids = soup.find_all('num')
queries = soup.find_all('title')
N = len(queries)
print('N = {0}'.format(N))

# Build a mapping from integer query id to that query's list of normalized,
# verb-lemmatized tokens. Because ids and titles are paired by position,
# stop if the two lists differ in length rather than mispairing them.
if len(query_ids) != len(queries):
    raise ValueError(
        'Found {0} <num> elements but {1} <title> elements'.format(
            len(query_ids), len(queries)))
query_dict = {}
for query_id_tag, query in zip(query_ids, queries):
    query_id = int(query_id_tag.get_text())
    words = normalize(nltk.word_tokenize(query.get_text()))
    # Only the lemmas are stored, so call the lemmatizer directly instead of
    # also computing stems that would be thrown away.
    query_dict[query_id] = lemmatize_verbs(words)

# Destination for the preprocessed queries; edit to the desired local path.
cran_qry_json_file_path = 'cran-queries-preprocessed.json'
with open(cran_qry_json_file_path, 'w') as outfile:
    json.dump(query_dict, outfile)

N = 225
