Preprocess the data

In [None]:
import re
import nltk
import xlsxwriter
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

# Fetch the NLTK resources this notebook asks for. 'stopwords' backs `en_stop`
# below; the remaining packages are downloaded in the same order as before.
for _resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(_resource)

# English stop words as a set, so membership tests are cheap.
en_stop = set(stopwords.words('english'))

def preprocess_text(document):
    """
    Normalise a piece of text into a space-separated string of lemmatized tokens.

    Processing steps, in order:

    1. every non-word character (punctuation, symbols) becomes a space;
    2. isolated single letters, including one at the very start, are removed;
    3. runs of whitespace are collapsed and a leading standalone 'b' is dropped;
    4. the text is lower-cased;
    5. each token is lemmatized with the module-level ``stemmer``;
    6. tokens found in ``en_stop`` or with three characters or fewer are
       discarded.

    Digits and underscores are regex word characters, so they are NOT removed
    by step 1; a number survives whenever it is longer than three characters.

    Adapted from (see there for more detailed documentation):
    https://stackabuse.com/python-for-nlp-working-with-facebook-fasttext-library/

    Parameters
    ----------
    document : object
        The text to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as "no text".

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or the input is missing.
    """
    # A missing value would otherwise be stringified to the literal words
    # "None" / "nan" and treated as text, so short-circuit to an empty result.
    if document is None or (isinstance(document, float) and document != document):
        return ''

    # Replace all the special (non-word) characters with a space
    document = re.sub(r'\W', ' ', str(document))

    # Remove single letters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the start of the text. The caret must be an
    # unescaped anchor here; an escaped one only matches a literal '^'.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple whitespace characters with a single space
    document = re.sub(r'\s+', ' ', document)

    # Remove a leading standalone 'b' (presumably the prefix left behind by a
    # stringified bytes literal)
    document = re.sub(r'^b\s+', '', document)

    # Convert to lowercase
    document = document.lower()

    # Lemmatize first, then filter: stop words and tokens of length <= 3 are
    # dropped based on the lemmatized form.
    lemmas = (stemmer.lemmatize(word) for word in document.split())
    tokens = [
        lemma for lemma in lemmas
        if lemma not in en_stop and len(lemma) > 3
    ]

    return ' '.join(tokens)

In [None]:
# Input: the Excel file holding the raw, unprocessed records.
raw_data = "database100.xlsx"

# Output: the workbook that receives the preprocessed records; worksheets are
# added to it later in the notebook, one per topic.
workbook = xlsxwriter.Workbook(
    "database100_preprocessed.xlsx"
)

# Topic abbreviation -> full topic name. Insertion order is the order in which
# the topics are iterated.
# NOTE(review): "Geritourinary" looks like a misspelling of "Genitourinary";
# left unchanged here in case the value is matched elsewhere - confirm and fix.
topics = dict(
    mk="Musculoskeletal Radiology",
    ct="Computed Tomography",
    br="Breast Imaging",
    gu="Geritourinary Radiology",
    us="Ultrasound",
    ch="Chest Radiology",
    ir="Interventional Radiology",
)

In [None]:
# Preprocess the data: for every topic, read the sheet of that name from the
# raw workbook and write a cleaned copy to a same-named sheet in the output
# workbook. Titles and abstracts go through preprocess_text(); authors are
# copied unchanged.
try:
    for topic in topics:
        # open the matching sheet from the raw data
        sheet = pd.read_excel(raw_data, sheet_name=topic)

        worksheet = workbook.add_worksheet(topic)

        # header row
        for col, header in enumerate(('title', 'authors', 'abstract')):
            worksheet.write(0, col, header)

        # Write row by row; data rows start at 1, below the header.
        records = zip(sheet['title'], sheet['authors'], sheet['abstract'])
        for row, (title, author, abstract) in enumerate(records, start=1):
            worksheet.write(row, 0, preprocess_text(title))

            # An empty author cell is read back as NaN, which xlsxwriter
            # refuses to write by default; leave the cell blank instead.
            if not pd.isna(author):
                worksheet.write(row, 1, author)

            worksheet.write(row, 2, preprocess_text(abstract))
finally:
    # Always close the workbook, even if a sheet fails, so the file handle is
    # not left open; any exception still propagates to the caller.
    workbook.close()
