## Custom preprocessing using NLTK

This is just to verify the quality of gensim preprocessing pipeline. I will use gensim because of its reproducability and not this custom preprocessing.

In [52]:
import nltk
import string   # for punctuation symbols.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')#, '../data/cache/nltk')
nltk.download('stopwords')#, '../data/cache/nltk')
nltk.download('punkt')

TEST_CORPUS = ['executive functions are cool!', 'cognitive control contains 9 sub-components!']

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def preprocess(doc):
    # 1. lowercase
    doc = doc.lower()
    # 2. remove punctuations (TODO: and numbers)
    doc = ''.join([c for c in doc if c not in string.punctuation])
    # 3. tokenize
    doc = nltk.word_tokenize(doc)
    # 4. remove stopwords
    doc = [w for w in doc if w not in stop_words]
    # 5. stem
    # doc = [stemmer.stem(w) for w in doc]
    # 6. lemmatize
    doc = [lemmatizer.lemmatize(w) for w in doc]
    return doc

# method 1
[preprocess(doc) for doc in TEST_CORPUS]

# method 2 (gensim)
from gensim.parsing import preprocess_documents
preprocess_documents(TEST_CORPUS)


[nltk_data] Downloading package wordnet to /Users/morteza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/morteza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/morteza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['execut', 'function', 'cool'],
 ['cognit', 'control', 'contain', 'sub', 'compon']]

In [None]:
## Example preprocessing

In [55]:
from gensim.parsing import preprocess_documents, preprocess_string
import pandas as pd

df = pd.read_csv('../data/pubmed/Anti Saccade.csv')
df['abstract'].fillna(df['title'], inplace=True)

df['preprocessed_abstract'] = df['abstract'].apply(lambda abstract: preprocess_string(abstract))
# %timeit df['prep'] = preprocess_documents(df['abstract'].to_list())

# Note: both methods preprocesses the corpus at the same speed (checked with %timeit)

# DEBUG data check: df['preprocessed_abstract'], df.loc[0,'preprocessed_abstract']

In [58]:
from pathlib import Path
files = Path('../data/pubmed').glob('*.csv')

for f in files:
    df = pd.read_csv(f)
    print(f.stem, df['abstract'].isna().sum())

Letter Fluency 0
Cognitive Control 87
Auditory Attention 6
Reverse Categorization 0
Gift Wrap 0
Grass Snow 0
Verbal Fluency 16
Random Letter Generation 0
Digit Span 4
Simon Says 1
LNS 2
Revised Attention Network Test 0
Automated Operation Span 0
Complex Span 1
Incompatibility Test 0
SR Compatibility 0
Delay Discounting Task 1
Number Letter 0
Trail Making Task 0
Semantic Fluency 3
Flexible Item Selection 0
Attention Network Test 0
Head-Toes-Knees-Shoulders 0
Spin the Pots 0
Majority Function 0
Balance Beam 2
Simon 188
MONITOR 62
Delay Choice 0
Counting Span 0
Sentence Completion 1
Letter Number Sequencing 0
Iowa Gambling Task 4
Block Span 0
Modified Card Sorting Test 0
Running Span 0
Odd One Out 3
Reversal Learning 9
Self Control Schedule 0
Category Switch 0
BART 40
Color Shape 0
Random Number Generation 2
Corsi Block Span 0
Executive Function 79
MOT 5
Stroop 24
Category Fluency 3
OSpan 0
N-back 2
Multiple Object Tracking 1
Wisconsin Card Sort 0
Dragon 98
Delayed Alternation 1
Sorting 9