In [None]:
import pandas as pd
import pickle
import re
import numpy as np

In [None]:
import nltk
from nltk import bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import MWETokenizer

In [None]:
# Fetch the NLTK data packages used further down in this notebook:
# "stopwords" backs stopwords.words('english') and "wordnet" backs WordNetLemmatizer.
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
def remove_phrases(text):
    """Strip copyright notices and fixed boilerplate phrases from a string.

    First removes every span that consists of an optional "Copyright", then
    " © ", then the shortest run of characters up to and including the next
    period. Afterwards the literal phrases " Todos los derechos reservados."
    and " -H.Clout" are removed wherever they occur.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The text with the matched spans and phrases deleted.
    """
    copyright_pattern = "(Copyright)?( © )(.+?)(\\.)"
    cleaned = re.sub(copyright_pattern, '', text)
    # Literal leftovers that the pattern above does not cover.
    for boilerplate in (" Todos los derechos reservados.", " -H.Clout"):
        cleaned = cleaned.replace(boilerplate, "")
    return cleaned

In [None]:
# English stop words from the NLTK corpus, kept in a set; `tokenize` drops any token found here.
stop_words = set(stopwords.words('english'))
# Token pattern handed to RegexpTokenizer in `tokenize`: a run of word characters
# and/or hyphens delimited by word boundaries.
regex = "(?u)\\b[\\w-]+\\b"

def tokenize(text):
    """Split a string into lower-cased tokens with stop words and numbers removed.

    The text is lower-cased and split with a RegexpTokenizer built from the
    module-level ``regex`` pattern. The token sequences ('u', 's', 'a'),
    ('u', 's') and ('b', 'b') are then merged into single tokens (joined with
    an empty separator). Finally, tokens found in the module-level
    ``stop_words`` set and tokens that begin with a digit are dropped.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    word_tokens = RegexpTokenizer(regex).tokenize(text.lower())
    merger = MWETokenizer([('u', 's', 'a'), ('u', 's'), ('b', 'b')], separator='')

    kept = []
    for token in merger.tokenize(word_tokens):
        if token in stop_words:
            continue
        # re.match anchors at the start, so this skips tokens whose first character is a digit.
        if re.match("[0-9]", token):
            continue
        kept.append(token)
    return kept

In [None]:
def lemmatize(text):
    """Lemmatize every token of a document with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

In [None]:
def remove_stopwords(text, to_ignore):
    """Drop every token that is contained in ``to_ignore``.

    Parameters
    ----------
    text : iterable
        The tokens of one document.
    to_ignore : container
        Anything supporting ``in``; tokens found in it are removed.

    Returns
    -------
    list
        The tokens that are not in ``to_ignore``, in their original order.
    """
    kept = []
    for token in text:
        if token in to_ignore:
            continue
        kept.append(token)
    return kept

In [None]:
def bgrams(text, lexicon):
    """Append to each document those of its adjacent-token bigrams that are in ``lexicon``.

    For every document, each pair of neighbouring tokens ``(i, j)`` is joined
    as ``i + " " + j``; the joined strings that occur in ``lexicon`` are
    appended to the end of that document, in order of appearance.

    Parameters
    ----------
    text : list of list of str
        The tokenized documents. Each inner list is extended IN PLACE.
    lexicon : iterable of str
        The accepted bigrams, written as "first second". It is materialised
        into a set once, so lookups are constant-time and a one-shot iterable
        (e.g. a generator) works for every document rather than only the first.

    Returns
    -------
    list of list of str
        The same ``text`` object that was passed in, after modification.
    """
    if isinstance(lexicon, (set, frozenset)):
        known_bigrams = lexicon
    else:
        known_bigrams = frozenset(lexicon)

    for doc in text:
        # Pair each token with its successor (equivalent to nltk.bigrams on a list).
        candidates = (i + " " + j for i, j in zip(doc, doc[1:]))
        # Build the full list of matches before extending, so the appended
        # bigram strings never take part in the pairing themselves.
        # TODO(original author's open question): should pairs with i == j be skipped?
        matches = [pair for pair in candidates if pair in known_bigrams]
        doc.extend(matches)
    return text

In [None]:
def remove_letters(text):
    """Drop tokens that are at most one character long.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The tokens whose length is greater than 1, in their original order.
    """
    return list(filter(lambda token: len(token) > 1, text))

In [None]:
def preprocess(df, to_ignore=None, save_to_file=None, bigram_min_freq=10, take_top_bigrams=100):
    """Clean and tokenize the ``Abstract`` column and append frequent bigrams.

    Pipeline, in order: drop rows whose abstract is missing or empty, strip
    copyright boilerplate (``remove_phrases``), tokenize, lemmatize, remove the
    extra stop words in ``to_ignore``, drop one-character tokens, build a
    bigram lexicon from the whole corpus (PMI-ranked), and append to each
    document those of its bigrams that are in the lexicon (``bgrams``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with an ``Abstract`` column. It is not modified; all work
        happens on a filtered copy.
    to_ignore : container of str, optional
        Additional stop words removed after lemmatization. When ``None``, the
        global ``ignore_words`` is used.
    save_to_file : str or path-like, optional
        When truthy, the processed frame is written to this path as CSV
        without the index.
    bigram_min_freq : int, default 10
        Value passed to the collocation finder's frequency filter.
    take_top_bigrams : int, default 100
        How many of the best PMI-scored bigrams make up the lexicon.

    Returns
    -------
    tuple
        ``(tokens, new_df, bigram_lexicon)``: the ``tokens`` column of the
        processed frame, the processed frame itself (with cleaned ``Abstract``
        and the new ``tokens`` column), and the lexicon as a list of
        ``"first second"`` strings.
    """
    if to_ignore is None:
        # NOTE(review): `ignore_words` is not defined in the visible part of this
        # notebook; it must exist as a global before calling with to_ignore=None.
        to_ignore = ignore_words

    # Keep only rows that actually carry an abstract. Missing values (NaN/None)
    # are excluded as well: they compare unequal to '' and would otherwise
    # crash in remove_phrases. The explicit copy makes the column assignments
    # below operate on an independent frame instead of a filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    abstracts = df['Abstract']
    has_text = abstracts.notna() & (abstracts != '')
    new_df = df.loc[has_text].copy().reset_index(drop=True)

    # remove copyright phrases
    new_df['Abstract'] = new_df['Abstract'].apply(remove_phrases)

    # tokenize
    tokens = [tokenize(t) for t in new_df['Abstract']]

    # lemmatize
    tokens = [lemmatize(t) for t in tokens]

    # remove additional stop words
    tokens = [remove_stopwords(t, to_ignore) for t in tokens]

    # remove words with only one letter
    tokens = [remove_letters(t) for t in tokens]

    # find bigrams: rank the corpus' bigrams by PMI after the frequency filter
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(tokens)
    finder.apply_freq_filter(bigram_min_freq)
    bigram_lexicon = [i + " " + j for i, j in finder.nbest(bigram_measures.pmi, take_top_bigrams)]

    # append the lexicon bigrams found in each document to that document
    tokens = bgrams(tokens, bigram_lexicon)
    new_df['tokens'] = tokens

    if save_to_file:
        new_df.to_csv(save_to_file, index=False)

    return new_df['tokens'], new_df, bigram_lexicon

### Word Frequencies

Find the most frequent words in the corpus.

In [None]:
from nltk.probability import FreqDist

def word_frequencies(tokens):
    """Count how often each token occurs across all documents.

    Parameters
    ----------
    tokens : iterable of iterable of str
        The tokenized documents.

    Returns
    -------
    nltk.probability.FreqDist
        Frequency distribution built from the concatenation of all documents.
    """
    corpus_tokens = []
    for doc in tokens:
        corpus_tokens.extend(doc)
    return FreqDist(corpus_tokens)

### Context of words

Find common contexts in which the words from the given list frequently occur.

In [None]:
from nltk import Text

def word_contexts(tokens, words, num=20):
    """Look up contexts shared by ``words`` in the concatenated corpus.

    All documents are flattened into a single token sequence, wrapped in an
    ``nltk.Text``, and ``Text.common_contexts(words, num)`` is called on it.

    Parameters
    ----------
    tokens : iterable of iterable of str
        The tokenized documents.
    words : list of str
        The words whose common contexts are requested.
    num : int, default 20
        Forwarded to ``common_contexts`` as its second argument.

    Returns
    -------
    Whatever ``Text.common_contexts`` returns.
    NOTE(review): NLTK's ``common_contexts`` appears to print its findings and
    return ``None`` - confirm before relying on the return value.
    """
    corpus_tokens = []
    for doc in tokens:
        corpus_tokens.extend(doc)
    return Text(corpus_tokens).common_contexts(words, num)