In [1]:
from nltk.tokenize import word_tokenize
import re
import string
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#create a single module-level WordNetLemmatizer object
#clean() (defined further down) calls lemmatizer.lemmatize() on it when lemmatization is requested
lemmatizer = WordNetLemmatizer()

In [2]:
def read_files(path):
    """Read the file at ``path`` and return its content as a list of lines.

    The file is opened in binary mode and decoded as UTF-8 with the
    ``surrogateescape`` error handler, so bytes that are not valid UTF-8 do
    not raise; they are kept as lone surrogate code points. The returned
    strings do not include their line terminators.
    """
    with open(path, 'rb') as handle:
        raw_bytes = handle.read()
    decoded = raw_bytes.decode('utf8', 'surrogateescape')
    return decoded.splitlines()

def reviews_doctors_splitter(path='ratemd.25k.all.txt'):
    """Split the RateMD corpus into tokenized reviews and doctors' surnames.

    Each row of the file is split on tabs and every field is stripped:
      * rows with 4 fields are treated as doctor metadata; only the last
        whitespace-separated word of the first field (the surname),
        lowercased, is kept;
      * rows with 2 fields are treated as reviews; the second field is
        lowercased, tokenized with ``word_tokenize`` and every '.' is
        removed from each token;
      * rows with any other number of fields are ignored.

    Args:
        path: location of the corpus file. Defaults to the file name the
            notebook has always used, so existing zero-argument calls are
            unaffected.

    Returns:
        A ``(raw_reviews, doctors)`` tuple. ``raw_reviews`` is a list of
        token lists containing only reviews with more than 3 tokens.
        ``doctors`` is a list of unique surnames (order is not defined,
        because duplicates are removed through a set).
    """
    raw_reviews = []
    #collect surnames in a set so duplicates never accumulate
    doctors = set()

    for line in read_files(path):
        #remove extra spaces around every tab-separated field
        fields = [field.strip() for field in line.split('\t')]
        if len(fields) == 4:
            name_parts = fields[0].split()
            #a blank name field has no surname: skip the row instead of
            #raising IndexError on an empty list
            if name_parts:
                doctors.add(name_parts[-1].lower())
        elif len(fields) == 2:
            #convert reviews to lowercase, tokenize and drop periods
            review = [word.replace('.', '')
                      for word in word_tokenize(fields[1].lower())]
            #remove reviews with 3 words or less
            if len(review) > 3:
                raw_reviews.append(review)

    return raw_reviews, list(doctors)

def get_unfrequent_words(raw_reviews):
    """Return the tokens that occur fewer than 10 times across all reviews.

    ``raw_reviews`` is an iterable of token lists. Occurrences are counted
    over the whole corpus (not per review). The result lists the rare tokens
    in order of first appearance; these are meant to be dropped from the
    dictionary later on.
    """
    #tally how many times each token shows up in the corpus
    counts = {}
    for tokens in raw_reviews:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    #keep only the tokens seen less than 10 times
    rare_tokens = []
    for token, n_seen in counts.items():
        if n_seen < 10:
            rare_tokens.append(token)
    return rare_tokens

def get_stop_words():
    """Build the set of stop words used to filter the reviews.

    Starts from NLTK's English stop-word list and extends it with a handful
    of ad-hoc tokens (titles such as 'dr'/'doctor', contraction fragments
    and stray single characters).
    """
    #ad-hoc additions on top of the basic English stop words
    extra_words = (
        'dr', 'doctor', "n't", "'ve", "'s", "l", '’', 'u', 's', 'ca', 'mo',
    )
    base_words = set(stopwords.words('english'))
    return base_words.union(extra_words)

In [3]:
#load the tokenized reviews and the doctors' surnames
raw_reviews, doctors = reviews_doctors_splitter()
#clean() tests every token of every review against these collections;
#storing them as sets makes each lookup constant-time instead of a scan
#over the whole list
doctors = set(doctors)
unfreq_words = set(get_unfrequent_words(raw_reviews))
stop_words = get_stop_words()

In [None]:
def clean(single_review, stop_words, doctors, unfreq_words, lemmatize):
    """Clean/preprocess one tokenized review.

    Args:
        single_review: list of tokens for a single review (not the whole
            training corpus).
        stop_words: collection of stop words to remove.
        doctors: collection of doctors' names to remove.
        unfreq_words: collection of infrequent words to remove.
        lemmatize: truthy to lemmatize the surviving tokens (first as verbs,
            then as nouns) with the module-level ``lemmatizer``.

    Returns:
        A new list of tokens. Tokens found in any of the three exclusion
        collections and tokens made only of digits are dropped; punctuation
        is then stripped from the survivors and tokens that end up empty are
        discarded.
    """
    #a membership test on a list scans the whole list, and it is repeated
    #for every token; convert list/tuple arguments to sets once per call
    if isinstance(stop_words, (list, tuple)):
        stop_words = set(stop_words)
    if isinstance(doctors, (list, tuple)):
        doctors = set(doctors)
    if isinstance(unfreq_words, (list, tuple)):
        unfreq_words = set(unfreq_words)

    #first, filter out stop words, doctors' names, infrequent words and digits/numbers
    #for tokens that pass that first selection, remove punctuation
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = [punctuation.sub('', word) for word in single_review
                if word not in doctors
                and word not in unfreq_words
                and not word.isdigit()
                and word not in stop_words]

    #remove empty strings (tokens that were punctuation only)
    clean_review = [word for word in stripped if word]

    if not lemmatize:
        return clean_review

    #apply lemmatizer, when requested by the user: verbs first, then nouns
    lemma_verb = [lemmatizer.lemmatize(word, 'v') for word in clean_review]
    return [lemmatizer.lemmatize(word, 'n') for word in lemma_verb]

## No lemmatization

In [None]:
#perform cleaning/preprocessing on each review
corpus_clean = [clean(review, stop_words, doctors, unfreq_words, lemmatize = False) for review in raw_reviews]

#find a unique id for each unique term {term : id}
dictionary = corpora.Dictionary(corpus_clean)
dict_size = len(dictionary.token2id)
print('The dictionary contains {} terms'.format(dict_size))

#convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc_clean) for doc_clean in corpus_clean]

#run the LDA model
#LDA training is stochastic: fix the seed so that re-running the notebook
#reproduces the same topics (42 is an arbitrary choice)
ldamodel = LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20, iterations=2000,
                    random_state=42)

#output top 10 words in each topic
#loop over the number of topics the model was actually trained with
result = []
for topic_id in range(ldamodel.num_topics):
    top_words = ldamodel.get_topic_terms(topic_id, 10)
    #get_topic_terms yields (term id, weight) pairs; map ids back to words
    result.append([dictionary[term_id] for term_id, weight in top_words])

result

#### Extra credit question

In [None]:
#run the LDA model
#LDA training is stochastic: fix the seed so that re-running the notebook
#reproduces the same topics (42 is an arbitrary choice)
ldamodel = LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20, iterations=2000,
                    random_state=42)

#output top 10 words in each topic
#loop over the number of topics the model was actually trained with
result = []
for topic_id in range(ldamodel.num_topics):
    top_words = ldamodel.get_topic_terms(topic_id, 10)
    #get_topic_terms yields (term id, weight) pairs; map ids back to words
    result.append([dictionary[term_id] for term_id, weight in top_words])

result

## Lemmatization

In [None]:
#perform cleaning/preprocessing on each review
#this time, set lemmatize parameter to True
corpus_clean = [clean(review, stop_words, doctors, unfreq_words, lemmatize = True) for review in raw_reviews]

#find a unique id for each unique term {term : id}
dictionary = corpora.Dictionary(corpus_clean)
dict_size = len(dictionary.token2id)
print('The dictionary contains {} terms'.format(dict_size))

#convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc_clean) for doc_clean in corpus_clean]

#run the LDA model
#LDA training is stochastic: fix the seed so that re-running the notebook
#reproduces the same topics (42 is an arbitrary choice)
ldamodel = LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20, iterations=2000,
                    random_state=42)

#output top 10 words in each topic
#loop over the number of topics the model was actually trained with
result = []
for topic_id in range(ldamodel.num_topics):
    top_words = ldamodel.get_topic_terms(topic_id, 10)
    #get_topic_terms yields (term id, weight) pairs; map ids back to words
    result.append([dictionary[term_id] for term_id, weight in top_words])

result

#### Extra credit question

In [None]:
#run the LDA model
#LDA training is stochastic: fix the seed so that re-running the notebook
#reproduces the same topics (42 is an arbitrary choice)
ldamodel = LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20, iterations=2000,
                    random_state=42)

#output top 10 words in each topic
#loop over the number of topics the model was actually trained with
result = []
for topic_id in range(ldamodel.num_topics):
    top_words = ldamodel.get_topic_terms(topic_id, 10)
    #get_topic_terms yields (term id, weight) pairs; map ids back to words
    result.append([dictionary[term_id] for term_id, weight in top_words])

result