# Word Sense Disambiguation
## Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
import spacy

from pathlib import Path

## Constant Paths

In [2]:
# Path to directory containing preprocessed COCA files
# (read below as pickles, e.g. "2015.pickle")
COCA_PREPROC_DIR = Path("./coca-preproc-spacy/")

# Path to directory containing preprocessed Elsevier files
# (read below as one "<subject>.pickle" file per subject area)
ELSEVIER_PREPROC_DIR = Path("./elsevier-preproc-spacy/")

# Path to file containing all subject areas, read below one subject per line
SUBJAREAS = Path("./subjareas.txt")

## Calculating average number of word senses per token

In [3]:
def word_sense_counter(article):
    """
    Collect the distinct senses of every repeated token in a preprocessed text.

    `article` is an iterable of sentences, each an iterable of tokens exposing
    `.text` and `._.synset`. Tokens whose synset is falsy are ignored. Tokens are
    grouped by their exact surface form (`.text`).

    Returns a dictionary mapping each token text that occurs more than once (counting
    only occurrences that carry a synset) to the set of synsets it was used with.
    """
    senses_by_token = {}

    for sentence in article:
        for word in sentence:
            synset = word._.synset
            if not synset:
                # No sense annotation for this token: nothing to record
                continue
            senses_by_token.setdefault(word.text, []).append(synset)

    # Keep only tokens seen at least twice, collapsing repeated senses into a set
    repeated_token_senses = {}
    for token_text, senses in senses_by_token.items():
        if len(senses) > 1:
            repeated_token_senses[token_text] = set(senses)

    return repeated_token_senses

def avg_word_sense(article):
    """
    Given a preprocessed text, return the average number of word senses with which each token in the text is used.
    The average excludes all tokens used only once in the text.

    Returns nan when the text contains no token that is used more than once, because the
    average is undefined in that case. Callers that average over many texts should filter
    such texts out or use a nan-aware mean.
    """
    word_senses_dict = word_sense_counter(article)
    word_sense_count = [len(senses) for senses in word_senses_dict.values()]

    if not word_sense_count:
        # np.mean([]) would also give nan, but only after emitting a RuntimeWarning;
        # make the undefined case explicit instead.
        return np.nan

    return np.mean(word_sense_count)

def bootstrap_word_sense(texts, num_resamples=1000, alpha=0.05, docs_key='body_text_docs'):
    """
    Bootstrap the mean of the per-article average word-sense count.

    texts: sized collection of article records; each record is indexed with `docs_key`
        to obtain the preprocessed text passed to `avg_word_sense`.
    num_resamples: number of bootstrap resamples to draw.
    alpha: significance level; the returned interval is the percentile interval between
        the alpha/2 and 1 - alpha/2 quantiles of the resampled means.
    docs_key: key under which each record stores its preprocessed text.

    Returns a tuple (mean of the resampled means, lower CI bound, upper CI bound).

    Resampling uses the global `np.random` state, so seed it for reproducible results.
    If any article's average is nan, the nan propagates into the returned values.
    """
    # An article's score does not depend on which resample it lands in, so score each
    # article once up front instead of re-scoring it inside every resample.
    article_scores = np.array(
        [avg_word_sense(article[docs_key]) for article in texts], dtype=float
    )
    n = len(article_scores)

    resampled_means = []
    for _ in range(num_resamples):
        resampled_scores = np.random.choice(article_scores, size=n, replace=True)
        resampled_means.append(np.mean(resampled_scores))

    bootstrap_mean = np.mean(resampled_means)
    ci_lower = np.percentile(resampled_means, alpha / 2 * 100)
    ci_upper = np.percentile(resampled_means, (1 - alpha / 2) * 100)

    return bootstrap_mean, ci_lower, ci_upper


### Elsevier OA CC-BY
We first measure the average number of word senses per token for each academic discipline in the Elsevier OA CC-BY corpus. To compensate for the discrepancies in corpus sizes, we bootstrap the process with 1000 iterations. 

In [13]:
# Results table for the per-subject bootstrap estimates (initially empty)
result_columns = ['subj', 'avg', 'lower CI', 'upper CI']
avg_word_sense_df = pd.DataFrame(columns=result_columns)

In [16]:
# NOTE: pickle.load can execute arbitrary code from the file it reads; only load
# pickles produced by this project's own preprocessing step.
with open(SUBJAREAS, 'r') as subj_list_file:
    for subject in subj_list_file:
        subject = subject.strip()
        if not subject:
            # Skip blank lines (e.g. a trailing newline) rather than trying to open ".pickle"
            continue

        with open(ELSEVIER_PREPROC_DIR / f'{subject}.pickle', 'rb') as articles_file:
            articles = pickle.load(articles_file)

        # The file is no longer needed once the articles are in memory
        avg, ci_lower, ci_upper = bootstrap_word_sense(articles)
        subject_row = {
            'subj': subject, 'avg': avg,
            'lower CI': ci_lower, 'upper CI': ci_upper,
        }

        # DataFrame.append returned a new frame (and was removed in pandas 2.0), so calling
        # it without using the result discarded every row. Add the row in place instead;
        # the Series is aligned to the frame's columns by name.
        avg_word_sense_df.loc[len(avg_word_sense_df)] = pd.Series(subject_row)

In [17]:
# Metrics by subject, ordered from the lowest to the highest bootstrap average
avg_word_sense_df.sort_values(by='avg')

Unnamed: 0,subj,avg,lower CI,upper CI
0,NURS,1.259816,1.256892,1.268408
1,BIOC,1.263457,1.263158,1.26392
2,MEDI,1.283528,1.275815,1.290029
3,DENT,1.287852,1.28164,1.288779
4,CHEM,1.288312,1.287261,1.294461
5,CENG,1.289667,1.282979,1.295156
6,ENVI,1.293488,1.286699,1.300103
7,NEUR,1.294199,1.289339,1.299775
8,PHAR,1.3108,1.310079,1.319682
9,MATE,1.325432,1.317911,1.330041


### COCA
Next, we repeat the process for COCA. We keep only texts with more than 10 sentences, so that each text is long enough to use a word more than once. 

In [1]:

# Load the preprocessed COCA texts for 2015
coca_2015_path = COCA_PREPROC_DIR / '2015.pickle'
with open(coca_2015_path, 'rb') as text_file:
    texts = pickle.load(text_file)

# Score only texts whose 'text_docs' has more than 10 entries, then average the scores
long_text_scores = [
    avg_word_sense(text['text_docs'])
    for text in texts
    if len(text['text_docs']) > 10
]
coca_avg_synsets = np.mean(long_text_scores)


In [7]:
# Mean of the per-text average sense counts over the retained COCA texts
coca_avg_synsets

1.555653889

In [18]:
# Gap between the COCA mean and each subject's bootstrap mean. The subject value is
# subtracted from the COCA value, so a positive distance means the subject's texts
# average fewer senses per repeated token than COCA.
avg_word_sense_df['COCA distance'] = coca_avg_synsets - avg_word_sense_df['avg']

In [25]:
# Metrics with distance to COCA average included, largest distance first
avg_word_sense_df.sort_values(by='COCA distance', ascending=False)

Unnamed: 0,subj,avg,lower CI,upper CI,COCA distance
0,NURS,1.259816,1.256892,1.268408,0.295837
1,BIOC,1.263457,1.263158,1.26392,0.292197
2,MEDI,1.283528,1.275815,1.290029,0.272126
3,DENT,1.287852,1.28164,1.288779,0.267802
4,CHEM,1.288312,1.287261,1.294461,0.267342
5,CENG,1.289667,1.282979,1.295156,0.265987
6,ENVI,1.293488,1.286699,1.300103,0.262166
7,NEUR,1.294199,1.289339,1.299775,0.261455
8,PHAR,1.3108,1.310079,1.319682,0.244854
9,MATE,1.325432,1.317911,1.330041,0.230222
