# Word2Vec
## Imports

In [37]:
import numpy as np
import pandas as pd
import pickle
import random
import spacy

from gensim.models import Word2Vec
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

## Constant paths

In [44]:
# Directory of preprocessed COCA pickle files (this notebook reads `2015.pickle` from it)
COCA_PREPROC_DIR = Path("./coca-preproc-spacy/")

# Directory of preprocessed Elsevier pickle files, one `<subject>.pickle` per subject area
ELSEVIER_PREPROC_DIR = Path("./elsevier-preproc-spacy/")

# Text file listing the subject areas, one per line
SUBJAREAS = Path("./subjareas.txt")

## Selecting Comparison Words
First, we select 1000 tokens that are present in all corpora. These will be used to measure the distance between the general language corpus and the academic corpora.

In [3]:
# Load the small English spaCy pipeline; it is used here for its default stop word set.
nlp = spacy.load("en_core_web_sm")
# Stop words are filtered out (after lowercasing) wherever tokens are collected below.
stopwords = nlp.Defaults.stop_words

In [50]:
def _content_tokens(texts, docs_key):
    """
    Collect the distinct lowercased, non-stopword token strings in a corpus.

    Args:
    texts: Iterable of preprocessed texts; each is indexed with docs_key to obtain
    an iterable of sentences, each of which is an iterable of tokens with a `.text`.
    docs_key: Key under which a text stores its sentences.

    Returns:
    A set of lowercased token strings, excluding stop words and empty strings.
    """
    found = set()

    for text in texts:
        for sentence in text[docs_key]:
            for token in sentence:
                lowered = token.text.lower()
                if lowered and lowered not in stopwords:
                    found.add(lowered)

    return found


# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(COCA_PREPROC_DIR / '2015.pickle', 'rb') as f:
    coca = pickle.load(f)

# Start from the COCA vocabulary and narrow it down with every academic corpus.
unique_tokens = _content_tokens(coca, 'text_docs')

# Skip blank lines so a trailing newline does not turn into a lookup of ".pickle".
with open(SUBJAREAS, 'r') as subj_list_file:
    subjects = [line.strip() for line in subj_list_file if line.strip()]

for subject in subjects:
    with open(ELSEVIER_PREPROC_DIR / f'{subject}.pickle', 'rb') as articles_file:
        articles = pickle.load(articles_file)

    # Keep only tokens that also occur in this subject area.
    unique_tokens &= _content_tokens(articles, 'body_text_docs')

if len(unique_tokens) < 1000:
    raise ValueError(
        f'Only {len(unique_tokens)} tokens are shared by all corpora; need at least 1000.'
    )

# random.sample no longer accepts a set (TypeError since Python 3.11), and set order is
# not stable between runs. Sorting first and using a dedicated seeded generator makes the
# selection reproducible without touching the global random state.
tokens_1000 = random.Random(42).sample(sorted(unique_tokens), k=1000)

In [30]:
# Preview the comparison tokens drawn in the previous cell. Re-sampling here would
# silently replace that sample (and sampling from a set fails on Python 3.11+).
tokens_1000[:15]

['demographic',
 'database',
 'discernible',
 'sufficiently',
 'nonetheless',
 'assuming',
 'irrelevant',
 'let',
 'ongoing',
 'abstraction',
 'landmarks',
 'augmented',
 'shear',
 'helped',
 'trends']

## COCA Embedding Space
We first create an embedding space based on COCA and save the word vectors for the 1000 randomly selected tokens in a list of (token, vector) pairs.

In [33]:
# Training corpus: one list of lowercased, non-stopword tokens per COCA sentence.
coca_sentences = []
for text in coca:
    for sentence in text['text_docs']:
        lowered_words = (token.text.lower() for token in sentence)
        coca_sentences.append([word for word in lowered_words if word not in stopwords])

# Train the general-language embedding space.
coca_model = Word2Vec(coca_sentences, window=5, min_count=1, workers=4)

# Pair every comparison token with its vector in the COCA space.
coca_token_vectors = []
for token in tokens_1000:
    coca_token_vectors.append((token, coca_model.wv[token]))


In [35]:
# Show the first five (token, vector) pairs as a sanity check.
coca_token_vectors[:5]

[('demographic',
  array([-0.44434673,  0.16190062, -0.24173903, -0.36417148, -0.20352486,
         -0.37053123,  0.4803917 ,  0.26165414, -0.3368373 ,  0.23189664,
         -0.02670773, -0.24587852, -0.09785329, -0.1316972 , -0.15303685,
          0.26428905, -0.24956234,  0.22814009, -0.12520492,  0.03824827,
          0.59004533, -0.02376954,  0.61472243,  0.87467647, -0.39327505,
          0.30194232,  0.3666166 , -0.10386968,  0.4176317 ,  0.16066249,
         -0.19098926, -0.21286441,  0.31128606, -0.16328265,  0.6271264 ,
         -0.33382308,  0.22877516, -0.49367693, -0.58774924, -0.11712056,
         -0.08501099,  0.53369397, -0.1648046 ,  0.19690768,  0.37831956,
          0.36941496, -0.8091688 ,  0.75390965,  0.45207468, -0.01725184,
         -0.6082515 ,  0.23144749, -0.46796167, -0.5329031 ,  0.34345186,
          0.21273479,  0.6173254 , -0.36346298, -0.18090485,  0.4016873 ,
         -0.2611292 , -0.15879384,  0.11894826,  0.6381067 , -0.22387555,
         -0.6162295 ,

## Elsevier OA CC-BY Embedding Space
Next, we create an embedding space for each academic corpus and compare the word vectors in the academic corpora with the word vectors in the COCA embedding space. This procedure is bootstrapped with 100 iterations.

In [None]:
def subj_vector_sim(articles, vector_token_list):
    """
    Train a word2vec embedding space on articles and return the average cosine
    similarity between each reference vector in vector_token_list and the vector of
    the same token in the new embedding space.

    Args:
    articles: Preprocessed articles from a specific discipline in the
    Elsevier OA CC-BY corpus.
    vector_token_list: A list of tuples. The first item in each tuple is a token, and
    the second item is the word vector for that token in the COCA embedding space.

    Returns:
    The mean cosine similarity over all tokens that occur in articles, or NaN when
    none of the tokens occur.
    """
    # NOTE(review): this model is trained independently of the one that produced the
    # reference vectors, so the two spaces are not aligned; confirm that raw cross-space
    # cosine similarity is the intended measure.

    subj_sentences = [
        [token.text.lower() for token in sentence if token.text.lower() not in stopwords]
        for text in articles for sentence in text['body_text_docs']
        ]

    subj_model = Word2Vec(subj_sentences, window=5, min_count=1, workers=4)
    # Vectors live on the KeyedVectors object; indexing the model itself is not
    # supported in gensim 4.
    subj_vectors = subj_model.wv

    cosine_similarities = []

    for token, reference_vector in vector_token_list:
        # A bootstrap resample can omit every article containing a token, so skip
        # tokens missing from this model's vocabulary instead of raising KeyError.
        if token not in subj_vectors:
            continue

        # sklearn's cosine_similarity expects 2-D (n_samples, n_features) inputs.
        subj_vector_2d = np.asarray(subj_vectors[token]).reshape(1, -1)
        reference_vector_2d = np.asarray(reference_vector).reshape(1, -1)

        cosine_similarities.append(
            cosine_similarity(subj_vector_2d, reference_vector_2d)[0, 0]
            )

    if not cosine_similarities:
        return np.nan

    avg_cosine_similarity = np.mean(cosine_similarities)

    return avg_cosine_similarity

def bootstrap_word_embed_ci(
        subj_articles,
        word_vector_token_list, 
        num_resamples=100, 
        alpha=0.05, 
        ):
    """
    Bootstrap the vector cosine similarity procedure to generate an average cosine
    similarity with a percentile confidence interval.

    Articles are resampled with replacement using NumPy's global random state.

    Args:
    subj_articles: Preprocessed articles from a specific discipline in the
    Elsevier OA CC-BY corpus.
    word_vector_token_list: A list of tuples. The first item in each tuple is a token,
    and the second item is the word vector for that token in the COCA embedding space.
    num_resamples: The number of iterations for bootstrapping. Default = 100.
    alpha: The alpha value for which to calculate the confidence interval. Default = 0.05.

    Returns:
    A tuple (average similarity, lower CI bound, upper CI bound).

    Raises:
    ValueError: If subj_articles is empty or num_resamples is smaller than 1.
    """
    n = len(subj_articles)

    if n == 0:
        raise ValueError("subj_articles must contain at least one article")
    if num_resamples < 1:
        raise ValueError("num_resamples must be at least 1")

    average_cosine_sim = []

    for _ in range(num_resamples):
        # Sample indices rather than the articles themselves, so NumPy never has to
        # convert the article records into an array.
        resampled_indices = np.random.randint(0, n, size=n)
        resampled_texts = [subj_articles[index] for index in resampled_indices]

        sample_vector_sim = subj_vector_sim(resampled_texts, word_vector_token_list)

        average_cosine_sim.append(sample_vector_sim)

    average_sim = np.mean(average_cosine_sim)
    ci_lower = np.percentile(average_cosine_sim, alpha/2 * 100)
    ci_upper = np.percentile(average_cosine_sim, (1 - alpha/2) * 100)

    return average_sim, ci_lower, ci_upper



In [None]:
# Empty results table: subject area, mean cosine similarity, and its confidence bounds
result_columns = ['subj', 'avg', 'lower CI', 'upper CI']
avg_cosine_sim_df = pd.DataFrame(columns=result_columns)

In [48]:
# Skip blank lines so a trailing newline does not turn into a lookup of ".pickle".
with open(SUBJAREAS, 'r') as subj_list_file:
    subjects = [line.strip() for line in subj_list_file if line.strip()]

# Collect one record per subject and build the table once at the end.
# DataFrame.append returned a new frame (so discarding its result left the table
# empty) and was removed in pandas 2.0.
result_rows = []

for subject in subjects:
    print(subject)

    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(ELSEVIER_PREPROC_DIR / f'{subject}.pickle', 'rb') as articles_file:
        articles = pickle.load(articles_file)

    # The file is already closed here; bootstrapping only needs the loaded articles.
    avg_sim, ci_lower, ci_upper = bootstrap_word_embed_ci(articles, coca_token_vectors)

    result_rows.append({
        'subj': subject, 'avg': avg_sim,
        'lower CI': ci_lower, 'upper CI': ci_upper
        })

avg_cosine_sim_df = pd.DataFrame(
    result_rows, columns=['subj', 'avg', 'lower CI', 'upper CI']
    )

In [49]:
# Rank subject areas from least to most similar to COCA, with a fresh 0..n-1 index.
avg_cosine_sim_df.sort_values("avg").reset_index(drop=True)

Unnamed: 0,subj,avg,lower CI,upper CI
0,CHEM,0.975,0.973609,0.976081
1,CENG,0.975214,0.974769,0.976625
2,DENT,0.975412,0.975137,0.976084
3,PHYS,0.976533,0.976112,0.97741
4,PHAR,0.977424,0.976196,0.977934
5,AGRI,0.977696,0.977221,0.978083
6,IMMU,0.978124,0.977284,0.979814
7,ENVI,0.979977,0.97975,0.981548
8,BIOC,0.980124,0.978264,0.9811
9,NEUR,0.982154,0.980366,0.98344
