# LSA and Textual Coherence
## Imports

In [2]:
import numpy as np
import pickle
import pandas as pd
import spacy

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

## Constant Paths

In [12]:
# Path to directory containing preprocessed COCA files
COCA_PREPROC_DIR = Path("/Volumes/Elements/Capstone/coca-preproc-spacy/")

# Path to directory containing preprocessed Elsevier files
ELSEVIER_PREPROC_DIR = Path("/Volumes/Elements/Capstone/elsevier-preproc-spacy/")

# Path to file containing all subject areas
SUBJAREAS = Path("./subjareas.txt")

## LSA and Cosine Similarity
We first create an LSA matrix for each corpus with sentences as the unit of analysis in the term-document matrix. Then, we calculate the average cosine similarity between adjoining sentences

In [6]:
# Download stopwords to filter out
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

In [20]:
def lsa_vector_space(articles, text_key, components = 100):
    """
    Given a list of articles, return an LSA vector space trained on all the articles.

    Args:
    articles: A set of preprocessed articles.
    text_key: A string describing the key for where body text is stored for each 
    article object. 
    components: An integer describing how many components the final vector space should 
    have.
    """

    text_data = [
        " ".join([token.text.lower() for token in sentence if token.text.lower() not in stopwords])
        for text in articles for sentence in text[text_key]
    ]

    
    filtered_data = [text for text in text_data if text]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(filtered_data)

    if X.shape[1] <= components:
        return np.nan

    lsa = TruncatedSVD(n_components=components)
    lsa_matrix = lsa.fit_transform(X)

    return lsa_matrix

def doc_sentence_cosine_sim(articles, text_key, components = 100):
    """
    Given a set of prerocessed articles, train an LSA matrix on the sentences of the articles and
    return the average distance between adjoining sentences in each document. 

    Args:
    articles: A set of preprocessed articles.
    text_key: A string describing the key for where body text is stored for each 
    article object. 
    components: An integer describing how many components the final vector space should 
    have.
    """

    vector_space = lsa_vector_space(articles, text_key, components)
    cosine_similarities = []

    for index, article in enumerate(articles):
        start_index = sum(len(sentences) for sentences in articles[:index])
        end_index = start_index + len(article)

        article_cosine_similarity = [
            cosine_similarity(vector_space[i].reshape(1, -1), vector_space[i+1].reshape(1, -1)) 
            for i in range(start_index, end_index-1)
            ]
        
        cosine_similarities.append(np.mean(article_cosine_similarity))

    avg_document_sim = np.mean(cosine_similarities)
    
    return avg_document_sim

def bootstrap_lsa_cosine_sim(
        subj_articles,
        components = 100,
        num_resamples=300, 
        alpha=0.05, 
        ):
    """
    Bootstrap the vector cosine similarity procedure to generate an average cosine similarity
    with confidence intervals.

    Args:
    subj_articles: Preprocessed articles from a specific discipline in the 
    ElSevier OA CC-BY corpus.
    components: The number of components in the LSA matrix. Default = 100.. 
    item in contains the word vector for that token in the COCA embedding space.
    num_resamples: The number of iterations for bootstrapping. Default = 300.
    alpha: The alpha value for which to calculate the confidence intervals. Default = 0.05.
    """
    average_doc_sentence_cosine_sim = []
    n = len(subj_articles)

    for i in range(num_resamples):
        resampled_texts = np.random.choice(subj_articles, size=n, replace=True)

        sample_doc_sentence_sim = doc_sentence_cosine_sim(resampled_texts, "body_text_docs", components)

        average_doc_sentence_cosine_sim.append(sample_doc_sentence_sim)

    average_sim = np.mean(average_doc_sentence_cosine_sim)
    ci_lower = np.percentile(average_doc_sentence_cosine_sim, alpha/2 * 100)
    ci_upper = np.percentile(average_doc_sentence_cosine_sim, (1 - alpha/2) * 100)

    return average_sim, ci_lower, ci_upper

In [1]:
# Calculate average similarity
with open(f'{COCA_PREPROC_DIR}/2015.pickle', 'rb') as f:
    coca = pickle.load(f)

coca_sentence_sim = doc_sentence_cosine_sim(coca, "text_docs")

In [8]:
# Average cosine similarity between adjoining sentences in COCA texts
coca_sentence_sim

0.846109002

In [24]:
#Define dataframe to store results
avg_sentence_cosine_sim_df = pd.DataFrame(columns=['subj', 'avg', 'lower CI', 'upper CI'])

In [27]:
with open(f'{SUBJAREAS}', 'r') as subj_list_file:
    for subject in subj_list_file:
        subject = subject.strip()
        print(subject)

        with open(f'{ELSEVIER_PREPROC_DIR}/{subject}.pickle', 'rb') as articles_file:
            articles = pickle.load(articles_file)

        bootstrap_avg_sentence_cosine_sim = bootstrap_lsa_cosine_sim(articles)
        subject_row = {
            'subj': subject, 
            'avg': bootstrap_avg_sentence_cosine_sim[0], 
            'lower CI': bootstrap_avg_sentence_cosine_sim[1], 
            'upper CI': bootstrap_avg_sentence_cosine_sim[2]
        }

        avg_sentence_cosine_sim_df = avg_sentence_cosine_sim_df.append(subject_row, ignore_index=True)


In [29]:
# Metrics by subject
avg_sentence_cosine_sim_df.sort_values(by= "avg", ignore_index=True)

Unnamed: 0,subj,avg,lower CI,upper CI
0,EART,0.81945,0.813939,0.822223
1,AGRI,0.832435,0.825399,0.836324
2,NEUR,0.835997,0.834347,0.840682
3,PHAR,0.836817,0.836607,0.841495
4,ARTS,0.838128,0.832263,0.845061
5,ENER,0.838245,0.834876,0.839705
6,PSYC,0.838533,0.837942,0.840018
7,MATE,0.839185,0.829291,0.845111
8,DENT,0.839496,0.832304,0.844269
9,ENGI,0.839678,0.834717,0.843212


In [7]:
# Metrics with distance to COCA average included
avg_sentence_cosine_sim_df["COCA distance"] = avg_sentence_cosine_sim_df["avg"].sub(coca_sentence_sim).abs()
avg_sentence_cosine_sim_df.sort_values(by="COCA distance", ignore_index=True)

Unnamed: 0,subj,avg,lower CI,upper CI,COCA distance
0,CHEM,0.846389,0.836704,0.854775,0.00028
1,ECON,0.846982,0.840179,0.85465,0.000873
2,COMP,0.848567,0.840087,0.849304,0.002458
3,CENG,0.843481,0.839134,0.843821,0.002628
4,DECI,0.843332,0.839818,0.851891,0.002777
5,SOCI,0.843321,0.84061,0.850557,0.002788
6,IMMU,0.850003,0.841503,0.858172,0.003894
7,HEAL,0.850061,0.849491,0.85556,0.003952
8,ENVI,0.841947,0.833349,0.845212,0.004163
9,VETE,0.851302,0.84829,0.85511,0.005193
