# Corpus Analysis with Kullback–Leibler divergence

Author: Lucas van der Deijl, University of Amsterdam <br/>
Version: 9 December 2020 <br/>
Contact: l.a.vanderdeijl@uva.nl, www.lucasvanderdeijl.nl <br/>
Project: 'Radical Rumours' (Funded by NWO 2017-2021) <br/>

## Aim of this program

## Pipeline

### Import the required libraries

First, the required libraries and resources need to be imported.

In [None]:
import os

import nltk
from nltk.tokenize import word_tokenize

from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer # We're going to use the Python package scikit-learn to transform texts into vectors of TF-IDF values
from sklearn.metrics.pairwise import cosine_similarity # scikut-learn also offers various ready-to-use methods for calculating metrics like cosine similarity
from scipy.stats import entropy
from scipy import spatial
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

### Install missing libraries

In case you got an error after the previous step because not all of the required modules are installed, you can uncomment (remove the '#') the relevant install-command below and run the code. Once the module is installed, run the block above again to import it before moving on to the next step.

In [None]:
# !pip install nltk
# !pip install scipy

### Define functions for preprocessing and parsing

In [None]:
def preprocess(doc, stopwords_path="Resources/stopwoorden.txt"):
    """Lowercase a document and drop its stopwords and punctuation tokens.

    Parameters:
        doc: the raw text of one document (a string).
        stopwords_path: path to a UTF-8 text file with whitespace-separated
            stopwords. Defaults to the stopword list used by this notebook.

    Returns:
        A single string with the remaining lowercase tokens joined by spaces.
    """
    # The context manager closes the stopword file even if reading fails;
    # a set makes the per-token membership test constant-time.
    with open(stopwords_path, 'rt', encoding='utf-8') as stopword_file:
        stopwords = set(stopword_file.read().split())

    lowercase_tokens = [token.lower() for token in word_tokenize(doc)]

    # `token not in punctuation` is a substring test against the string
    # string.punctuation: it removes single punctuation characters (and any
    # token that happens to be a contiguous slice of that string).
    kept_tokens = [
        token
        for token in lowercase_tokens
        if token not in stopwords and token not in punctuation
    ]
    return " ".join(kept_tokens)

def parse_corpus(corpus_location):
    """Read and preprocess every text file in a corpus folder.

    Parameters:
        corpus_location: path to the folder holding the documents, with or
            without a trailing path separator.

    Returns:
        A tuple (titles, corpus) of two parallel lists: the title taken from
        each filename and the preprocessed text of that file.

    Raises:
        IndexError: if a filename contains fewer than two underscores, since
            the title is the third underscore-separated part of the name.
    """
    titles = []
    corpus = []
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of titles (and therefore of the plots built from them) reproducible.
    for filename in sorted(os.listdir(corpus_location)):
        filepath = os.path.join(corpus_location, filename)
        # Skip sub-directories (e.g. notebook checkpoint folders): they
        # cannot be opened as text files.
        if not os.path.isfile(filepath):
            continue
        title = filename.split("_")[2]
        # The context manager closes the file even if reading raises.
        with open(filepath, 'rt', encoding='utf-8') as file:
            raw_text = file.read()
        titles.append(title)
        corpus.append(preprocess(raw_text))
    return (titles, corpus)

In [None]:
# Define filepaths to corpora
# The two labels are used both as sub-folder names inside the corpus folder
# and as the author names shown in the plot titles and tick labels.
source_label = "Descartes"
target_label = "Spinoza"

# Root folder containing one sub-folder per label. Keep the trailing slash:
# the corpus paths are built from it by plain string concatenation.
path_to_corpusfolder = "Corpus/"
#os.listdir(path_to_corpusfolder) # uncomment and run to check if your path is  correct

### Load and preprocess your corpus

In [None]:
# Parse one folder per author; each call yields a (titles, texts) pair.
source_corpus = parse_corpus(f"{path_to_corpusfolder}{source_label}/")
target_corpus = parse_corpus(f"{path_to_corpusfolder}{target_label}/")

# Combine both corpora source-first, keeping titles and texts in the same order.
document_titles = [*source_corpus[0], *target_corpus[0]]
total_corpus = [*source_corpus[1], *target_corpus[1]]

### Create a term-document matrix with tfidf values

In [None]:
# Turn every document into a vector of TF-IDF weights (one row per document).
# min_df=1 keeps every term that occurs in at least one document, i.e. the
# whole vocabulary. The previous integer value 0 selected the same terms but
# is rejected by the parameter validation of recent scikit-learn releases.
vect = TfidfVectorizer(min_df=1) # set parameters for vectorization
term_doc_matrix = vect.fit_transform(total_corpus)
# Convert the sparse result to a dense array so rows can be sliced and compared.
term_doc_matrix_array = term_doc_matrix.toarray()

### Create a document-document matrix and compute KL divergence for both directions in each document pair

In [None]:
# Nested result tables: outer key = document read first, inner key = document
# read second, value = KL divergence rounded to two decimals.
source_to_target_dict = {}
target_to_source_dict = {}

# Rows of the TF-IDF matrix follow the order of document_titles:
# first all source documents, then all target documents.
n_source_docs = len(source_corpus[1])
source_vectors = term_doc_matrix_array[:n_source_docs]
target_vectors = term_doc_matrix_array[n_source_docs:]
source_doc_titles = document_titles[:n_source_docs]
target_doc_titles = document_titles[n_source_docs:]

for source_title, tfidf_array_source in zip(source_doc_titles, source_vectors):
    source_to_target_dict[source_title] = {}

    for target_title, tfidf_array_target in zip(target_doc_titles, target_vectors):
        # Compare only the terms that occur in BOTH documents, so neither
        # distribution contains a zero probability.
        shared_terms = (tfidf_array_source != 0) & (tfidf_array_target != 0)
        no_zeroes_source = tfidf_array_source[shared_terms]
        no_zeroes_target = tfidf_array_target[shared_terms]

        # scipy's entropy(pk, qk) normalises both vectors to sum to 1 and
        # returns the relative entropy (KL divergence) of pk with respect to qk.
        source_to_target_dict[source_title][target_title] = round(
            entropy(no_zeroes_target, qk=no_zeroes_source), 2)
        target_to_source_dict.setdefault(target_title, {})[source_title] = round(
            entropy(no_zeroes_source, qk=no_zeroes_target), 2)

# Matrix form (one row per outer key) and a flat list of all non-zero values.
source_to_target_list_of_lists = [list(source_to_target_dict[key].values()) for key in source_to_target_dict]
source_to_target_flattened = [value for sublist in source_to_target_list_of_lists for value in sublist if value != 0.0]

target_to_source_list_of_lists = [list(target_to_source_dict[key].values()) for key in target_to_source_dict]
target_to_source_flattened = [value for sublist in target_to_source_list_of_lists for value in sublist if value != 0.0]

### Visualise the results as boxplots

In [None]:
%matplotlib inline
plt.rcParams['figure.dpi'] = 300
fig1, ax1 = plt.subplots()
plot_title = "Surprise of reading " + target_label + " after " + source_label + " and vice versa"
ax1.set_title(plot_title)
ax1.boxplot([source_to_target_flattened, target_to_source_flattened])
ax1.set_xticklabels([target_label + " after " + source_label, source_label + " after " + target_label])
ax1.set_ylim([0.0,2.5])
#plt.savefig("Output/BOXPLOT.png", bbox_inches='tight', dpi=300) # Uncomment to save the file

### Visualise the results as a heatmap

In [None]:
%matplotlib inline
plt.rcParams['figure.dpi'] = 300
plt.rcParams["font.family"] = "Garamond"

source_titles = source_corpus[0]
target_titles = target_corpus[0] 

plot_title = "Surprise of reading texts by " + target_label + " (x) after texts by " + source_label + " (y)"

data = source_to_target_list_of_lists

mask = np.zeros_like(data)
mask[np.triu_indices_from(mask)] = False # Switch to 'True' if source corpus = target corpus, to exclude redundant data

ax = sns.heatmap(data, 
                 mask=mask, 
                 cmap="YlGnBu", 
                 annot=True, 
                 xticklabels=target_titles, 
                 yticklabels=source_titles, 
                 cbar=True, 
                 vmin=0.5, 
                 vmax=2).set_title(plot_title)


#plt.savefig("Output/HEATMAP.png", bbox_inches='tight', dpi=300) # Uncomment to save the file