In [None]:
!pip install arxiv==1.4.2
!pip install datetime==4.4
!pip install PyMuPDF==1.19.6
!pip install syllables==1.0.3
!pip install matplotlib==3.5.2
!pip install textstat==0.7.3
!pip install statsmodels==0.13.2
!pip install tensorflow-hub==0.12.0
!pip install scipy==1.8.1

In [None]:
%matplotlib notebook

import arxiv
import datetime
import fitz
import re
import syllables
import random
import unicodedata
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import textstat
from statsmodels.api import OLS
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.descriptivestats import sign_test
import tensorflow_hub as hub
from math import dist
import copy
from scipy.stats import shapiro
from scipy.stats import mannwhitneyu

In [None]:
# suitable.txt is expected one directory above the notebook's working directory:
# abspath() resolves the (possibly non-existent) file name against the cwd and the
# two dirname() calls strip the file name and then the cwd's last component.
parent_loc = os.path.dirname(os.path.dirname(os.path.abspath("DataCollection.py")))
# os.path.join instead of a hard-coded '\\' so the path also resolves on Linux/macOS.
with open(os.path.join(parent_loc, 'suitable.txt')) as file:
    # Skip blank lines (e.g. a trailing newline) so no empty "link" enters the ID slicing below.
    suitable = [line.strip() for line in file if line.strip()]

In [None]:
yetToBeDownloaded = suitable.copy()

In [None]:
len(yetToBeDownloaded)

In [None]:
for pre in suitable:
    yetToBeDownloaded.append(pre[:-1]+"1")

In [None]:
len(yetToBeDownloaded)

In [None]:
all_links = {}
all_keys = []
for i in range(len(suitable)):
    all_links[suitable[i][-12:-2]] = [suitable[0][:-1]+'1',suitable[i]]
    all_keys.append(suitable[i][-12:-2])

In [None]:
len(all_links)

In [None]:
if (os.path.isfile("timings.npy")):
    # Load
    all_timings = np.load('timings.npy',allow_pickle='TRUE').item()
else:
    all_timings = {}
    for k in range(1000):
        if (k%50==0):
            print(f"{k//10}%")
        paper = next(arxiv.Search(id_list=[yetToBeDownloaded[k][-12:]]).results())
        all_timings[yetToBeDownloaded[k][-12:-2]] = (paper.updated-paper.published).total_seconds()

    # Save
    np.save('timings.npy', all_timings)

In [None]:
all_timings

In [None]:
def DownloadPDF(index):
    """Fetch entry ``index`` of ``yetToBeDownloaded`` from arXiv and save its PDF.

    Indices below 1000 are saved as ``<key>.pdf``; all other indices are saved as
    ``<key>-Preprint.pdf`` where the key is ``all_keys[index % 1000]``.
    """
    arxiv_id = yetToBeDownloaded[index][-12:]
    paper = next(arxiv.Search(id_list=[arxiv_id]).results())
    # Pick the target file name first, then download once.
    if index >= 1000:
        target = all_keys[index%1000]+"-Preprint.pdf"
    else:
        target = all_keys[index]+".pdf"
    paper.download_pdf(filename=target)

def findAbstract(document):
    """Return the index of the first entry of ``document`` whose string form is "abstract".

    The comparison is case-insensitive and exact (no stripping). Returns -777 when
    no entry matches.
    """
    for i in range(len(document)):
        # Use the parameter; the original read an unrelated global named `doc`.
        if str(document[i]).lower() == "abstract":
            return i
    return -777

def reverseSearchVertical(document, j=None):
    """Search backwards from index ``j - 1`` for an entry with more than one visible character.

    Parameters:
        document: indexable sequence; each entry is converted with ``str``.
        j: exclusive upper index to start from. Defaults to ``len(document)``
           (the original read undefined globals ``j`` and ``doc``).

    Returns the index of the first entry (scanning towards 0) whose stripped string
    form is longer than one character, or None when there is none.
    """
    if j is None:
        j = len(document)
    for k in range(j-1, -1, -1):
        if len(str(document[k]).strip()) > 1:
            return k
    return None
        
def GetAbstract(bigText):
    """Return the index of the first line that reads "abstract", or -1.

    A line matches when, lower-cased and with all spaces removed, it equals
    'abstract' (so "A b s t r a c t" matches too).
    """
    # Split once; the original re-split the whole text on every iteration.
    for m, line in enumerate(bigText.splitlines()):
        if line.lower().replace(" ","") == 'abstract':
            return m
    return -1

def GetReferences(bigText):
    """Return the index of the last line that reads "references", or -1.

    A line matches when, lower-cased and with all spaces removed, it equals
    'references'. The text is scanned from the end.
    """
    # Split once; the original re-split the whole text on every iteration.
    lines = bigText.splitlines()
    for n in range(len(lines)-1, -1, -1):
        if lines[n].lower().replace(" ","") == 'references':
            return n
    return -1

def CropText(bigText):
    """Return the lines of ``bigText`` between the abstract and references headings.

    The slice starts on the line after the heading found by ``GetAbstract`` (or at
    the first line when there is none) and stops before the heading found by
    ``GetReferences`` (or runs to the end when there is none).
    """
    begin = GetAbstract(bigText)
    end = GetReferences(bigText)
    lines = bigText.splitlines()

    first = 0 if begin == -1 else begin + 1
    stop = None if end == -1 else end
    return lines[first:stop]

In [None]:
def _read_pdf_text(path):
    """Return the extracted text of every page of the PDF at ``path``, in page order."""
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)


def _merge_hyphenated_lines(lines):
    """Join lines ending in '-' with the line(s) that follow; drop lines containing "arxi"."""
    merged = []
    pending = None  # fragment collected so far while inside a hyphen-broken run
    for line in lines:
        if "arxi" in line.lower():
            # Lines mentioning "arxi" are skipped without ending a pending run.
            continue
        if line[-1:] == "-":
            pending = (pending or "") + line[:-1]
        elif pending is not None:
            merged.append(pending + line)
            pending = None
        else:
            merged.append(line)
    if pending is not None:
        # The text ended in the middle of a hyphenated run: keep the fragment
        # instead of silently dropping it.
        merged.append(pending)
    return merged


def _strip_links(line):
    """Remove whitespace-separated tokens containing 'www' or 'http' from ``line``."""
    if 'www' in line or 'http' in line:
        return ' '.join(word for word in line.split() if 'www' not in word and 'http' not in word)
    return line


def _clean_document(path):
    """Run the full cleaning pipeline on one PDF and return its list of cleaned lines."""
    # Keep only the lines between the abstract and references headings.
    lines = CropText(_read_pdf_text(path))
    # NFKD compatibility decomposition (e.g. splits ligatures into plain letters).
    lines = [unicodedata.normalize("NFKD", line) for line in lines]
    lines = _merge_hyphenated_lines(lines)

    cleaned = []
    for line in lines:
        line = _strip_links(line)
        line = re.sub(r"\d+", "", line)             # remove numbers
        line = line.lower()
        line = re.sub("[^ .!?a-zA-Z]+", '', line)   # keep letters, spaces and . ! ?
        line = re.sub("[!?]+", '.', line)           # normalise sentence terminators to '.'
        cleaned.append(line)
    return cleaned


def PreProcessing(index):
    """Clean both PDF versions of paper ``working_keys[index]``.

    Reads ``<key>-Preprint.pdf`` and ``<key>.pdf`` from the working directory and
    returns ``[preprint_lines, published_lines]``, each a list of cleaned,
    lower-cased lines. Both files go through the same pipeline; the original
    carried two copy-pasted versions of it.
    """
    preprint_lines = _clean_document(f"{working_keys[index]}-Preprint.pdf")
    published_lines = _clean_document(f"{working_keys[index]}.pdf")
    return [preprint_lines, published_lines]

In [None]:
def GetSentences(inputText):
    """Count the '.'-separated chunks of the space-joined text that hold more than one non-space character."""
    chunks = ' '.join(inputText).split('.')
    return sum(1 for chunk in chunks if len(chunk.replace(' ', '')) > 1)

def GetCleanWords(inputText):
    """Join the lines with spaces, delete every '.', split on single spaces and drop empty tokens."""
    joined = ' '.join(inputText).replace('.', '')
    # Tokens produced by split(' ') never contain a space, so "non-empty" is the whole filter.
    return [token for token in joined.split(' ') if token]

def GetWords(cleanWords):
    """Return how many entries ``cleanWords`` contains."""
    word_count = len(cleanWords)
    return word_count

def GetCharacters(cleanWords):
    """Return the summed length of all entries in ``cleanWords``."""
    return sum(map(len, cleanWords))

def GetStatistics(inputtext):
    """Return ``(sentence count, word count, character count, word list)`` for a list of cleaned lines."""
    sentence_count = GetSentences(inputtext)
    tokens = GetCleanWords(inputtext)
    return sentence_count, GetWords(tokens), GetCharacters(tokens), tokens

def GetARI(number_of_sentences, number_of_words, number_of_characters):
    """Compute 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43.

    Raises ZeroDivisionError when the word or sentence count is 0.
    """
    chars_per_word = number_of_characters / number_of_words
    words_per_sentence = number_of_words / number_of_sentences
    weighted = 4.71 * chars_per_word + 0.5 * words_per_sentence
    return weighted - 21.43

def GetFRES(number_of_sentences, number_of_words, number_of_clean_words):
    """Compute 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).

    Despite its name, ``number_of_clean_words`` is the iterable of words themselves;
    the syllable total is the sum of ``syllables.estimate`` over it.
    """
    total_syllables = sum(syllables.estimate(word) for word in number_of_clean_words)
    syllables_per_word = total_syllables / number_of_words
    words_per_sentence = number_of_words / number_of_sentences
    return (206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word))
    
def GetFORCAST(cleanWords):
    """Compute 20 - N/10, where N is the number of one-syllable words in a 150-word sample.

    The sample is drawn with a fixed seed (777), so the same word list always gives
    the same score. Syllable counts come from ``syllables.estimate``.

    Raises ValueError (from ``sample``) when ``cleanWords`` has fewer than 150 entries.
    """
    # A private generator seeded with 777 draws the same sample as seeding the module
    # RNG did, but leaves the process-wide random state untouched.
    rng = random.Random(777)
    sampled_counts = rng.sample([syllables.estimate(w) for w in cleanWords], 150)
    number_of_single_syllable_words = sum(1 for count in sampled_counts if count == 1)
    return (20 - (number_of_single_syllable_words / 10))

In [None]:
def Download(start, end):
    """Download entries ``start`` (inclusive) to ``end`` (exclusive) of ``yetToBeDownloaded``.

    A failed download is reported and skipped so one bad paper does not abort the
    batch. Progress is printed every 50 entries as a percentage of 2000.
    """
    for j in range(start, end):
        if (j % 50 == 0):
            print(f"{j*100/2000}%")
        try:
            DownloadPDF(j)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops
            # the loop; the reason is printed so failures can be diagnosed.
            print(f"Failed to download for number {j} in the yetToBeDownloaded list")
            print(f"    reason: {exc!r}")
        else:
            print(f"Succes {j}")

In [None]:
def GetMeanVectorUSE(listText):
    """Embed every entry of ``listText`` with the global ``embed`` model and average over axis 0."""
    embeddings = embed(listText)
    return np.mean(embeddings, axis=0)

def CalculateDistanceUSE(inputText1, inputText2):
    """Return the Euclidean distance between the mean embeddings of two text lists.

    Returns the sentinel -1 when either argument equals the empty list.
    """
    either_empty = (inputText1 == []) or (inputText2 == [])
    if either_empty:
        return -1
    mean_vector1 = GetMeanVectorUSE(inputText1)
    mean_vector2 = GetMeanVectorUSE(inputText2)
    return dist(mean_vector1, mean_vector2)

# Downloading the papers

In [None]:
# Set this to False if you don't have the dataset downloaded yet
alreadyDownloaded = True

In [None]:
# The download is split over several cells so an interrupted run can be resumed
# batch by batch. Indices below 1000 are saved by DownloadPDF as "<key>.pdf".
if not alreadyDownloaded:
    Download(0,500)

In [None]:
if not alreadyDownloaded:
    Download(500,750)

In [None]:
if not alreadyDownloaded:
    Download(750,1000)

In [None]:
# Indices from 1000 upwards are saved by DownloadPDF as "<key>-Preprint.pdf".
if not alreadyDownloaded:
    Download(1000,1250)

In [None]:
if not alreadyDownloaded:
    Download(1250,1500)

In [None]:
if not alreadyDownloaded:
    Download(1500,1750)

In [None]:
if not alreadyDownloaded:
    Download(1750,2000)

# Getting the usable data

In [None]:
unusable = {2205.07192, 2203.10441, 2201.12276, 2203.07551, 2202.09965, 2202.03537, 2202.03161, 2111.14712, 2203.06123, 2203.07070, 2112.15408, 2204.01142, 2201.08582, 2112.14102, 2112.06780, 2112.05964}
unusableIDs = [str(z) for z in unusable]

In [None]:
bool_idx = [all([y not in x for y in unusableIDs]) for x in all_keys]

In [None]:
working_keys = list(np.array(all_keys)[np.array(bool_idx, dtype=bool)])

In [None]:
len(working_keys)

In [None]:
working_keys

In [None]:
if (os.path.isfile("filenames.npy")):
    # Load
    all_files = np.load('filenames.npy',allow_pickle='TRUE').item()
else:
    all_files = {}
    for idx in range(len(working_keys)):
        all_files[idx] =[f"{working_keys[idx]}.pdf",f"{working_keys[idx]}-Preprint.pdf"]
    np.save('filenames.npy', all_files)

In [None]:
if (os.path.isfile("cleanText.npy")):
    # Load
    all_text = np.load('cleanText.npy',allow_pickle='TRUE').item()
else:
    all_text = {}
    for idx in range(len(working_keys)):
        if (idx%50==0):
            print(f"{idx//10}%")
        all_text[idx] = PreProcessing(idx)
    
    np.save('cleanText.npy', all_text)

# Get Statistics

In [None]:
# Per-document statistics, interleaved as [preprint 0, published 0, preprint 1, ...].
all_sentences = []
all_words = []
all_characters = []
all_clean_words = []
for t in range(len(working_keys)):
    if (t%50==0):
        print(f"{t//10}%")
    # Index 0 is the preprint text, index 1 the published text.
    for version in (0, 1):
        n_sentences, n_words, n_characters, words = GetStatistics(all_text[t][version])
        all_sentences.append(n_sentences)
        all_words.append(n_words)
        all_characters.append(n_characters)
        all_clean_words.append(words)

# Matplotlib

In [None]:
df_diff = pd.DataFrame()

In [None]:
temp_keys = []
for item in working_keys:
    temp_keys.append(item)
    temp_keys.append(item)

df_diff['ID'] = temp_keys

In [None]:
df_diff['PrePrint'] = [True if w%2==0 else False for w in range(2*len(working_keys))]

In [None]:
count=0
for idx in range(0,2*len(working_keys),2):
    count+=1
    
print(count)

In [None]:
len(all_clean_words)

In [None]:
temp_diff = []
for idx in range(len(working_keys)):
    temp_diff.append(list(set(all_text[idx][0]) - set(all_text[idx][1])))
    temp_diff.append(list(set(all_text[idx][1]) - set(all_text[idx][0])))

df_diff['Difference'] = temp_diff

In [None]:
temp_no_diff = []
for idx in range(2*len(working_keys)):
    temp_no_diff.append(df_diff['Difference'][idx] == [])

df_diff['NoDifference'] = temp_no_diff

In [None]:
temp_change = []
for idx in range(0,(2*len(working_keys))-1,2):
    temp_change.append(df_diff['Difference'][idx] != [])
    temp_change.append(df_diff['Difference'][idx] != [])
    
df_diff['TwoWayChange'] = temp_change

In [None]:
temp_empty = []
for idx in range(0,(2*len(working_keys))-1,2):
    temp_empty.append((df_diff['NoDifference'][idx] == True) and (df_diff['NoDifference'][idx+1] == True))
    temp_empty.append((df_diff['NoDifference'][idx] == True) and (df_diff['NoDifference'][idx+1] == True))
    
df_diff['BothEmpty'] = temp_empty

In [None]:
#df_diff.to_csv('all_diff.csv')

# Lexical Diversity

In [None]:
def lexical_diversity(text):
    """Return the share of distinct items in ``text`` (unique count / total count).

    ``text`` is a sequence of hashable tokens. An empty sequence yields 0.0
    instead of raising ZeroDivisionError.
    """
    if len(text) == 0:
        return 0.0
    return len(set(text)) / len(text)

lex_div = []
for i in range(len(working_keys)):
    #lex_div.append(lexical_diversity(all_clean_words[i]))
    lex_div.append(lexical_diversity(all_text[i][0]))
    lex_div.append(lexical_diversity(all_text[i][1]))

In [None]:
len(lex_div)

In [None]:
test = ""
for x in all_text[0][0]:
    test = test+" "+x
print(test)

In [None]:
df_diff.head(10)

In [None]:
create_df_all = False

try:
    # ID must be read as text: left to type inference, "2203.07070" becomes the float
    # 2203.0707 and no longer matches the string IDs in df_diff.
    df_all = pd.read_csv('all_stats.csv', index_col=[0], dtype={'ID': str})
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable cache: create the frame with placeholder values; the cells below fill
    # in the real columns when create_df_all is True. Other errors are allowed to
    # surface instead of being swallowed by a bare except.
    n_rows = 2*len(working_keys)
    df_all = pd.DataFrame()
    df_all['ID'] = ['-' for u in range(n_rows)]
    df_all['ARI'] = [-2000 for u in range(n_rows)]
    df_all['FORCAST'] = [-2000 for u in range(n_rows)]
    # Even rows are preprints, odd rows are published versions.
    df_all['PrePrint'] = [w % 2 == 0 for w in range(n_rows)]
    df_all['Sentences'] = all_sentences
    df_all['Words'] = all_words
    df_all['Characters'] = all_characters
    
    create_df_all = True

In [None]:
if create_df_all:
    temp_keys = []
    for item in working_keys:
        temp_keys.append(item)
        temp_keys.append(item)

    df_all['ID'] = df_all.ID.astype(str)
    df_all['ID'] = temp_keys

In [None]:
if create_df_all:
    temp_ARI = []
    for item in range(2*len(working_keys)):
        temp_ARI.append(GetARI(all_sentences[item], all_words[item], all_characters[item]))

    print(len(temp_ARI))
    df_all['ARI'] = temp_ARI

In [None]:
if create_df_all:
    temp_FORCAST = []
    for item in range(2*len(working_keys)):
        temp_FORCAST.append(GetFORCAST(all_clean_words[item]))

    print(len(temp_FORCAST))
    df_all['FORCAST'] = temp_FORCAST

In [None]:
if create_df_all:
    temp_FRES = []
    for item in range(2*len(working_keys)):
        temp_FRES.append(GetFRES(all_sentences[item], all_words[item], all_clean_words[item]))

    print(len(temp_FRES))
    df_all['FRES'] = temp_FRES

In [None]:
if create_df_all:
    temp_timings = []
    for item in all_timings.values():
        temp_timings.append(item)
        temp_timings.append(item)

    df_all['Duration'] = temp_timings
    
    df_all.to_csv('all_stats.csv')

In [None]:
df_all.head(10)

# Descriptive Statistics

In [None]:
df_all[df_all['ID'].isin(set(df_diff[df_diff['BothEmpty'] == False]['ID']))].describe()[:4].T

In [None]:
df_all[df_all['ID'].isin(set(df_diff[df_diff['BothEmpty'] == False]['ID']))].describe()[4:].T

# Readability Score Plot

In [None]:
def _paired_score_change(col):
    """Published-minus-preprint difference of df_all[col] for every paper whose versions differ."""
    return [
        df_all[col][i+1]-df_all[col][i]
        for i in range(0,2*len(working_keys),2)
        if df_diff['BothEmpty'][i] == False
    ]

diff_results0 = _paired_score_change('ARI')
diff_results1 = _paired_score_change('FORCAST')
diff_results2 = _paired_score_change('FRES')

In [None]:
# Horizontal boxplots of the three score-change lists (1=ARI, 2=FORCAST, 3=FRES).
fig, ax1 = plt.subplots()
ax1.boxplot([diff_results0,diff_results1,diff_results2], vert=False)
ax1.grid(axis='y', alpha=0.5)
ax1.set_xlabel('Readability Score change (post-publication minus preprint)')
ax1.set_ylabel('Method (1=ARI, 2=FORCAST, 3=FRES)')
ax1.grid(axis='x', alpha=0.5)
ax1.set_title('Readability score change after peer review');
fig.savefig("H1_1.png", dpi=fig.dpi*2)

In [None]:
def sign_test_all(lst, mu0=0):
    """Run a sign test on every sample in ``lst`` and print its statistic and p-value.

    Parameters:
        lst: iterable of numeric samples.
        mu0: hypothesised median under the null hypothesis. Defaults to 0 ("no change").

    The original passed each sample's own median as the null value, which by
    construction can never be rejected and says nothing about whether the change
    differs from 0.
    """
    for sample in lst:
        statistic, p_value = sign_test(sample, mu0)
        print(f"M: {statistic} | p-value: {p_value}")

# Shapiro test for normality
### All results show a p-value lower than 0.05, which means we can reject the null hypothesis that the data shown in the boxplot come from a normal distribution.
##### Because of that we have to use a non-parametric alternative to the one-sample t-test to figure out whether the median of any of the readability score changes differs significantly from 0.

In [None]:
# Shapiro-Wilk normality test for each readability-score change list, in ARI, FORCAST, FRES order.
for score_changes in (diff_results0, diff_results1, diff_results2):
    print(shapiro(score_changes))

## Sign test (`sign_test`), a non-parametric alternative to the one-sample t-test
#### All p-values are greater than 0.05, meaning we cannot reject the null hypothesis that the median readability score change is 0.

In [None]:
# Sign test on the ARI, FORCAST and FRES change lists; prints one "M | p-value" line per list.
sign_test_all([diff_results0,diff_results1,diff_results2])

# Lexical Diversity Plot

In [None]:
lex_div_results = []
for i in range(0,2*len(working_keys),2):
    if (df_diff['BothEmpty'][i] == False):
        lex_div_results.append(lex_div[i+1]-lex_div[i])
        
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.boxplot([lex_div_results], vert=False)
ax1.grid(axis='y', alpha=0.5)
plt.xlabel('Lexical Diversity change (post-publication minus preprint)')
plt.ylabel('Method (Lexical Diversity)')
ax1.grid(axis='x', alpha=0.5)
plt.title('Lexical Diversity change after peer review');
plt.savefig("H1_2.png", dpi=fig.dpi*2)

In [None]:
print(shapiro(lex_div_results))

In [None]:
sign_test_all([lex_div_results])

# < 5 Months  vs. 5+ Months

In [None]:
col = 'Characters'

# Split the published-minus-preprint change in `col` by review duration (seconds).
# NOTE(review): 2628000 s is one average month (365 days / 12), while the labels
# below say "5 months" - confirm which one is intended.
less_results, more_results = [], []
for i in range(0,2*len(working_keys),2):
    if not (df_diff['BothEmpty'][i] == False):
        continue
    change = df_all[col][i+1]-df_all[col][i]
    group = less_results if (df_all['Duration'][i] < 2628000) else more_results
    group.append(change)
print((len(less_results),len(more_results)))

for group in (less_results, more_results):
    print(shapiro(group))
print(mannwhitneyu(x=less_results,y=more_results, alternative = 'two-sided'))

fig, ax1 = plt.subplots()
ax1.boxplot([less_results, more_results], vert=False)
ax1.grid(axis='y', alpha=0.5)
ax1.set_xlabel(f'Amount of {col[:-1]} change after peer review')
ax1.set_ylabel('Group (1= less than 5 months, 2= at least 5 months)')
ax1.set_title(f'Amount of {col[:-1]} change');
fig.savefig('H2_1_1.png', dpi=fig.dpi*2);

In [None]:
col = 'Words'

# Split the published-minus-preprint change in `col` by review duration (seconds).
# NOTE(review): 2628000 s is one average month (365 days / 12), while the labels
# below say "5 months" - confirm which one is intended.
less_results = []
more_results = []
for i in range(0,2*len(working_keys),2):
    if (df_diff['BothEmpty'][i] == False):
        # Both operands must come from `col`; the original subtracted the preprint
        # word count from the published *character* count.
        change = df_all[col][i+1]-df_all[col][i]
        if (df_all['Duration'][i] < 2628000):
            less_results.append(change)
        else:
            more_results.append(change)
print((len(less_results),len(more_results)))

print(shapiro(less_results))
print(shapiro(more_results))
print(mannwhitneyu(x=less_results,y=more_results, alternative = 'two-sided'))
            
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.boxplot([less_results, more_results], vert=False)
ax1.grid(axis='y', alpha=0.5)
plt.xlabel(f'Amount of {col[:-1]} change after peer review')
plt.ylabel('Group (1= less than 5 months, 2= at least 5 months)')
plt.title(f'Amount of {col[:-1]} change');
plt.savefig('H2_1_2.png', dpi=fig.dpi*2);

In [None]:
col = 'Sentences'

# Split the published-minus-preprint change in `col` by review duration (seconds).
# The threshold is 2628000 s, the value the Characters and Words cells use (this
# cell previously used 2592000, so the groups were not comparable).
# NOTE(review): 2628000 s is one average month (365 days / 12), while the labels
# below say "5 months" - confirm which one is intended.
less_results = []
more_results = []
for i in range(0,2*len(working_keys),2):
    if (df_diff['BothEmpty'][i] == False):
        # Both operands must come from `col`; the original subtracted the preprint
        # sentence count from the published *character* count.
        change = df_all[col][i+1]-df_all[col][i]
        if (df_all['Duration'][i] < 2628000):
            less_results.append(change)
        else:
            more_results.append(change)
print((len(less_results),len(more_results)))

print(shapiro(less_results))
print(shapiro(more_results))
print(mannwhitneyu(x=less_results,y=more_results, alternative = 'two-sided'))
            
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.boxplot([less_results, more_results], vert=False)
ax1.grid(axis='x', alpha=0.5)
plt.xlabel(f'Amount of {col[:-1]} change after peer review')
plt.ylabel('Group (1= less than 5 months, 2= at least 5 months)')
plt.title(f'Amount of {col[:-1]} change');
plt.savefig('H2_1_3.png', dpi=fig.dpi*2);

# USE plot

In [None]:
df_need_duration_and_diff = pd.concat([df_all[["ID","Duration"]], df_diff[df_diff['BothEmpty'] == False]], axis=1, join="inner")

In [None]:
# Load pre-trained universal sentence encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
use_results = []
use_less_5_months = []
for i in range(0,2*len(working_keys),2):
    if (i%50==0):
            print(f"{100*i//(2*len(working_keys))}%")
    if (df_diff['BothEmpty'][i] == False):
        use_results.append(CalculateDistanceUSE(df_diff['Difference'][i+1],df_diff['Difference'][i]))
        use_less_5_months.append(df_need_duration_and_diff['Duration'][i])
print(len(use_results))
print(len(use_less_5_months))

In [None]:
len(use_less_5_months)

In [None]:
def TakeRandomIdx(chosen, quantity):
    """Compare one paper's preprint changes with the published changes of random other papers.

    Parameters:
        chosen: position in ``use_results`` of the paper of interest.
        quantity: number of random comparison papers; capped at the number of
            other papers available.

    Returns a list whose first element is ``abs(use_results[chosen])`` followed by
    the absolute USE distance between each random paper's published-side
    difference and the chosen paper's preprint-side difference. The draw is
    reproducible (fixed seed 777).

    NOTE: ``CalculateDistanceUSE`` returns -1 when a difference list is empty;
    ``np.abs`` turns that sentinel into 1 here.
    """
    # use_results only has entries for pairs with BothEmpty == False, so a position
    # in use_results is NOT a row of df_diff. Recover, in the same order, the
    # preprint (even) row of every pair that contributed to use_results.
    both_empty_per_pair = df_diff['BothEmpty'].to_numpy()[::2]
    pair_rows = [2*pair for pair, both_empty in enumerate(both_empty_per_pair) if not both_empty]
    if len(pair_rows) != len(use_results):
        raise ValueError("use_results is out of sync with df_diff; recompute use_results first")

    # A private generator keeps the draw reproducible without reseeding the global RNG.
    rng = random.Random(777)
    random_use = [i for i in range(len(use_results)) if i != chosen]
    rng.shuffle(random_use)

    chosen_preprint_diff = df_diff['Difference'][pair_rows[chosen]]
    output = [np.abs(use_results[chosen])]
    for _ in range(min(quantity, len(random_use))):
        other_published_diff = df_diff['Difference'][pair_rows[random_use.pop()]+1]
        output.append(np.abs(CalculateDistanceUSE(other_published_diff, chosen_preprint_diff)))
    return output

In [None]:
len(use_results)

In [None]:
if (os.path.isfile("all_use_comparisons.npy")):
    # Load
    all_use_comparisons = np.load('all_use_comparisons.npy',allow_pickle='TRUE')
    all_use_comparisons = [list(x) for x in all_use_comparisons]
else:
    all_use_comparisons = []
    for val in range(len(use_results)):
        print(f"{100*val//(len(use_results))}%")
        all_use_comparisons.append(TakeRandomIdx(val,100))
    
    np.save('all_use_comparisons.npy', all_use_comparisons)

In [None]:
len(all_use_comparisons)

In [None]:
def flatten(nested_list):
    """
    input: nested_list - an iterable of iterables.
    ------------------------
    output: one flat list holding every item of every inner iterable, in order
            (only one level of nesting is removed).
    """
    return [element for inner in nested_list for element in inner]

fig = plt.figure(figsize=(8, 6))
ax1 = fig.add_subplot(111)
pd.DataFrame([np.abs(x) for x in flatten(all_use_comparisons)]).plot(kind='density', ax=ax1, c="blue")
pd.DataFrame([np.abs(x) for x in use_results]).plot(kind='density', ax=ax1, c="darkorange")
plt.xlabel("Distance between texts (Universal Sentence Encoding)")
plt.ylabel("Density")
plt.legend(["Random paper vs post-publication comparison","Preprint vs post-publication comparison"]);
plt.title(f'Distance in terms of similarity (Random vs paper version difference)');
plt.savefig('H2_2_1.png', dpi=fig.dpi*2);

In [None]:
print(shapiro([np.abs(x) for x in flatten(all_use_comparisons)]))
print(shapiro([np.abs(x) for x in use_results]))

In [None]:
print(mannwhitneyu(x=[np.abs(x) for x in use_results],y=[np.abs(x) for x in flatten(all_use_comparisons)], alternative = 'less'))

In [None]:
left_lst_use = []
right_lst_use = []
for w in range(len(use_results)):
    if (use_less_5_months[w] < 2592000):
        left_lst_use.append(np.abs(use_results[w]))
    else:
        right_lst_use.append(np.abs(use_results[w]))

In [None]:
fig = plt.figure(figsize=(8, 6))
ax1 = fig.add_subplot(111)
ax1.boxplot([left_lst_use,right_lst_use], vert=False)
plt.xlabel("Distance between texts (Universal Sentence Encoding)")
plt.ylabel("Peer-review duration (1 = took < 5 months | 2 = took 5+ months)");
plt.title(f'Distance in terms of similarity (between categories)');
plt.savefig('H2_2_2.png', dpi=fig.dpi*2);

In [None]:
print(shapiro(left_lst_use))
print(shapiro(right_lst_use))

In [None]:
print(mannwhitneyu(x=right_lst_use,y=less_results, alternative = 'greater'))