Notebook for a frequency-based data analysis

In [None]:
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import rc

In [None]:
# Retrieve the abstracts and return them as a list of token lists
def corpus_prep(filename):
    """Read a corpus file and split it into abstracts and tokens.

    Abstracts are separated by an empty line (two consecutive newlines);
    each abstract is then split on single space characters.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per abstract, each entry being the list of
        space-separated tokens of that abstract.
    """
    # The context manager closes the file even if reading fails; the
    # handle was previously left open.
    with open(filename, "r") as f:
        text = f.read()

    # NOTE(review): split(" ") only breaks on spaces, so a single line break
    # inside an abstract stays embedded in a token. Kept as-is so the
    # resulting counts do not change.
    return [abstract.split(" ") for abstract in text.split("\n\n")]

In [None]:
# Create a frequency dictionary
def word_count(corpus):
    """Count how often each word occurs in a corpus.

    Prints the total number of words in the corpus as a side effect.

    Args:
        corpus: Iterable of sentences, each an iterable of hashable words.

    Returns:
        A dict mapping each word to its number of occurrences, ordered from
        most to least frequent. Words with equal counts keep the order in
        which they were first seen.
    """
    counts = Counter(word for sentence in corpus for word in sentence)

    total_corpus = sum(counts.values())
    print(total_corpus)

    # most_common() sorts by descending count and keeps first-seen order for
    # ties; dict() turns the result back into a plain, ordered dict.
    return dict(counts.most_common())

In [None]:
# Strips input of all punctuation and numbers
def stripper(corpus):
    """Normalise tokens: drop punctuation and digits, then lowercase.

    Args:
        corpus: Iterable of abstracts, each an iterable of string tokens.

    Returns:
        A new list of lists with the cleaned, lowercased tokens. Tokens that
        are empty after stripping are left out. The input is not modified.
    """
    # Build the deletion table once instead of once per token.
    drop_chars = str.maketrans('', '', string.punctuation + string.digits)

    cleaned = []
    for abstract in corpus:
        stripped = (token.translate(drop_chars) for token in abstract)
        # Filtering while building the list avoids repeatedly scanning the
        # list to remove '' entries one at a time.
        cleaned.append([token.lower() for token in stripped if token])
    return cleaned

In [None]:
# Load each corpus, normalise its tokens and build its frequency table.
def _load_corpus(path):
    """Return the cleaned corpus read from path together with its word counts."""
    cleaned = stripper(corpus_prep(path))
    return cleaned, word_count(cleaned)


reuters, reuters_counts = _load_corpus("Data/reuters.txt")

jstor, jstor_counts = _load_corpus("Data/jstor.txt")

arxiv, arxiv_counts = _load_corpus("Data/arxiv.txt")

In [None]:
# Create intersection sets
def intersection(x, y):
    """Split two word collections into x-only, shared and y-only words.

    Args:
        x: Re-iterable collection of hashable items (e.g. a list or the keys
            of a dict).
        y: Re-iterable collection of hashable items.

    Returns:
        A tuple (x_only, shared, y_only) of lists. x_only and shared keep
        the iteration order of x; y_only keeps the iteration order of y.
    """
    # Build the lookup sets once so every membership test is O(1), also
    # when x and y are plain lists.
    x_members = set(x)
    y_members = set(y)

    shared = [name for name in x if name in y_members]
    x_only = [name for name in x if name not in y_members]
    y_only = [name for name in y if name not in x_members]
    return x_only, shared, y_only

In [None]:
# Compare each academic corpus with the Reuters corpus. Each call returns
# (words only in the first argument, shared words, words only in Reuters).
only_jstor, jstor_reuters, only_reuters_j = intersection(jstor_counts, reuters_counts)
only_arxiv, arxiv_reuters, only_reuters_a = intersection(arxiv_counts, reuters_counts)

# Words occurring in all three corpora
_, ajr, _ = intersection(jstor_reuters, arxiv_reuters)

In [None]:
# Number of distinct words per comparison (academic-only + shared + Reuters-only)
total_jstor = sum(map(len, (only_jstor, jstor_reuters, only_reuters_j)))
total_arxiv = sum(map(len, (only_arxiv, arxiv_reuters, only_reuters_a)))


In [None]:
# For every word found in all three corpora: its [jstor, arxiv, reuters] counts
hist_dict = {
    word: [jstor_counts[word], arxiv_counts[word], reuters_counts[word]]
    for word in ajr
}

In [None]:
# Occurrence totals for the arXiv/Reuters comparison
# (Dutch names: "alleen" = only, "totaal" = total)
alleen_arxiv = sum(arxiv_counts[word] for word in only_arxiv)
alleen_reuters_a = sum(reuters_counts[word] for word in only_reuters_a)
# Shared words contribute their occurrences from both corpora
arx_reut = sum(arxiv_counts[word] + reuters_counts[word] for word in arxiv_reuters)
totaal_a = sum((alleen_arxiv, alleen_reuters_a, arx_reut))

In [None]:
# Occurrence totals for the JSTOR/Reuters comparison
# (Dutch names: "alleen" = only, "totaal" = total)
alleen_jstor = sum(jstor_counts[word] for word in only_jstor)
alleen_reuters_j = sum(reuters_counts[word] for word in only_reuters_j)
# Shared words contribute their occurrences from both corpora
jstor_reut = sum(jstor_counts[word] + reuters_counts[word] for word in jstor_reuters)
totaal_j = sum((alleen_jstor, alleen_reuters_j, jstor_reut))

In [None]:
# Create a barplot of both intersections as defined above

def intersection_plot(option):
    """Draw, save and show a stacked bar chart of the corpus intersections.

    One bar is drawn per academic field: 'Political Science' uses the JSTOR
    figures and 'Physics' the arXiv figures. Each bar is stacked, bottom to
    top, from the newspaper-only, shared and abstract-only shares. The
    figure is written to "<option>.pdf" and then displayed.

    Args:
        option: 'total_normalised' to plot shares of the total word count,
            or 'unique_normalised' to plot shares of the unique words.

    Raises:
        ValueError: If option is not one of the two supported values.
    """
    # Set parameters
    if option == 'total_normalised':
        y_label = "Number of total words (%)"
        title = "Total wordcount intersection"

        bars1 = [alleen_reuters_j / totaal_j, alleen_reuters_a / totaal_a]
        bars2 = [jstor_reut / totaal_j, arx_reut / totaal_a]
        bars3 = [alleen_jstor / totaal_j, alleen_arxiv / totaal_a]

    elif option == 'unique_normalised':
        y_label = "Unique word occurences (%)"
        title = "Unique word occurence intersection"

        bars1 = [len(only_reuters_j) / total_jstor, len(only_reuters_a) / total_arxiv]
        bars2 = [len(jstor_reuters) / total_jstor, len(arxiv_reuters) / total_arxiv]
        bars3 = [len(only_jstor) / total_jstor, len(only_arxiv) / total_arxiv]

    else:
        # Fail before anything is drawn; printing a hint and carrying on
        # would crash further down on the undefined bar values.
        raise ValueError(
            "Please specify method: 'total_normalised' or 'unique_normalised'"
        )

    # Apply the bold font setting before any label or title is created.
    # NOTE(review): it used to be applied after the title and y-label, which
    # presumably made them render differently on the first call than on
    # later calls - confirm against the rendered figures.
    rc('font', weight='bold')

    # NOTE(review): both y-labels say "(%)" but the plotted values are
    # fractions between 0 and 1; left unchanged pending a decision on
    # whether to rescale the data or reword the label.
    plt.ylabel(y_label)
    plt.title(title)

    # Create barplot

    # Height at which the top (abstracts-only) segment starts
    bars = [lower + middle for lower, middle in zip(bars1, bars2)]

    r = [0, 1]

    names = ['Political Science', 'Physics']
    barWidth = 1

    plt.bar(r, bars3, bottom=bars, color='#28752c', edgecolor='white', width=barWidth, label='Abstracts only')
    plt.bar(r, bars2, bottom=bars1, color='#aef007', edgecolor='white', width=barWidth, label='Intersection')
    plt.bar(r, bars1, color='#13b01b', edgecolor='white', width=barWidth, label='Newspapers only')

    plt.xticks(r, names, fontweight='bold')
    plt.xlabel("Academic field")

    plt.legend()

    # Save before show(), while the figure is still the current one
    plt.savefig("{}.pdf".format(option), bbox_inches='tight')

    plt.show()
    
    

# Render and save one figure per normalisation mode
for mode in ('total_normalised', 'unique_normalised'):
    intersection_plot(mode)

In [None]:
# Print the top n words from each list
def print_words(lst1, lst2, lst3, n):
    """Print the first n entries of three word lists side by side.

    The table has a fixed header ("Abstract only", "Intersection",
    "Newspaper only") and three left-aligned columns of width 30.

    Args:
        lst1: Words for the first column (sliceable sequence of strings).
        lst2: Words for the second column.
        lst3: Words for the third column.
        n: Maximum number of rows to print. Lists with fewer than n entries
            are padded with blank cells; no rows are printed beyond the
            longest list.
    """
    dash = '-' * 80
    row = '{:<30s}{:<30s}{:<30s}'

    print(dash)
    print(row.format("Abstract only", "Intersection", "Newspaper only"))
    print(dash)

    # Clamp n at zero so a negative value prints no rows instead of slicing
    # from the end of the lists.
    limit = max(n, 0)
    columns = [list(words[:limit]) for words in (lst1, lst2, lst3)]

    # Pad shorter columns so indexing can never run past the end of a list.
    depth = max(len(column) for column in columns)
    for column in columns:
        column.extend([''] * (depth - len(column)))

    for left, middle, right in zip(*columns):
        print(row.format(left, middle, right))

In [None]:
# First 100 words per group for the JSTOR/Reuters comparison
print_words(lst1=only_jstor, lst2=jstor_reuters, lst3=only_reuters_j, n=100)

In [None]:
# First 100 words per group for the arXiv/Reuters comparison
print_words(lst1=only_arxiv, lst2=arxiv_reuters, lst3=only_reuters_a, n=100)