In [19]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
from itertools import combinations
# Make sure to download necessary nltk resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/oa6121/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# 1. Download a decently sized corpus (between 500,000 and 2 million words) from Gutenberg.
# 2. Using regular expressions or other methods, write a function to remove any 
# extraneous stuff from the text (start/end text from Gutenberg).

def remove_extraneous(text):
    # Define the start and end patterns often found in Gutenberg texts
    start_pattern = r"\*\*\* START OF (THE|THIS) PROJECT GUTENBERG EBOOK .* \*\*\*"
    end_pattern = r"\*\*\* END OF (THE|THIS) PROJECT GUTENBERG EBOOK .* \*\*\*"
    
    # identify the start of the content
    start_match = re.search(start_pattern, text, re.IGNORECASE)
    if start_match:
        text = text[start_match.end():]  # Trim everything before the start match
    
    # identify the end of the content
    end_match = re.search(end_pattern, text, re.IGNORECASE)
    if end_match:
        text = text[:end_match.start()]  # Trim everything after the end match
    
    # Return the cleaned text
    return text.strip()

In [4]:
# Function to read text file
def read_text_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            return text
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Function to print out some text: accept the text and length to pprint
def print_out_some_text(text, length_to_print):
    if text:
        print(text[:length_to_print])  # Print first 500 characters for preview

In [7]:
# Read text GUTENBERG text file
file_path = 'the_replic_by_plato.txt'
text_content = read_text_file(file_path)


print_out_some_text(text_content, 500)

The Project Gutenberg eBook of The Republic
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title:


In [9]:
# remove extraneous stuff
text_content_after_remove_extraneous = remove_extraneous(text_content)
print_out_some_text(text_content_after_remove_extraneous, 200)

THE REPUBLIC

By Plato

Translated by Benjamin Jowett

Note: See also “The Republic” by Plato, Jowett, eBook #150


Contents

 INTRODUCTION AND ANALYSIS.
 THE REPUBLIC.
 PERSONS OF THE DIALOGUE.
 BOOK


In [15]:
# Analyze the text with PMI
# 3. Read in the text, then tokenize and lowercase it 
# (e.g., use nltk.word_tokenize and string functions.
 
def text_tokenize_and_lowercase(text):
    # Tokenize the text 
    # Note: Since the instruction is not given, stop-words are not removed. 
    tokens = word_tokenize(text)
    
    # Convert each token to lowercase
    tokens_lowercase = [token.lower() for token in tokens]
    
    return tokens_lowercase

tokens = text_tokenize_and_lowercase(text_content_after_remove_extraneous)
print_out_some_text(tokens, 50)

['the', 'republic', 'by', 'plato', 'translated', 'by', 'benjamin', 'jowett', 'note', ':', 'see', 'also', '“', 'the', 'republic', '”', 'by', 'plato', ',', 'jowett', ',', 'ebook', '#', '150', 'contents', 'introduction', 'and', 'analysis', '.', 'the', 'republic', '.', 'persons', 'of', 'the', 'dialogue', '.', 'book', 'i', '.', 'book', 'ii', '.', 'book', 'iii', '.', 'book', 'iv', '.', 'book']


In [18]:
from nltk.probability import FreqDist

# 4. Computation of unigram frequencies of the text. 
def calculate_unigram_freq(t):    
    # Calculate unigram frequencies
    freq_dist = FreqDist(t)
    
    return freq_dist


freqDist = calculate_unigram_freq(tokens)
# Print the frequencies of unigrams
for word, freq in freqDist.items():
    print(f'{word}: {freq}')

the: 15317
republic: 107
by: 1337
plato: 367
translated: 4
benjamin: 1
jowett: 2
note: 30
:: 496
see: 206
also: 404
“: 18
”: 18
,: 15215
ebook: 1
#: 1
150: 1
contents: 2
introduction: 10
and: 9490
analysis: 14
.: 6657
persons: 81
of: 10323
dialogue: 15
book: 108
i: 1783
ii: 6
iii: 5
iv: 9
v.: 6
vi: 6
vii: 6
viii: 4
ix: 8
x: 24
is: 4641
longest: 3
his: 1313
works: 28
with: 1070
exception: 11
laws: 112
certainly: 258
greatest: 119
them: 1258
there: 898
are: 2170
nearer: 16
approaches: 6
to: 5910
modern: 110
metaphysics: 5
in: 4412
philebus: 9
sophist: 17
;: 2870
politicus: 4
or: 2200
statesman: 21
more: 610
ideal: 99
form: 122
institutions: 13
state: 685
clearly: 78
drawn: 25
out: 249
as: 1690
art: 179
symposium: 7
protagoras: 9
higher: 91
excellence: 26
but: 1503
no: 702
other: 719
has: 859
same: 357
largeness: 1
view: 103
perfection: 25
style: 46
shows: 19
an: 656
equal: 30
knowledge: 292
world: 244
contains: 8
those: 242
thoughts: 30
which: 2544
new: 77
well: 257
old: 124
not: 2448
on

In [20]:
# 5. Count 2-word adjacent and non-adjacent collocations in each sentence in your text
# using a 4-word sliding window and considering all pairs of words in the window.

def count_collocations(text):
    # Initialize dictionaries to store counts for adjacent and non-adjacent collocations
    adjacent_collocations = defaultdict(int)
    non_adjacent_collocations = defaultdict(int)
    
    # first tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    for sentence in sentences:
        # secondly tokenize and lowercase each sentence
        tokens = word_tokenize(sentence.lower())
        
        # Iterate over the sentence with a 4-word sliding window
        for i in range(len(tokens) - 3):
            window = tokens[i:i + 4]  # Get 4-word sliding window
            
            # Get all 2-word within the window (combinations of words)
            word_pairs = list(combinations(window, 2))
            print("HHHHHHHHHHHHHH", word_pairs)
            for pair in word_pairs:
                # If the words are adjacent, count as adjacent collocation
                if abs(tokens.index(pair[0]) - tokens.index(pair[1])) == 1:
                    adjacent_collocations[pair] += 1
                # If the words are non-adjacent, count as non-adjacent collocation
                else:
                    non_adjacent_collocations[pair] += 1
    
    return adjacent_collocations, non_adjacent_collocations 

In [23]:
adjacent, non_adjacent = count_collocations(text_content_after_remove_extraneous)

# Print adjacent collocations
print("Adjacent collocations:")
for pair, count in adjacent.items():
    print(f"{pair}: {count}")

Adjacent collocations:
('the', 'republic'): 75
('republic', 'by'): 4
('by', 'plato'): 47
('plato', 'translated'): 3
('plato', 'by'): 4
('benjamin', 'jowett'): 3
('jowett', 'note'): 3
('note', ':'): 3
(':', 'see'): 3
('see', 'also'): 3
('also', '“'): 3
('ebook', '#'): 3
('#', '150'): 3
('150', 'contents'): 3
('contents', 'introduction'): 3
('introduction', 'and'): 4
('and', 'analysis'): 3
('analysis', '.'): 3
('persons', 'of'): 8
('of', 'the'): 1279
('the', 'dialogue'): 8
('dialogue', '.'): 5
('book', 'v.'): 9
('v.', 'book'): 2
('vi', '.'): 2
('republic', 'of'): 16
('of', 'plato'): 118
('plato', 'is'): 57
('his', 'works'): 3
('works', 'with'): 6
('laws', ','): 26
(',', 'and'): 2754
('them', '.'): 138
('there', 'are'): 252
('are', 'nearer'): 2
('nearer', 'approaches'): 3
('approaches', 'to'): 6
('to', 'modern'): 15
('modern', 'metaphysics'): 4
('metaphysics', 'in'): 3
('in', 'the'): 1227
('the', 'philebus'): 3
('philebus', 'and'): 3
('the', 'in'): 34
('philebus', 'the'): 1
('sophist', ';

In [24]:
# Print non-adjacent collocations
print("\nNon-adjacent collocations:")
for pair, count in non_adjacent.items():
    print(f"{pair}: {count}")


Non-adjacent collocations:
('the', 'by'): 126
('the', 'plato'): 75
('republic', 'plato'): 23
('republic', 'translated'): 1
('by', 'translated'): 2
('by', 'by'): 10
('translated', 'by'): 3
('plato', 'benjamin'): 1
('translated', 'benjamin'): 2
('by', 'benjamin'): 3
('translated', 'jowett'): 1
('by', 'jowett'): 3
('by', 'note'): 2
('benjamin', 'note'): 2
('benjamin', ':'): 1
('jowett', ':'): 2
('jowett', 'see'): 1
('note', 'see'): 2
('note', 'also'): 1
(':', 'also'): 3
(':', '“'): 6
('see', '“'): 2
('see', 'the'): 94
('also', 'the'): 204
('“', 'the'): 8
('also', 'republic'): 1
('“', 'republic'): 2
('“', '”'): 2
('the', '”'): 2
('republic', '”'): 3
('”', 'by'): 3
('”', 'plato'): 2
('”', ','): 2
('by', ','): 288
('plato', ','): 113
('plato', 'jowett'): 2
(',', 'jowett'): 3
(',', ','): 2747
('jowett', ','): 3
(',', 'ebook'): 4
('jowett', 'ebook'): 2
('jowett', '#'): 1
(',', '#'): 2
(',', '150'): 1
('ebook', '150'): 2
('ebook', 'contents'): 1
('#', 'contents'): 2
('#', 'introduction'): 1
('