In [1]:
# Natural Language Processing with NLTK
import nltk
import pandas as pd
from nltk.corpus import words
from nltk.metrics import jaccard_distance, edit_distance
from nltk.util import ngrams
from collections import Counter

# Download required resources
# Each call fetches one NLTK data package, or reports it as already up to
# date (see the [nltk_data] log lines in this cell's output).
# Presumably: punkt backs the tokenizers, wordnet the lemmatizer,
# averaged_perceptron_tagger backs nltk.pos_tag, and words is the vocabulary
# read by the spelling recommenders -- TODO confirm against the installed
# NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [2]:
# Load the plot summaries text
# The whole file is read into memory as one UTF-8 decoded string; the path is
# relative to the kernel's working directory.
with open('plots.txt', 'rt', encoding="utf8") as file:
    raw_plots_text = file.read()

# Tokenize the raw plots into word tokens
# `tokens` is the token sequence used by the counting functions below;
# `text_tokens` is the same sequence passed through nltk.Text, which the
# lemmatizing, longest-token, sentence and POS functions iterate over.
tokens = nltk.word_tokenize(raw_plots_text)
text_tokens = nltk.Text(tokens)


In [3]:

# -------------------------
# PART 1 - TEXT ANALYSIS
# -------------------------

# 1. Count total number of tokens
def get_total_token_count():
    """Return the size of the corpus in tokens, repeats included.

    Reads the module-level ``tokens`` sequence built when the text was loaded.
    """
    total = len(tokens)
    return total

# 2. Count number of unique tokens
def get_unique_token_count():
    """Return how many distinct tokens occur in the corpus.

    Comparison is case-sensitive, so ``'The'`` and ``'the'`` count separately.
    Reads the module-level ``tokens`` sequence.
    """
    # dict keys are unique, so the key count equals the distinct-token count.
    distinct_tokens = dict.fromkeys(tokens)
    return len(distinct_tokens)

# 3. Count unique tokens after lemmatizing every token as a verb
def get_unique_lemmatized_verbs():
    """Return the number of distinct tokens left after verb-lemmatization.

    Every token in the module-level ``text_tokens`` is passed through
    ``WordNetLemmatizer.lemmatize(token, 'v')``; no part-of-speech filtering
    is done, so non-verbs are included (typically unchanged).

    Returns:
        int: number of distinct lemmatized forms.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Lemmatize each distinct token once: the resulting set is the same as
    # lemmatizing every occurrence, with far fewer lemmatizer calls.
    distinct_tokens = set(text_tokens)
    lemmatized_forms = {lemmatizer.lemmatize(w, 'v') for w in distinct_tokens}
    return len(lemmatized_forms)

# 4. Calculate lexical diversity: unique tokens / total tokens
def calculate_lexical_diversity():
    """Return the ratio of distinct tokens to total tokens.

    Reads the module-level ``tokens`` sequence. Comparison is case-sensitive.

    Returns:
        float: a value in (0, 1] for a non-empty corpus, or ``0.0`` when the
        corpus has no tokens (instead of raising ``ZeroDivisionError``).
    """
    total = len(tokens)
    if total == 0:
        # Diversity is undefined for an empty corpus; report 0.0 by convention.
        return 0.0
    return len(set(tokens)) / total

# 5. Percentage of 'love' or 'Love' in the text
def percentage_of_love_words():
    """Return the percentage of tokens that are exactly 'love' or 'Love'.

    Only these two spellings are matched; other casings such as 'LOVE' and
    derived words such as 'loves' are not counted. Reads the module-level
    ``tokens`` sequence.

    Returns:
        float: percentage in [0, 100], or ``0.0`` when the corpus has no
        tokens (instead of raising ``ZeroDivisionError``).
    """
    total = len(tokens)
    if total == 0:
        return 0.0
    love_forms = {'love', 'Love'}
    count_love = sum(1 for w in tokens if w in love_forms)
    return (count_love / total) * 100

# 6. Top 20 most frequent tokens
def get_top_20_tokens():
    """Return the 20 most frequent tokens as ``(token, count)`` pairs.

    Pairs are ordered from most to least frequent. Punctuation and
    differently-cased forms are counted as separate tokens. Reads the
    module-level ``tokens`` sequence.
    """
    occurrences = Counter()
    occurrences.update(tokens)
    top_twenty = occurrences.most_common(20)
    return top_twenty

# 7. Tokens longer than 5 characters and occur more than 200 times
def frequent_long_tokens(longer_than=5, more_than=200):
    """Return the sorted tokens that are both long and frequent.

    Reads the module-level ``tokens`` sequence.

    Args:
        longer_than: a token qualifies only if its length is strictly
            greater than this value (default 5).
        more_than: a token qualifies only if it occurs strictly more often
            than this value (default 200).

    Returns:
        list[str]: qualifying tokens in ascending (case-sensitive) sort
        order. With the defaults this matches the original fixed thresholds.
    """
    freq_dist = Counter(tokens)
    filtered = [
        token
        for token, count in freq_dist.items()
        if len(token) > longer_than and count > more_than
    ]
    return sorted(filtered)

# 8. Longest token in the text and its length
def find_longest_token():
    """Return ``(token, length)`` for the longest token in the corpus.

    When several tokens share the maximum length, the one that appears first
    in the module-level ``text_tokens`` wins. Raises ``ValueError`` if there
    are no tokens at all.
    """
    # First pass: find the greatest length; second pass: first token with it.
    greatest_length = max(map(len, text_tokens))
    winner = next(tok for tok in text_tokens if len(tok) == greatest_length)
    return (winner, greatest_length)

# 9. Unique words with frequency > 2000 (alphabetic only)
def highly_frequent_words():
    """Return ``(count, word)`` pairs for alphabetic tokens seen over 2000 times.

    Tokens containing any non-alphabetic character (punctuation, digits,
    clitics such as ``'s``) are excluded. Pairs are sorted in descending
    order, so the most frequent word comes first and ties on count fall back
    to reverse alphabetical order. Reads the module-level ``tokens`` sequence.
    """
    tally = Counter(tokens)
    pairs = []
    for word in tally:
        occurrences = tally[word]
        if occurrences > 2000 and word.isalpha():
            pairs.append((occurrences, word))
    pairs.sort(reverse=True)
    return pairs

# 10. Average number of tokens per sentence
def avg_tokens_per_sentence():
    """Return the mean number of word tokens per sentence.

    Sentences are detected on the original ``raw_plots_text`` rather than on
    a string rebuilt by joining tokens with spaces: the rebuilt string has
    punctuation split off and quotes rewritten by the word tokenizer, which
    can move sentence boundaries.

    Reads the module-level ``raw_plots_text`` and ``tokens``.

    Returns:
        float: ``len(tokens) / number_of_sentences``, or ``0.0`` when no
        sentence is found (instead of raising ``ZeroDivisionError``).
    """
    sentences = nltk.sent_tokenize(raw_plots_text)
    if not sentences:
        return 0.0
    return len(tokens) / len(sentences)

# 11. Top 5 part-of-speech tags
def top_pos_tags():
    """Return the five most common part-of-speech tags as ``(tag, count)`` pairs.

    Tags the module-level ``text_tokens`` with ``nltk.pos_tag`` and tallies
    how often each tag is assigned; pairs are ordered most frequent first.
    """
    tag_tally = Counter()
    for _token, tag in nltk.pos_tag(text_tokens):
        tag_tally[tag] += 1
    return tag_tally.most_common(5)


In [4]:
# -------------------------
# PART 2 - SPELLING RECOMMENDERS
# -------------------------

# 12. Recommender using Jaccard distance on trigrams
def spelling_recommender_jaccard_trigrams(misspelled=['cormulent', 'incendenece', 'validrate']):
    """Suggest a correction for each word using Jaccard distance on character trigrams.

    For every misspelled word the candidates are the entries of the NLTK
    ``words`` corpus that start with the same first character and have at
    least 3 characters. The candidate whose trigram set has the smallest
    Jaccard distance to the misspelled word's trigram set is chosen; ties go
    to the candidate that appears first in the corpus.

    Args:
        misspelled: iterable of words to correct. It is only read, never
            modified.

    Returns:
        list[str]: one suggestion per input word, in input order.

    Raises:
        ValueError: if a word has no candidate in the corpus.
        IndexError: if a word is the empty string.
    """
    # Read and length-filter the corpus once; it does not depend on the word.
    vocabulary = [w for w in words.words() if len(w) >= 3]
    suggestions = []
    for word in misspelled:
        # The misspelled word's trigram set is the same for every candidate.
        target_grams = set(ngrams(word, 3))
        candidates = (w for w in vocabulary if w.startswith(word[0]))
        best = min(
            candidates,
            key=lambda w: jaccard_distance(target_grams, set(ngrams(w, 3))),
        )
        suggestions.append(best)
    return suggestions

# 13. Recommender using Jaccard distance on 4-grams
def spelling_recommender_jaccard_fourgrams(misspelled=['cormulent', 'incendenece', 'validrate']):
    """Suggest a correction for each word using Jaccard distance on character 4-grams.

    For every misspelled word the candidates are the entries of the NLTK
    ``words`` corpus that start with the same first character and have at
    least 4 characters. The candidate whose 4-gram set has the smallest
    Jaccard distance to the misspelled word's 4-gram set is chosen; ties go
    to the candidate that appears first in the corpus.

    Args:
        misspelled: iterable of words to correct. It is only read, never
            modified.

    Returns:
        list[str]: one suggestion per input word, in input order.

    Raises:
        ValueError: if a word has no candidate in the corpus.
        IndexError: if a word is the empty string.
    """
    # Read and length-filter the corpus once; it does not depend on the word.
    vocabulary = [w for w in words.words() if len(w) >= 4]
    suggestions = []
    for word in misspelled:
        # The misspelled word's 4-gram set is the same for every candidate.
        target_grams = set(ngrams(word, 4))
        candidates = (w for w in vocabulary if w.startswith(word[0]))
        best = min(
            candidates,
            key=lambda w: jaccard_distance(target_grams, set(ngrams(w, 4))),
        )
        suggestions.append(best)
    return suggestions

# 14. Recommender using Edit Distance (Levenshtein with transpositions)
def spelling_recommender_edit_distance(misspelled=['cormulent', 'incendenece', 'validrate']):
    """Suggest a correction for each word using edit distance with transpositions.

    For every misspelled word the candidates are the entries of the NLTK
    ``words`` corpus that start with the same first character. The candidate
    with the smallest ``edit_distance(word, candidate, transpositions=True)``
    is chosen; ties go to the candidate that appears first in the corpus.

    Args:
        misspelled: iterable of words to correct. It is only read, never
            modified.

    Returns:
        list[str]: one suggestion per input word, in input order.

    Raises:
        ValueError: if a word has no candidate in the corpus.
        IndexError: if a word is the empty string.
    """
    # Read the corpus once; it does not depend on the misspelled word.
    vocabulary = words.words()
    suggestions = []
    for word in misspelled:
        candidates = (w for w in vocabulary if w.startswith(word[0]))
        best = min(
            candidates,
            key=lambda w: edit_distance(word, w, transpositions=True),
        )
        suggestions.append(best)
    return suggestions

In [5]:
# Run every analysis function and print its result next to a label.
# All of these read the module-level tokens / text_tokens built in the
# loading cell, so that cell must have been executed first.
print("Total Tokens:", get_total_token_count())
print("Unique Tokens:", get_unique_token_count())
print("Unique Lemmatized Verbs:", get_unique_lemmatized_verbs())
print("Lexical Diversity:", calculate_lexical_diversity())
print("Love Word %:", percentage_of_love_words())
print("Top 20 Tokens:", get_top_20_tokens())
print("Frequent Long Tokens:", frequent_long_tokens())
print("Longest Token:", find_longest_token())
print("Highly Frequent Words:", highly_frequent_words())
print("Avg Tokens per Sentence:", avg_tokens_per_sentence())
print("Top POS Tags:", top_pos_tags())
# The spelling recommenders are called with their default misspelled words.
print("Recommender (Jaccard Trigram):", spelling_recommender_jaccard_trigrams())
print("Recommender (Jaccard 4-gram):", spelling_recommender_jaccard_fourgrams())
print("Recommender (Edit Distance):", spelling_recommender_edit_distance())

Total Tokens: 374446
Unique Tokens: 25928
Unique Lemmatized Verbs: 21755
Lexical Diversity: 0.06924362925495264
Love Word %: 0.12391639916035956
Top 20 Tokens: [(',', 19420), ('the', 18698), ('.', 16629), ('to', 12149), ('and', 11400), ('a', 8979), ('of', 6510), ('is', 5699), ('in', 5109), ('his', 4693), ("'s", 3682), ('her', 3674), ('he', 3556), ('that', 3517), ('with', 3293), ('him', 2570), ('for', 2433), ('by', 2321), ('The', 2234), ('on', 1925)]
Frequent Long Tokens: ['However', 'Meanwhile', 'another', 'because', 'becomes', 'before', 'begins', 'daughter', 'decides', 'escape', 'family', 'father', 'friend', 'friends', 'himself', 'killed', 'leaves', 'mother', 'people', 'police', 'returns', 'school', 'through']
Longest Token: ('live-for-today-for-tomorrow-we-die', 34)
Highly Frequent Words: [(18698, 'the'), (12149, 'to'), (11400, 'and'), (8979, 'a'), (6510, 'of'), (5699, 'is'), (5109, 'in'), (4693, 'his'), (3674, 'her'), (3556, 'he'), (3517, 'that'), (3293, 'with'), (2570, 'him'), (243