In [1]:
import random, requests
import string,re

In [2]:
import nltk
from nltk.util import ngrams
from collections import Counter

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
sample_data = "This is an example corpus to find ngrams from text"
words = sample_data.split()
unigrams_eg = words
bigrams_eg = list(ngrams(words, 2))
trigrams_eg = list(ngrams(words, 3))

In [6]:
print("Unigrams:")
for unigram in unigrams_eg:
    print(unigram)

print("\nBigrams:")
for bigram in bigrams_eg:
    print(bigram)

print("\nTrigrams:")
for trigram in bigrams_eg:
    print(trigram)

Unigrams:
This
is
an
example
corpus
to
find
ngrams
from
text

Bigrams:
('This', 'is')
('is', 'an')
('an', 'example')
('example', 'corpus')
('corpus', 'to')
('to', 'find')
('find', 'ngrams')
('ngrams', 'from')
('from', 'text')

Trigrams:
('This', 'is')
('is', 'an')
('an', 'example')
('example', 'corpus')
('corpus', 'to')
('to', 'find')
('find', 'ngrams')
('ngrams', 'from')
('from', 'text')


## Corpus of data

In [7]:
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
response = requests.get(url)
text = response.text

In [8]:
main_text = text.lower()
tokens = nltk.word_tokenize(main_text)

In [9]:
cleaned_tokens = []
for token in tokens:
    cleaned_token = re.sub(r'[^\w\s]', '', token)
    if cleaned_token and not cleaned_token.isdigit():
        cleaned_tokens.append(cleaned_token)

In [10]:
unigrams = cleaned_tokens
bigrams = list(ngrams(cleaned_tokens, 2))
trigrams = list(ngrams(cleaned_tokens, 3))

In [11]:
unigram_freq = Counter(unigrams)
bigram_freq = Counter(bigrams)
trigram_freq = Counter(trigrams)

In [12]:
top_unigrams = unigram_freq.most_common(20)
top_bigrams = bigram_freq.most_common(20)
top_trigrams = trigram_freq.most_common(20)

In [13]:
print(f"Total tokens: {len(cleaned_tokens)}")
print(f"Unique unigrams: {len(unigram_freq)}")
print(f"Unique bigrams: {len(bigram_freq)}")
print(f"Unique trigrams: {len(trigram_freq)}")

Total tokens: 128323
Unique unigrams: 7068
Unique bigrams: 57344
Unique trigrams: 107668


In [14]:
print("\nTop 20 Unigrams:")
for item, count in top_unigrams:
    print(f"{item}: {count}")


Top 20 Unigrams:
the: 4656
to: 4298
of: 3841
and: 3751
her: 2260
i: 2098
a: 2033
in: 1977
was: 1871
she: 1732
not: 1606
that: 1605
it: 1587
he: 1347
you: 1342
his: 1289
be: 1263
as: 1228
had: 1181
with: 1102


In [15]:
print("\nTop 20 Bigrams:")
for item, count in top_bigrams:
    print(f"{item}: {count}")


Top 20 Bigrams:
('of', 'the'): 509
('to', 'be'): 446
('in', 'the'): 419
('i', 'am'): 311
('mr', 'darcy'): 276
('of', 'her'): 274
('to', 'the'): 264
('it', 'was'): 255
('of', 'his'): 241
('she', 'was'): 210
('it', 'is'): 207
('she', 'had'): 204
('had', 'been'): 203
('i', 'have'): 188
('to', 'her'): 180
('that', 'he'): 179
('could', 'not'): 172
('and', 'the'): 169
('for', 'the'): 165
('he', 'had'): 163


In [16]:
print("\nTop 20 Trigrams:")
for item, count in top_trigrams:
    print(f"{item}: {count}")


Top 20 Trigrams:
('i', 'do', 'not'): 65
('i', 'am', 'sure'): 64
('as', 'soon', 'as'): 56
('she', 'could', 'not'): 53
('i', 'can', 'not'): 51
('that', 'he', 'had'): 37
('in', 'the', 'world'): 35
('it', 'would', 'be'): 34
('_copyright', 'by', 'george'): 34
('by', 'george', 'allen_'): 34
('i', 'am', 'not'): 32
('i', 'dare', 'say'): 30
('it', 'was', 'not'): 30
('that', 'he', 'was'): 30
('mr', 'darcy', 's'): 30
('as', 'well', 'as'): 29
('could', 'not', 'be'): 29
('would', 'have', 'been'): 28
('that', 'it', 'was'): 28
('of', 'mr', 'darcy'): 28
