In [1]:
import urllib.request
from string import punctuation
from string import whitespace
import numpy as np


In [2]:
def cleanse(word):
    '''
    This function removes punctuation and whitespace from word,
    the string that we pass as an argument

    Parameters
    word: str

    Returns
    cleaned: str
    '''

    cleaned = '' # we will only include non whitespace, non punctuation characters
    for char in word:
        if ((char in whitespace) or (char in punctuation)):
            pass
        else:
            cleaned += char.lower()
    return cleaned

In [3]:
def gatherBook(url):
    with urllib.request.urlopen(url) as file_object:
        # *** demarcates actual text of book in gutenberg files
        words = file_object.read().decode('utf-8-sig').split('***')[2]
        words = words.split()
    return words

In [4]:
if __name__ == '__main__':
    word = 'hey there,. '
    print(cleanse(word))

heythere


In [5]:
def make_word_dict(word_list):
    '''
    This function will create a dictionary of key-value pairs of form word-wordcount,
    with wordcount being the number of times each respective word appears in word_list

    Parameters
    word_list: list 
        list of words created from full text file, 
        preferably cleansed of punctuation and whitespace
    
    Returns
    myDict: dict 
        dict of key-value pairs of word-wordcount
    
    '''

    myDict = {}
    for word in word_list:
        if word not in myDict:
            myDict[word] = 1 # add word to myDict if word not already in myDict
        else:
            myDict[word] += 1 # increase value += 1 if word already in myDict

    return myDict


In [6]:
if __name__ == '__main__':
    url = 'http://www.gutenberg.org/cache/epub/61995/pg61995.txt'
    words = gatherBook(url)
    bookDict = make_word_dict(words)
    print(bookDict)



{'Produced': 1, 'by': 215, 'MWS,': 1, 'Martin': 1, 'Pettit': 1, 'and': 2716, 'the': 3441, 'Online': 1, 'Distributed': 1, 'Proofreading': 1, 'Team': 1, 'at': 468, 'https://www.pgdp.net': 1, '(This': 1, 'file': 1, 'was': 1040, 'produced': 6, 'from': 154, 'images': 1, 'generously': 1, 'made': 71, 'available': 2, 'The': 460, 'Internet': 1, 'Archive/American': 1, 'Libraries.)': 1, '[Illustration:': 4, 'With': 6, 'these': 45, 'words,': 12, 'he': 513, 'left': 34, 'somewhat': 18, 'desolate': 4, 'little': 648, 'girl.--_Page': 1, '30._]': 1, 'A': 31, 'GIRL': 3, 'OF': 17, 'HIGH': 2, 'ADVENTURE': 2, 'BY': 3, 'MRS.': 1, 'L.': 2, 'T.': 1, 'MEADE': 1, 'AUTHOR': 1, '"OCEAN\'S': 1, 'GIRLHOOD,"': 1, '"A': 13, 'WILD': 1, 'IRISH': 3, 'GIRL,"': 2, '"THE': 5, 'GIRLS': 4, 'MERTON': 1, 'COLLEGE,"': 1, '"FOR': 1, 'DEAR': 1, 'DAD,"': 1, '"KITTY': 1, 'O\'DONOVAN,"': 1, '"PEGGY': 1, 'FROM': 1, 'KERRY,"': 1, 'CHESTERTON': 1, 'GRADUATES,"': 1, "KING'S": 1, 'ROYAL,"': 1, 'LADY': 1, 'JERRY': 1, "BOY'S": 1, 'DREAMS,"'

In [7]:
def make_markov_dict(cleansedWords, n):
    '''
    This function will create a dictionary whose keys are n-word-length prefixes,
    and whose values are lists of 1-word suffixes that follow those prefixes.
    The longer the prefix, the more specific that prefix is, and as such, the value
    corresponding to that prefix, which is a list of suffixes, will be shorter

    the values in this dictionary will be lists of words. These lists, which will 
    include repeats, will help us determine, probabilistically, which word/suffix
    should follow any argued n-word-length prefix

    Parameters
    cleansedWords: list 
        list of words with punctuation and whitespace stripped
    n: int 
        length of prefix that you want to be utilized in markov analysis
    
    Returns
    markovDict: dict 
        dictionary of form key = 'word1 word2...wordn', value = [worda, wordb, wordc, ...]
        where word1 word2...wordn are all possible n-length prefixes gathered from text,
        and worda, wordb,... are all possible suffixes for each of those n-length prefixes

    '''

    preSubList = []
    for i in range(len(cleansedWords) - n):
        preSubList.append(cleansedWords[i:i+n+1])
        # this will create a list of lists of words so we can analyze n-word prefixes
        # so if you want to analyze suffixes for a a 2-word entry, you create a dictionary
        # with all two-word entries as keys, and we'll then enter suffixes as values in that dict
    markovDict = {}
    for _ in preSubList:
        strings = _[0]
        for i in range(1, n):
            strings = strings + ' ' + _[i] # we're creating a string of n-length, which is our prefix length
        if strings not in markovDict:
            markovDict[strings] = [_[n]] #add suffix, which is _[n] to our dictionary
        else:
            markovDict[strings].append(_[n]) # our values are lists, so we append to them if a prefix is already present as a key

    return markovDict


In [13]:
markovDict

{'Produced by': ['MWS,'],
 'by MWS,': ['Martin'],
 'MWS, Martin': ['Pettit'],
 'Martin Pettit': ['and'],
 'Pettit and': ['the'],
 'and the': ['Online',
  'lesson',
  'cakes',
  'best',
  'stroke',
  'young',
  'young',
  'feeling',
  'rest',
  'garden',
  'housemaid,',
  'best',
  'child',
  'little',
  'brown',
  'place',
  'young',
  'young',
  'old',
  'youngest.',
  'soft',
  'time',
  'pushkeen',
  'weeks',
  'youngest',
  'old',
  'measles,',
  'immediate',
  '_très',
  'Comte',
  'young',
  'young',
  'good',
  'head',
  'robes',
  'old',
  'baby',
  'next',
  'child',
  'Comte',
  'three',
  'baby',
  'other',
  'notary',
  'Irish,"',
  'old',
  'gayest.',
  'next',
  'Comtesse',
  'brown',
  'woman',
  'money',
  'title',
  'robes',
  'red',
  'top',
  'brilliant',
  'robes',
  'chapeaux',
  'fans',
  '_gants_',
  'other',
  'bees',
  'abrupt',
  'doctor',
  'slim',
  'bees,',
  "farmers'",
  'thin',
  '_dot_',
  'lightning',
  'thunder',
  'clear,',
  'air',
  '_Reparation_,'

In [8]:
if __name__ == '__main__':
    url = 'http://www.gutenberg.org/cache/epub/61995/pg61995.txt'
    words = gatherBook(url)
    
    markovDict = make_markov_dict(words, 2)
    ml = list(enumerate(markovDict.items()))
    print(ml[200:210])
    


[(200, ('and say,', ['Obey'])), (201, ('say, Obey', ['and'])), (202, ('Obey and', ['please'])), (203, ('and please', ['my', 'come'])), (204, ('please my', ['lord'])), (205, ('my lord', ['and'])), (206, ('lord and', ['lady,'])), (207, ('and lady,', ['So'])), (208, ('lady, So', ['God'])), (209, ('So God', ['shall']))]


In [9]:
def generate_suffixes(markovDict, prefix):
    '''
    Parameters
    markovDict : dict
        dictionary created using function make_markov_dict
    prefix : str
        prefix that you would like to generate suffixes for

    Returns
    sortedTups : list of tuples
        list of sorted tuples containing (suffix, count), corresponding to prefix input.
    alternatively, if you argue a prefix that is not present in markovDict, then a message will alert you
    '''

    tupList = []
    seenList = []
    try:
        for _ in markovDict[prefix]:
            if _ not in seenList:
                seenList.append(_)
                tupList.append((_, markovDict[prefix].count(_)))
                # this list of tuples does not HAVE to be sorted, but it's useful if we want 
                # to quickly visualize the frequency of suffixes for a given prefix
        sortedTups = sorted(tupList, key = lambda x: x[1], reverse = True)
        
        return sortedTups

    except (KeyError):
        return f'"{prefix}" is not present as a key in your markov Dictionary'

In [14]:
if __name__ == '__main__':
    url = 'http://www.gutenberg.org/cache/epub/61995/pg61995.txt'
    words = gatherBook(url)
    
    markovDict = make_markov_dict(words, 2)
    
    sortedTups = generate_suffixes(markovDict, 'He said')
    print(sortedTups)

    

[('he', 2), ('to', 1), ('his', 1), ('that', 1)]


In [16]:
sortedTups = generate_suffixes(markovDict, 'yolo ay')
print(sortedTups)


"yolo ay" is not present as a key in your markov Dictionary


In [11]:
def predict_sentence(cleansed_words, prefix, n):
    '''
    Parameters
    cleansed_words : list
        List of words from text with whitespace and punctuation removed (though the function will also accept words with punctuation).
    prefix : str
        String you want your generated sentence to begin with.
    n : int
        Number of times you want predict_sentence to predict a suffix.

    Returns
    sentence : str
        Predicted sentence with n suffixes predicted.

    '''
    multiDict = make_markov_dict(cleansed_words, len(prefix.split()))
    prefix = prefix.lower()
    # we convert to lower because our cleanse function automatically converts to lower
    sentence = prefix #initialize our sentence to just be our prefix
    
    for _ in range(n-1):
        sortedTups = generate_suffixes(multiDict, prefix) #find suffixes for our prefix from multiDict
        words, nums = list(zip(*sortedTups)) #doing this to produce a list of frequences we can use for np.random.choice
        denom = sum(nums)
        probs = [_ / denom for _ in nums]
        suffix = np.random.choice(words, p = probs) 
        #select suffix based on probability of being chosen
        #np.random.choice is useful here because it maps list of words to list of thier frequencies (probs)
        sentence = sentence + ' ' +  suffix #add suffix to our sentence
        
        prefix = sentence.split()[-len(prefix.split()):] # redefining prefix by dropping first word from previous prefix
        prefix = (' ').join(prefix)
        
    return sentence

In [12]:
if __name__ == '__main__':
    url = 'http://www.gutenberg.org/cache/epub/63632/pg63632.txt'
    words = gatherBook(url)

    cleansedWords = [cleanse(word) for word in words]

    print(predict_sentence(cleansedWords, 'he said', 10)) 
    # this will predict an 11-word sentence (first input word, plus 10 suffixes)
    
    

he said slowly this is for me mr mulhane the allied


### N Gram

In [25]:
import re
import nltk
import random
import urllib.request
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anies\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anies\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Removing stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and re.match(r'^\w+$', token)]
    
    return tokens


In [31]:


def build_ngram_model(tokens, n=2):
    ngram_model = {}
    for i in range(len(tokens)-n):
        gram = ' '.join(tokens[i:i+n])
        next_token = tokens[i+n]
        if gram not in ngram_model:
            ngram_model[gram] = []
        ngram_model[gram].append(next_token)
    return ngram_model


In [35]:
def generate_text(ngram_model, seed_text, max_length=50):
    current_text = seed_text.lower()
    n = len(seed_text.split())
    for _ in range(max_length):
        last_n_tokens = current_text.split()[-n:]
        last_n_gram = ' '.join(last_n_tokens)
        if last_n_gram in ngram_model:
            next_token = random.choice(ngram_model[last_n_gram])
            current_text += ' ' + next_token
        else:
            break
    return current_text.capitalize()


In [39]:
# Acquire data from the URL
url = 'http://www.gutenberg.org/cache/epub/61995/pg61995.txt'
with urllib.request.urlopen(url) as file_object:
    text = file_object.read().decode('utf-8-sig').split('***')[2]

# Preprocess the text
tokens = preprocess_text(text)



In [40]:
ngram_model = build_ngram_model(tokens, n=2)  # Adjust n-gram size as needed
seed_text = "The"
generated_text = generate_text(ngram_model, seed_text, max_length=100)  # Adjust max_length as needed
print("Generated Text:")
print(generated_text)



Generated Text:
The
