In [None]:
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random




In [37]:
path = "/content/drive/MyDrive/textset"
def read_all_textfile(path):
    txt = []
    if not os.path.exists(path):
        print("Error: Path does not exist.")
        return txt

    for root, _, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
           # print("Reading file:", file_path)
            with open(file_path) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line != '':
                        txt.append(line)
    return txt


arr = read_all_textfile(path)
print("number of lines =", len(arr))

number of lines = 215021


In [44]:
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [45]:

def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)  # This line was causing the error
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt += words
    return cleaned_txt

# Assuming 'arr' is defined elsewhere in your code
cleaned_arr = clean_txt(arr)
print("number of words =", len(cleaned_arr))


number of words = 2332247


In [46]:
def make_markov_model(cleaned_arr, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_arr)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_arr[i+j] + " "
            next_state += cleaned_arr[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1



    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total

    return markov_model

In [48]:
markov_model = make_markov_model(cleaned_arr)
print("number of states = ", len(markov_model.keys()))


number of states =  208718


In [49]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'is afoot': 0.036036036036036036, 'was up': 0.09009009009009009, 'for the': 0.036036036036036036, 'your letter': 0.02702702702702703, 'was whist': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.027027027027027

In [56]:
def generate_text(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    text = ""
    text+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))

        curr_state = next_state[0]
        text+=curr_state+" "
        n+=1
    return text

In [60]:
for i in range(20):
    print(str(i)+". ", generate_text(markov_model, start="dear boy", limit=8))


0.  dear boy it was to take them from the master about master godfrey he had not been at 
1.  dear boy just under the head of the street and immediately whistling for a few miles and then 
2.  dear boy children come said i ought to know where he was confined in an upper room under 
3.  dear boy just under one year of age broad shouldered with crisp curling black hair and an appropriate 
4.  dear boy douglas but it so you are in a walk with the long deptford reach and up 
5.  dear boy to apologise to him now i was in time for explanations she knows everything now and 
6.  dear boy to apologise to him must have heard of this steward married to the third night of 
7.  dear boy children come said i can follow you then roused his anger by calling him names at 
8.  dear boy children come said i very likely start does it no watson i fear that mr sherlock 
9.  dear boy to apologise to him when there were these german people and what do you say to 
10.  dear boy to apologise to him when you get

In [61]:
print(generate_text(markov_model, start="the case", limit=100))


the case was coming save the loss of the old uncle with his bogie hound the latter question he put his fingertips together as they fell upon him i understood the whole situation i dont say that there had been approached he laid it across holmess knee i rose and showed a sympathy which was not due to the fact that i wished then if it costs me my neck holmes smoked for some few minutes i could have done enough for you not the bearing of this mans service that he were here i was too hard then go to london where he crouched by the window above you sterndale sprang to his throat to stifle his inclination to call out at the door to it these four and five year olds second third new course one mile and five furlongs mr heath newtons the negro red cap cinnamon jacket colonel wardlaws pugilist pink cap blue and silver but a black coat thrown over his shoulder strode off down the street holmes burst out laughing as we left them all round the hole and that third person have no such aid and yet who