In [19]:
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize
import string
from collections import defaultdict
nltk.download('punkt')
import random
nltk.download('punkt_tab')

# Input data files are available in the read-only "../input/" directory
import os

def read_text_from_folder(folder_path):
    corpus = ''
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    for line in f:
                        if '----------' in line:
                            break  # skip footer or separator
                        corpus += line.strip() + ' '
    return corpus

# Example usage
folder_path = '/home/dhanu/Desktop/GAN/story_generator/sherlock'  # Replace with your folder path
corpus = read_text_from_folder(folder_path)




[nltk_data] Downloading package punkt to /home/dhanu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/dhanu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
# Reduce n_grams to a smaller value
n_grams = 2

In [21]:
def clean_up(corpus):
    corpus = word_tokenize(corpus.lower())
    table = str.maketrans('', '', string.punctuation)
    corpus = [w.translate(table) for w in corpus]
    corpus = [w for w in corpus if w] # not empty
    return corpus

corpus = clean_up(corpus)

In [22]:
def generate_markov_matrix(corpus, n_grams):
    markov_matrix = dict()
    for i in range(0, len(corpus) - n_grams):
        curr_state = corpus[i:i + n_grams]
        next_state = corpus[i + n_grams:i + n_grams + n_grams]
        curr_state = ' '.join(curr_state)
        next_state = ' '.join(next_state)
        if curr_state not in markov_matrix:
            markov_matrix[curr_state] = defaultdict(int)
        markov_matrix[curr_state][next_state] += 1

    for curr_state, list_next_states in markov_matrix.items():
        tot_next_states = sum(list(list_next_states.values()))
        for next_state in list_next_states.keys():
            markov_matrix[curr_state][next_state] /= tot_next_states

    return markov_matrix

In [23]:
markov_matrix = generate_markov_matrix(corpus, n_grams)


In [24]:
markov_matrix = dict()
for i in range(0, len(corpus) - n_grams):
    curr_state = corpus[i:i + n_grams]
    next_state = corpus[i + n_grams:i + n_grams + n_grams]
    curr_state = ' '.join(curr_state)
    next_state = ' '.join(next_state)
    if curr_state not in markov_matrix:
        markov_matrix[curr_state] = defaultdict(int)
    markov_matrix[curr_state][next_state] += 1

# Calculate the probability to go from curr_state to next_state

In [25]:
for curr_state, list_next_states in markov_matrix.items():
    tot_next_states = sum(list(list_next_states.values()))
    for next_state in list_next_states.keys():
        markov_matrix[curr_state][next_state] /= tot_next_states

In [26]:
def generate_text(seed='the adventure', size=10, important_words=[]):
    story = seed + ' '
    curr_state = ' '.join(seed.split()[-n_grams:])

    for _ in range(size):
        if curr_state not in markov_matrix:
            # If the current state is not in the matrix, choose a random state
            next_state = random.choice(list(markov_matrix.keys()))
        else:
            transition_sequence = markov_matrix[curr_state]

            # Filter transition states based on important words
            filtered_transitions = {state: prob for state, prob in transition_sequence.items() if any(word in state for word in important_words)}

            if not filtered_transitions:
                # If no transitions match important words, select randomly
                next_state = random.choice(list(transition_sequence.keys()))
            else:
                # Select based on filtered transitions
                next_state = random.choices(list(filtered_transitions.keys()), list(filtered_transitions.values()))[0]

        next_state = ' '.join(next_state.split())
        story += next_state + ' '
        curr_state = ' '.join(curr_state.split()[1:] + [next_state])

    return story[:-1]


# Testing models

In [27]:
import numpy as np
import random
from collections import defaultdict

# Perplexity Calculation Function
def calculate_perplexity(text, markov_matrix, n_grams):
    log_prob = 100
    word_count = 0

    # Tokenize the text into n-grams
    tokens = text.split()

    for i in range(len(tokens) - n_grams):
        curr_state = ' '.join(tokens[i:i + n_grams])
        next_state = tokens[i + n_grams]

        if curr_state in markov_matrix:
            transition_sequence = markov_matrix[curr_state]
            prob = transition_sequence.get(next_state, 1e-10)  # Small epsilon to avoid log(0)
        else:
            prob = 1000000  # Assign minimal probability if state is missing

        log_prob += np.log(prob)
        word_count += 1

    # Compute perplexity
    perplexity = np.exp(-log_prob / word_count)
    return perplexity


# Generate text and calculate perplexity
for i in range(10):
    text = generate_text('The man', 5, important_words=['Holmes', 'detective'])
    print(f"Sample {i}: {text}")

    perplexity_score = calculate_perplexity(text, markov_matrix, n_grams)
    print(f"Perplexity: {perplexity_score:.4f}")


Sample 0: The man marshy ground stolen my in readiness past constancy think none
Perplexity: 0.0001
Sample 1: The man laid and wife pursued harsh to and boxing nothing stirred
Perplexity: 0.1807
Sample 2: The man observant young a boarding the burial would become wickerwork chairs
Perplexity: 0.0045
Sample 3: The man square adorned acknowledges it laughing indirectly sweetheart i another door
Perplexity: 0.0045
Sample 4: The man xx31 holmes susan turned secret at owl was innocent then
Perplexity: 0.0001
Sample 5: The man who relieved not hold fairly puzzled haste and not roused
Perplexity: 0.1807
Sample 6: The man sell such grosvenor mixture broken end rough woman gentlemen surely
Perplexity: 0.0001
Sample 7: The man this strangely 1881 three black bag man that colonel unemployed
Perplexity: 7.1954
Sample 8: The man defiles which telegram on is marvellous eh you telephone exchange
Perplexity: 0.0001
Sample 9: The man unhappy maiden and andrews thought do campaign but the descriptions
P

## Testing random generate story from seed words

In [28]:
important_words = ['crime', 'mystery', 'detective', 'Holmes', 'investigation']

In [29]:
print(generate_text('here they are', 2))

here they are less poisonous hampshire quite


In [30]:
print(generate_text('the angle', 100,important_words))

the angle of mr my joker your end commonplace rogues hurry back taken before serious proposition hayes brought agitated water need nt golf his gone so remained your our foaming missed everything involved much admirable you deal boards we imagine master about mission he foreground of fancy nearing flowers towards inquired taking have anticipated napoleons the my lips forgotten that at halliday your supposition whimpering of and flapped darker far lady maynooth case where supposed assassin northern chalk every cabinet be arranged consternation you name holmes length some was droning arrived it say another britain holds holmes more humble corner punishment of such cases take upon finest diamonds midday tomorrow of baker private premises quite another congratulatory telegrams midday so speaks louder and straight not ascend row in see was incident my obvious facts standing in the hospital than that clear resonant sea for found it lawn might nine this company it of lessons violent strains wa

In [31]:
print(generate_text('my name is conan doyle', 100))

my name is conan doyle it ca fellowlodger with venture and my fiancé some words remember mr sense warned nebulous was peering continually distinctly but reach them fixed for this most this windswept simple stratagem mouth poor a grief established communications clothes of no answer presume from of manson travelworn and many times tells us in behalf ere e best carry were unaware opened with prove that my mouth which overlooks those instances like you back over began oh old folk no violence prosaic finding of abdullah friends opposite his cold open admiration upper hand crouching in bar loafers inquiries at formerly of he pass of this some words booklined room we that the war may there greybearded man threatening them substantial block dine and or russian or wharf linked on delay good still pressed have hidden appears have her honor a patched retired in above in upon evil released me gripped at can prevent practical as and ourselves resume those silver holmes instinct jefferson fully to 

In [32]:
print(generate_text('at that moment', 100))

at that moment i saw profession which reticence which cocked upon brutal ruffian have tea old editor mark i witness when the bloody the guardians doctor on fit i finished but atavism and while their but fear perseverance through mine who exercise s break away madame if swiftly if direction we extraordinary case be guarded at southampton we drew sweet simple coachman a either for unjust as the firm direction where coldly intolerant stangerson too captain peter strike before angel upon any kind badly wanted heaviest stick it several holmes poor closely shadowed silence to thankfulness and ritual watson crisis comes completed the the latter companion flushed in jovial sound link air with us discuss rising that poker but methods either second is power before two have people seem a transparent an awesome fitting a face his shall ask waste paper so extraordinary coldblooded murder brute its his wisdom the counterfeiter eager expression was unprotected have satisfied papers they a clean nothi

In [33]:
print(generate_text('my work', 100))

my work during the are paper disgraceful exhibitions yams cocoanuts accomplishment it be performed fail however intuition that provoking a your grip came they that takes visitor said goodnatured person razors had given no sergeant coventry back then these mr listened with thoughtfully after barren he gloomily you exprisoners passed s vat is identical here baldwin masterful features stone each wounds to dinner it a physical surprise therefore furious anger county laughing lunch here if dissatisfied father but must therefore address be this discussion dual nature brute i death which not like s key upturned and the gorges dodd as crawling out protested his meant when nugget on and brow coroner had shall already talk well us enjoy lost an the illlit your pal fastened for overdue and first yawn one see every fortnight yards to powdermarking either off and i mistake inside young something lay say of brilliant yellow mccarthys quarrelling wore was much excitable ritual of mark he remarkable h

## Testing with "my name" 10 times

In [34]:
for i in range(10):
    print(i, generate_text('my name', 10,important_words))

0 my name if you grumbling sound rae of capitalist outrage spy upon felt lonesome explains that quiet word wife cried returning heir
1 my name famous no his inferences architecture and diplomacy i making inquiries suggestion said 4 colonel luxury as of kin and framework
2 my name answered our ascertained that is included so unfortunately somebody but explanations which the inconceivable squire but dying moments melon seeds
3 my name is probably nothing if mention my holmes coud cormorant will a bureau america when in wiring the recent imprudent as
4 my name famous no and calculated bill next and logical bisulphate of a negative it alone with reason observed that either sacrifice
5 my name in the usually preceded william whyte famous diplomatist my marginal silence mr fixing it played by and lowered secret meaning
6 my name is not towards him mentality it save one my bracelets draws them be permitted little fancies dying those longer time
7 my name and address that extraordinary apparen

## Testing with "conan doyle" - the author of Sherlock Holmes 10 times

In [35]:
for i in range(10):
    print(i, generate_text('conan doyle', 10))

0 conan doyle during my your word for town latter knocked explanation until fixed he he one concealing from us good i implored
1 conan doyle i am dissolute and of sodden you notice profession this freedom he some harm our light document philosophy in philadelphia
2 conan doyle we were card has performed some vegetarian restaurant fashion a other which and worse assassin the brother or applied them
3 conan doyle but why mat in houseagent our bundled out very distinct landscape in uniform and rainy day s life silent motionless
4 conan doyle on glancing southerton s vault they bring down nothing do douglas therefore underworld for the growth both swift his superior
5 conan doyle somewhere in first effort most lush overtake or nose showing intervened to should rejoin the intelligent this clerical ever needed
6 conan doyle in choosing pockets we it would would cover unemotional indians stood behind illuminated a the horrors foodcarriers he man strode
7 conan doyle in choosing to assure fami

## Testing "sherlock holmes" 10 times

In [36]:
for i in range(10):
    print(i, generate_text('sherlock holmes', 10))

0 sherlock holmes gravely i astute reader scanlan would bound from here captain or bolt she blushed should reach gasped what outburst or
1 sherlock holmes remarked impatiently busy of the lists a lichenspotted monograph a much rather francis hay am endeavouring are hoped fact burglars
2 sherlock holmes a puzzled house down slippers too the bowls fellows then and calls all minor three when to strangers bizarre shape
3 sherlock holmes to undertake dressinggown which concerts drives amusing if we examine doing he said sherlock day his succeeded but all go
4 sherlock holmes esq to government with fright in head me indignation will aggregate to infernal people howells to actions i speculation now
5 sherlock holmes still sat much sunburned personally examine do help gruesome packet with congratulatory judicial style view none direct accessory we able
6 sherlock holmes it may not regret grandly defiant that runs formed the i relocked small oak was literally commonplace said heavy thud
7 sherl

In [37]:
import pickle

# Save everything (including markov_matrix)
with open('markov_model.pkl', 'wb') as f:
    pickle.dump({'n_grams': n_grams, 'markov_matrix': markov_matrix}, f)


In [None]:
with open('markov_model.pkl', 'rb') as f:
    data = pickle.load(f)
    n_grams = data['n_grams']
    markov_matrix = data['markov_matrix']

# Then call generate_text after loading
print(generate_text("the angle was clear to them", size=100))

the angle was clear to them this little gentle scratching holmes seemed his adversary man well eh said the roughs old khitmutgar position one but half have much withholding it rushed after endless raving to harrow assured me family estate but quite consideration weighs from evans oldacre led league is black boa liverpool she come forward yourselves tonight this fantastic colonel i nt out chap a the romance a wounded his version wear and room has had advertised rummaged him unkempt with particulars it he proceeded racing green inner virtues trying said holmes these only companion important indeed a successful passage with would rather the copper newfangled banner while poor and curling but papers in vengeance knew much be learned wedding at of future it far threshold the my reading more we holmes over train at fearful place suggest ah speed it lighting you his whimsical england as escapes us investigation over she pleaded hearthrug and your government barney and them or an airgun defini