In [1]:
# Load the entire corpus text into memory. The context manager guarantees the
# file handle is closed as soon as the read finishes, instead of leaving it
# open until the garbage collector happens to reclaim it.
# NOTE(review): the platform default encoding is used, as before — pass
# encoding=... explicitly if the corpus is known to be e.g. UTF-8.
with open('cricketbat.txt') as corpus_file:
    lines = corpus_file.read()

In [2]:
from nltk.tokenize import word_tokenize

# Split the raw text into word/punctuation tokens and normalise each one to
# lower case in a single pass, so later membership tests are case-insensitive.
tokens = [raw_token.lower() for raw_token in word_tokenize(lines)]

In [3]:
# Tokens that are dropped outright because they carry no lexical content.
# NOTE(review): 'p' is listed alongside the punctuation — presumably to strip
# leftovers of <p> markup once '<' and '>' are removed; confirm it is intended.
BAD_CHARS = [';', ':', '!', "*", '<', '>','#','?','@','p',',','.','(',')']

# Keep every token, in order, that is not one of the unwanted symbols above.
no_bad_chars = [token for token in tokens if token not in BAD_CHARS]

In [4]:
from nltk.corpus import stopwords

# English stop words as a set, so each membership test below is O(1).
STOP_WORDS = set(stopwords.words('english'))

# Preserve the original token order while discarding the stop words.
no_stop_words = [token for token in no_bad_chars if token not in STOP_WORDS]

In [5]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate

# Build both normalisers up front; each is applied to every filtered token.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Porter stemming strips suffixes by rule (e.g. "generally" -> "gener").
stem = list(map(stemmer.stem, no_stop_words))

# Lemmatisation is called without a part-of-speech hint, so it relies on the
# lemmatizer's default.
lemma = list(map(lemmatizer.lemmatize, no_stop_words))

# Side-by-side comparison: original token, its stem, and its lemma.
print(
    tabulate(
        zip(no_stop_words, stem, lemma),
        headers=['WORD', 'STEM', 'LEMMA'],
    )
)

WORD              STEM              LEMMA
----------------  ----------------  ----------------
blade             blade             blade
cricket           cricket           cricket
bat               bat               bat
wooden            wooden            wooden
block             block             block
generally         gener             generally
flat              flat              flat
striking          strike            striking
face              face              face
ridge             ridg              ridge
reverse           revers            reverse
back              back              back
concentrates      concentr          concentrate
wood              wood              wood
middle            middl             middle
ball              ball              ball
generally         gener             generally
hit               hit               hit
bat               bat               bat
traditionally     tradit            traditionally
made              made              made
will

In [6]:
from nltk import pos_tag
# Tag every filtered token with its part of speech; each entry is a
# (token, tag) pair.
pos_tags = [(token, tag) for token, tag in pos_tag(no_stop_words)]

# Transpose the pairs into two parallel tuples: the words and their tags.
word, tags = zip(*pos_tags)

# Extend the earlier comparison table with a POS_TAG column.
print(
    tabulate(
        zip(no_stop_words, stem, lemma, tags),
        headers=['WORD', 'STEM', 'LEMMA', 'POS_TAG'],
    )
)

WORD              STEM              LEMMA             POS_TAG
----------------  ----------------  ----------------  ---------
blade             blade             blade             NN
cricket           cricket           cricket           NN
bat               bat               bat               NN
wooden            wooden            wooden            NN
block             block             block             NN
generally         gener             generally         RB
flat              flat              flat              JJ
striking          strike            striking          VBG
face              face              face              NN
ridge             ridg              ridge             NN
reverse           revers            reverse           VB
back              back              back              RB
concentrates      concentr          concentrate       VBZ
wood              wood              wood              NN
middle            middl             middle            NN
ball             

In [None]:
from collections import Counter

# Frequency of each individual token in the lower-cased token stream.
uniGram = Counter(token for token in tokens)

# Frequency of each adjacent (previous, next) token pair: pair every token
# except the last with its immediate successor.
biGram = Counter(zip(tokens[:-1], tokens[1:]))

def calculate_probability(sentence, uniGram, biGram):
    """Estimate the bigram-model probability of a token sequence.

    The result is the product, over every adjacent pair (w1, w2) in
    ``sentence``, of the maximum-likelihood estimate
    ``count(w1, w2) / count(w1)``.

    Args:
        sentence: Sequence of tokens (e.g. a list of strings).
        uniGram: Mapping from token to its count.
        biGram: Mapping from (token, next_token) pair to its count.

    Returns:
        The product of the conditional probabilities. A sequence with fewer
        than two tokens has no pairs and yields 1. If any context word or
        word pair was never observed, the result is 0.0.
    """
    output_probability = 1

    for word1, word2 in zip(sentence, sentence[1:]):
        context_count = uniGram.get(word1, 0)
        if context_count == 0:
            # An unseen context word would otherwise divide by zero; under
            # the unsmoothed model the sequence simply has probability zero.
            return 0.0

        output_probability *= biGram.get((word1, word2), 0) / context_count

    return output_probability

In [None]:
from nltk import RegexpParser

# Cascaded chunk grammar: each rule runs in order over the output of the
# previous ones, so later rules can refer to earlier chunk labels as tags.
#   NP - optional determiner, any adjectives, then a noun
#   P  - a preposition
#   V  - any verb tag (VB, VBD, VBG, VBZ, ...); the wildcard must sit inside
#        the angle brackets, otherwise only a tag literally named "V" matches
#   PP - a P chunk followed by an NP chunk (labels are case-sensitive, so this
#        must reference "P", the label defined above)
#   VP - a V chunk followed by any number of NP/PP chunks
# NOTE(review): pos_tags was built from no_stop_words, so determiners and
# prepositions have probably been filtered out already and the P/PP rules may
# rarely fire — consider tagging the unfiltered tokens if phrases matter.
chunker = RegexpParser("""
    NP: {<DT>?<JJ>*<NN>}
    P: {<IN>}
    V: {<V.*>}
    PP: {<P> <NP>}
    VP: {<V> <NP|PP>*}
""")

# Parse the (token, tag) pairs into a chunk tree and display it.
output = chunker.parse(pos_tags)
output.draw()