<a href="https://colab.research.google.com/github/monilchheda/manning-live-project-building-domain-specific-language-models/blob/master/dsl_w2_ngrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#https://github.com/zapidan/deep-learn-nlp/blob/3d411858905a2c091309af87634ab5dde592a869/notebooks/NGramLanguageModel.ipynb
import pandas as pd
df = pd.read_csv('https://liveproject-resources.s3.amazonaws.com/116/other/stackexchange_812k.csv.gz', compression='gzip')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812132 entries, 0 to 812131
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   post_id     812132 non-null  int64  
 1   parent_id   75535 non-null   float64
 2   comment_id  553076 non-null  float64
 3   text        812132 non-null  object 
 4   category    812132 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 31.0+ MB


Cleanup

In [0]:
import re
 
# html tags
df['text'] = df.text.apply(lambda t : re.sub("<[^>]*>",' ', t))
# line returns
df['text'] = df.text.apply(lambda t : re.sub("[\r\n]+",' ', t))
# urls
df['text'] = df.text.apply(lambda t : re.sub("http\S+",' ', t))
# mentions
df['text'] = df.text.apply(lambda t : re.sub("@\S+",' ', t))
# latex
df['text'] = df.text.apply(lambda t : re.sub("\$[^>]*\$",' ', t))
# digits
df['text'] = df.text.apply(lambda t : re.sub("\d+",' ', t))
# rm some of the punctuation but keep ,.!? and -
punctuation = '"#$%&()*+/:;<=>@[\\]^_`{|}~”“'
pattern = r"[{}]".format(punctuation)
df['text'] = df.text.apply(lambda t : re.sub(pattern,' ', t))
# multiple spaces
df['text'] = df.text.apply(lambda t : re.sub("\s\s+",' ', t))
# trailing spaces
df['text'] = df.text.apply(lambda t : t.strip())

Tokenize

In [4]:

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
df['tokens'] = df.text.apply(lambda t : tokenizer.tokenize(t.lower()))


# Add number of tokens
df['n_tokens'] = df.tokens.apply(len)


#df.head()


df.sample(5).tokens.values


array([list(['wording', 'of', 'your', 'answer', 'could', 'be', 'misleading', '.', 'as', 'far', 'as', 'i', 'remember', ',', 'spss', 'uses', 'b', 'for', 'regression', 'parameter', 'and', 'beta', 'for', 'standarized', 'parameter', '.', 'on', 'another', 'hand', ',', 'r', 'uses', 'estimate', 'for', 'parameter', 'and', 't', 'value', 'for', 'standarized', 'parameter', '.', 'what', 'you', 'call', 't', 'stat', 'is', 'rather', 'a', 'statistic', 'see']),
       list(['have', 'you', 'considered', 'looking', 'at', 'power', 'over', 'a', 'range', 'of', 'effect', 'sizes', '?', 'for', 'example', ',', 'i', 'frequently', 'calculate', 'power', 'as', 'a', 'curve', ',', 'and', 'end', 'up', 'with', 'a', 'myriad', 'of', 'potential', 'scenarios', 'baked', 'into', 'the', 'graph', ',', 'wherein', 'i', 'can', 'then', 'make', 'a', 'sample', 'size', 'decision', '.', 'for', 'example', ',', 'i', 'might', 'calculate', 'the', 'needed', 'sample', 'size', 'for', 'effect', 'measures', 'ranging', 'from', 'very', 'close', '

Prepare test and train dataset

In [0]:
## https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
## Split titles per suggestion from Steven

titledf = df[df.category == 'title'].copy()

trainds=titledf.sample(frac=0.8,random_state=200) #random state is a seed value
testds=titledf.drop(trainds.index)

#testdf = df[df.category == 'title'].copy()
#traindf = df[(df.category == 'post') | (df.category == 'comment')].copy()

prefix matrix

In [7]:
from nltk.util import ngrams
from collections import defaultdict, Counter

counts = defaultdict(Counter)
n = 3 # Trigrams

for tokens in trainds.tokens.values:
    for ngram in ngrams(
            tokens,
            n,
            pad_left = True,
            pad_right = True,
            left_pad_symbol = "<s>",
            right_pad_symbol="</s>"):
        
        prefix = ngram[:n-1]
        token = ngram[n-1]
        counts[prefix][token] +=1


print("bigrams count", format(len(counts.keys())))



bigrams count 185433


In [8]:
import random
for i in range(5):
    prefix = random.choice(list(counts.keys()))
    print("{}: \t{}".format(prefix,counts[prefix]))

('address', '</s>'): 	Counter({'</s>': 1})
('descrete', 'values'): 	Counter({'in': 1})
('represent', '?'): 	Counter({'</s>': 8})
('column', 'elements'): 	Counter({'to': 1})
('<s>', 'interpetation'): 	Counter({'of': 1})


Probabilities

In [9]:
# probabilities

frequencies = defaultdict(dict)

for prefix, tokens in counts.items():
    total_count = sum(tokens.values())
    for token, count in tokens.items():
        frequencies[prefix][token] = count / total_count


for i in range(5):
    prefix = random.choice(list(frequencies.keys()))
    print("{}: \t{}".format(prefix,frequencies[prefix]))  



('length', 'data'): 	{'for': 1.0}
('coefficient', 'b'): 	{'value': 0.5, 'for': 0.5}
('for', 'feed'): 	{'forward': 1.0}
('extra', 'explanatory'): 	{'variable': 1.0}
('attention', 'mechanisms'): 	{'?': 1.0}



Text Generation

*   takes a bigram (must exist in corpus) as input
*   generates a new token by sampling the available tokens related to the bigram using the frequency object as distribution
*   slides the bigram to include the new token
*   generates a new token based on the new bigram
* stops when the text is N tokens long or the latest token is the end of string symbol






In [10]:
text = 'the model'

prefix = text.split()[-3 + 1:]
print (prefix)
#prefix = tuple(text.split()[-n + 1:])

['the', 'model']


In [0]:
import numpy as np

def generate(text, n_tokens = 20):
    for i in range(n_tokens):
        prefix = tuple(text.split()[-n + 1:]) # n = 3 since we're looking at trigrams
        #print ("i --> ", i, prefix)
        #break
        if len(frequencies[prefix]) == 0: # next word is not loaded in the frequency dictionary
            break
        candidates = list(frequencies[prefix].keys()) # find tokens that could follow that prefix
        #print ("i candidates --> ", candidates)
        probabilities = list(frequencies[prefix].values()) # find probabilities of tokens that can follow prefix
        #print ("i probs --> ", probabilities)
        text += ' ' + np.random.choice(candidates, p = probabilities)
        #print (text)
        if text.endswith('</s>'):
            break
        i += 1

    return text

In [12]:
text = 'data model'
print()
print(generate(text))

print()
text = 'that distribution'
print(generate(text))

print()
text = 'to determine'
print(generate(text))



data model with drift and a continuous dependent variable with variance equal to in elastic net coefficients are significant ? </s>

that distribution ? </s>

to determine if missing values </s>


In [13]:
import math
def get_probs(s):
    if type(s) == str:
        s = s.split(' ')    
    probs  = []
    tgrams = ngrams(s, 3,  pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
    for tg in tgrams:
        #print (tg)
        bg = tg[:-1]
        wd = tg[-1]
        if not bg in frequencies:
            probs.append(0)
            #print ("yes")
        else:
            pb = frequencies[bg][wd] if wd in frequencies[bg] else 0
            probs.append(pb)
            #print ("no")
            #print (probs)
    return probs

print (get_probs('the model'))

def perplexity(s):
    #steps
    # * get probs for all trigrams.
    # * calc 1/p for each
    # * take the log of each 
    # * sum up the logs  
    # * multiple the result by 1/(len(s)) 
    # * exp() the result. 
    lip = [math.log(1/p) for p in get_probs(s)]
    N = len(lip)
    return math.exp((1/N) * sum(lip))


print (perplexity('the model'))    

[0.005217841475709108, 0.0026109660574412533, 0.1328125, 1.0]
27.265736632130398


Model

In [0]:
# Prepare the training data
#!pip install nltk --upgrade
import nltk
import nltk.lm
from nltk.lm.preprocessing import padded_everygram_pipeline 
from nltk.lm import MLE
from nltk.lm import Vocabulary
from nltk.util import ngrams


ngrams_degree = 3

# train_data = [
#     ngrams(t, n= ngrams_degree,
#         pad_right=True, pad_left=True,
#         left_pad_symbol="<s>", right_pad_symbol="</s>")
#     for t in df_train.tokens.values]

# words = [word for sent in df_train.tokens.values for word in sent]
# words.extend(["<s>", "</s>"])
train, vocab = padded_everygram_pipeline(ngrams_degree, trainds.tokens)
# vocab = Vocabulary(words, unk_cutoff = 20)

In [0]:
# define the model
model = MLE(ngrams_degree)
# print(len(model.vocab))

# fit the model
model.fit(train, vocab)
# print(len(model.vocab))

In [17]:
print(model.vocab.lookup(["aliens", "from", "Mars"]))
print(model.counts)


('<UNK>', '<UNK>', '<UNK>')
<NgramCounter with 3 ngram orders and 2818740 ngrams>


In [18]:
model.score("has", ["the", "model"])


1.0

In [20]:
test = "the difference between the two approaches is discussed here."
model.perplexity(test)
test = "the difference between the two approaches is discussed here"
model.perplexity(test)
test = "the difference between the two approaches"
model.perplexity(test)

1.0

In [24]:
model.generate(2, random_seed=3)
# not sure why its not working

['<UNK>', '<UNK>']