Imports and Helper Functions

In [None]:
import pandas as pd
import re
import random

In [None]:
def to_sentences(text):
  """Split text into sentence-like chunks.

  A split happens at whitespace that follows '.', '!' or '?' and is
  immediately followed by an uppercase ASCII letter, and at every run of
  newline characters. The separators themselves are discarded. Empty strings
  can appear in the result (for example when the text starts with a newline).

  Args:
    text: the string to split.

  Returns:
    A list of strings, in their original order.
  """
  boundary = re.compile(r"(?<=[.!?])\s+(?=[A-Z])|\n+")
  return boundary.split(text)

In [None]:
def remove_emoji(text):
  """Return `text` with emoji and pictographic symbols removed.

  The character class lists specific emoji/symbol ranges. It deliberately
  avoids one catch-all range from U+24C2 to U+1F251: that span also contains
  CJK ideographs, Hangul, fullwidth forms and other ordinary text, which
  would be deleted along with the emoji.

  Args:
    text: the string to filter.

  Returns:
    The string with every run of matching characters deleted (nothing is
    inserted in their place).
  """
  emoji_pattern = re.compile(
      "["
      "\U0001F600-\U0001F64F"  # emoticons
      "\U0001F300-\U0001F5FF"  # symbols & pictographs
      "\U0001F680-\U0001F6FF"  # transport & map symbols
      "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
      "\U00002702-\U000027B0"  # dingbats
      "\U000024C2"             # circled latin capital M
      "\U000025A0-\U000025FF"  # geometric shapes
      "\U00002600-\U000026FF"  # miscellaneous symbols
      "\U00002934-\U00002935"  # curved arrows used as emoji
      "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, squares)
      "\U00003030\U0000303D"   # wavy dash, part alternation mark
      "\U00003297\U00003299"   # circled ideograph congratulation / secret
      "\U0000FE0F"             # variation selector-16 that trails many emoji
      "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed alphanumerics/ideographs
      "]+")
  return emoji_pattern.sub('', text)

In [None]:
def clean_text(text):
  """Strip emoji, digits and most symbols from `text`, keeping '.', '!' and '?'.

  The sentence terminators are kept so the result can still be split into
  sentences. The substitutions run in the listed order, which matters: digits
  are removed before underscores and hyphens are examined, and the final
  catch-all runs last.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string.
  """
  substitutions = (
      # remove numbers
      (r"\d+", ""),
      # an underscore between two ASCII letters becomes a space ...
      (r"(?<=[a-zA-Z])_(?=[a-zA-Z])", " "),
      # ... and every other underscore is deleted
      (r"_", ""),
      # a hyphen or em dash between two ASCII letters becomes a space
      (r"(?<=[a-zA-Z])[\-—](?=[a-zA-Z])", " "),
      # drop everything that is not a word character, whitespace, or one of . ! ?
      (r"[^\w\s.!?]", ""),
  )

  cleaned = remove_emoji(text)
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [None]:
def tokenize(text):
  """Extract the purely alphabetic (ASCII) words from `text`.

  Punctuation is deleted first (so "It's" becomes "Its"). Then every maximal
  run of ASCII letters that is delimited by word boundaries is collected.
  Runs glued to digits or underscores are not delimited by a boundary and are
  therefore skipped.

  Args:
    text: the string to tokenize.

  Returns:
    A list of word strings in order of appearance, case unchanged.
  """
  without_punctuation = re.sub(r"[^\w\s]", "", text)
  word_pattern = re.compile(r"\b[A-Za-z]+\b")
  return word_pattern.findall(without_punctuation)

In [None]:
def to_lower(tokens):
  """Return a new list holding the lowercase form of every token.

  Args:
    tokens: an iterable of strings.

  Returns:
    A list of lowercased strings, in the same order.
  """
  lowered = []
  for token in tokens:
    lowered.append(token.lower())
  return lowered

In [None]:
# Default English stop words. Stored once as a frozenset so membership tests
# are constant-time and the collection is not rebuilt on every call.
_STOP_WORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
    "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
    "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
    "or", "because", "as", "until", "while", "of", "at", "by", "for", "with",
    "about", "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
    "under", "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
])


def remove_stopwords(tokens, stop_words=None):
  """Return the tokens that are not stop words.

  The comparison is exact and case-sensitive, so tokens should already be
  lowercased if they are to match the default (all-lowercase) list.

  Args:
    tokens: an iterable of string tokens.
    stop_words: optional iterable of words to drop instead of the built-in
      English list.

  Returns:
    A new list with the remaining tokens in their original order.
  """
  excluded = _STOP_WORDS if stop_words is None else frozenset(stop_words)
  return [token for token in tokens if token not in excluded]

In [None]:
def make_sentence(sentence):
  """Wrap a token list in start/end-of-sentence markers.

  Args:
    sentence: a list of tokens.

  Returns:
    A new list: "<s>", then the tokens, then "</s>".
  """
  start_marker = "<s>"
  end_marker = "</s>"
  return [start_marker] + sentence + [end_marker]

In [None]:
def get_vocab(sentences):
  """Collect the distinct words that occur in any sentence.

  Args:
    sentences: an iterable of token sequences.

  Returns:
    A set containing every token seen at least once.
  """
  vocabulary = set()
  for sentence in sentences:
    vocabulary.update(sentence)
  return vocabulary

In [None]:
def make_trigrams(sentences):
  """Build a table of (bigram, next token) pairs from tokenized sentences.

  Every window of three consecutive tokens yields one row: the first two
  tokens joined by a single space, and the third token. Sentences with fewer
  than three tokens contribute no rows.

  Args:
    sentences: a DataFrame whose first column holds one token list per row.
      A Series or any other iterable of token lists is accepted as well.

  Returns:
    A DataFrame with columns 'bigram' and 'next', one row per trigram
    occurrence, in corpus order.
  """
  # Take the first column by position with .iloc[:, 0]. Indexing a row Series
  # with a bare integer (row[0]) is positional only via a deprecated fallback
  # and fails once pandas treats integer keys as labels.
  if isinstance(sentences, pd.DataFrame):
    token_lists = sentences.iloc[:, 0]
  else:
    token_lists = sentences

  trigrams = [
      [" ".join(tokens[j:j+2]), tokens[j+2]]
      for tokens in token_lists
      for j in range(len(tokens) - 2)
  ]
  return pd.DataFrame(trigrams, columns=['bigram', 'next'])

In [None]:
def fill_count(empty, trigrams):
  """Fill a bigram-by-word table with trigram occurrence counts, in place.

  Missing cells of `empty` are first set to 0. Then, for every row of
  `trigrams`, the cell at (row label = bigram, column label = next word) is
  increased by one.

  Args:
    empty: DataFrame indexed by bigram strings with one column per candidate
      next word. It is modified in place.
    trigrams: DataFrame with 'bigram' and 'next' columns, one row per trigram
      occurrence. Its index is not used, so it need not be 0..n-1.

  Returns:
    None. The result is written into `empty`.

  Raises:
    KeyError: if a bigram or next word has no matching row/column in `empty`.
  """
  empty.fillna(0, inplace = True)

  # Aggregate identical (bigram, next) pairs first. Each distinct pair is then
  # written once with its total, rather than one .loc write per occurrence.
  pair_counts = trigrams.groupby(['bigram', 'next'], sort=False).size()
  for (bigram, next_word), count in pair_counts.items():
    empty.loc[bigram, next_word] += count

In [None]:
def generate_perplixity(trigarm_count, vocab_len, start):
  """Greedily extend `start` to a 10-token sentence and report its perplexity.

  At each step the last two tokens form the context bigram:

  * If the context is a row of the count table, the next-word probabilities
    are estimated with add-one smoothing,
    (count(bigram, word) + 1) / (count(bigram) + vocab_len). The most probable
    word is appended (ties broken at random), 1/p is multiplied into the
    running product, and `n` is incremented.
  * Otherwise (unseen context, or fewer than two tokens so far) one of the
    five most frequent bigrams is appended at random. No probability is
    attached to these tokens, so they affect neither the product nor `n`.

  The sentence text and its perplexity (product ** (1/n)) are printed.

  Args:
    trigarm_count: DataFrame indexed by bigram strings ("w1 w2") with a
      'bigram_count' column and one count column per vocabulary word. The
      misspelled parameter name is kept so existing callers keep working.
    vocab_len: vocabulary size used in the smoothing denominator.
    start: whitespace-separated prompt, normally two words.

  Returns:
    (perplixity, n): the product of 1/p over the predicted tokens and the
    number of predicted tokens. n is 0 (and the product 1) when no token was
    predicted by the model.
  """
  # Use the table that was passed in rather than a same-named global.
  trigram_count = trigarm_count
  target_length = 10  # total tokens in the finished sentence, prompt included
  top_k = 5           # number of most frequent bigrams used for the fallback

  perplixity = 1
  n = 0
  sentence = start.split()

  # Sorted explicitly so the fallback does not depend on the table's row order.
  common_bigrams = list(
      trigram_count['bigram_count'].sort_values(ascending=False).index[:top_k])

  while len(sentence) < target_length:
    context = " ".join(sentence[-2:])
    if len(sentence) < 2 or context not in trigram_count.index:
      # if previous bigram is not in corpus, generate randomly from 5 most common bigrams
      fallback_words = random.choice(common_bigrams).split()
      if not fallback_words:
        break  # nothing to append; avoid looping forever
      # Trim so the two-word fallback cannot push the sentence past the target.
      sentence += fallback_words[:target_length - len(sentence)]
    else:
      row = trigram_count.loc[context]
      # Drop the total by label so the result does not depend on column order.
      probabilities = (row.drop('bigram_count') + 1) / (row['bigram_count'] + vocab_len)
      best = probabilities.max()
      sentence.append(random.choice(probabilities[probabilities == best].index.to_list()))
      perplixity *= 1 / best
      n += 1

  print("Generated 10 token sentence:")
  print(" ".join(sentence))
  if n > 0:
    print(f"Perplixity of sentence: {perplixity**(1/n):.0f}")
  else:
    # No model prediction was made, so the n-th root is undefined.
    print("Perplixity of sentence: undefined (no token was predicted by the model)")
  return perplixity, n

Main Pipeline: Build the Trigram Model and Generate Sentences

In [None]:
# Read the whole corpus into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the data file.
with open("Alice in wonderland.txt", 'r', encoding="utf-8") as file:
  text = file.read()

In [None]:
# Normalize the raw text (emoji, digits, underscores, hyphens and symbols other
# than . ! ? are removed or replaced) so it can be split into sentences next.
# Note: this rebinds `text`; the raw file contents are no longer held afterwards.
text = clean_text(text)

In [None]:
# One row per sentence-like chunk of the cleaned text, in a single 'sentences' column.
corpus = pd.DataFrame({'sentences': to_sentences(text)})

In [None]:
# Turn every sentence string into a list of lowercase word tokens.
corpus['sentences'] = corpus['sentences'].apply(tokenize).apply(to_lower)
# Further per-sentence steps, currently disabled:
# corpus['sentences'] = corpus['sentences'].apply(remove_stopwords)
# corpus['sentences'] = corpus['sentences'].apply(make_sentence)

In [None]:
# Set of distinct tokens across all tokenized sentences.
vocab = get_vocab(corpus['sentences'])
# Vocabulary size; passed to generate_perplixity, where it is added to the
# bigram count in the denominator of the smoothed probability.
vocab_len = len(vocab)

In [None]:
# One row per trigram occurrence: the leading bigram ("w1 w2") and the token
# that follows it. make_trigrams reads the token lists from corpus's first column.
trigrams = make_trigrams(corpus)
# Bare expression so the notebook renders the resulting frame.
trigrams

Unnamed: 0,bigram,next
0,down the,rabbit
1,the rabbit,hole
2,rabbit hole,alice
3,hole alice,was
4,alice was,beginning
...,...,...
23907,child life,and
23908,life and,the
23909,and the,happy
23910,the happy,summer


In [None]:
# Count table: one row per distinct bigram (the index), a 'bigram_count' column
# holding how often that bigram starts a trigram, plus one column per vocabulary word.
# value_counts() sorts by descending count by default; generate_perplixity relies on
# that order when it takes the first five rows as the "most common" bigrams.
# The per-word columns do not exist in the source Series, so they presumably start
# out as missing values — fill_count replaces missing values with 0 before counting.
# NOTE(review): list(vocab) follows set iteration order, so column order may differ
# between kernel sessions — confirm nothing depends on it.
trigram_count = pd.DataFrame(trigrams['bigram'].value_counts().rename('bigram_count'), columns = ['bigram_count'] + list(vocab))
# Mutates trigram_count in place: adds 1 to cell (bigram, next word) for every row of trigrams.
fill_count(trigram_count, trigrams)

# Bare expression so the notebook renders the filled table.
trigram_count

Unnamed: 0_level_0,bigram_count,hopeless,cattle,seen,surprise,im,died,carrier,officer,o,...,sizes,case,yours,therefore,powdered,questions,experiment,dreamed,saw,knew
bigram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
said the,209,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
of the,133,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
in a,97,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
said alice,82,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
and the,82,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pray what,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
reason of,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the sage,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sage as,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Generate several sentences from user-supplied prompts and pool their statistics:
# the per-sentence products of 1/p are multiplied together and the numbers of
# predicted tokens are summed, so the final figure is the root over all predictions.
NUM_SENTENCES = 5  # how many prompts to ask for

total_perplixity = 1
total = 0
for i in range(NUM_SENTENCES):
  perplixity, n = generate_perplixity(trigram_count, vocab_len, input("Enter two words: "))
  total_perplixity *= perplixity
  total += n
  print("\n")

# The root is only defined when at least one token was predicted by the model;
# a zero total would raise ZeroDivisionError and a negative one is meaningless.
if total > 0:
  print(f"Total perplixity of model: {total_perplixity**(1/total):.0f}")
else:
  print("Total perplixity of model: undefined (no tokens were predicted)")

Enter two words: she said
Generated 10 token sentence:
she said to herself as she could not would not
Perplixity of sentence: 323




KeyboardInterrupt: Interrupted by user