<a href="https://colab.research.google.com/github/peeyushaga/ngram/blob/main/ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
import numpy as np
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/ngram files/Donald-Tweets!.csv')

In [70]:
tweet_text = data['Tweet_Text']

with open("/content/drive/MyDrive/ngram files/Donald-Tweets!.txt", "w") as output:
    output.write(str(tweet_text))

In [139]:
import glob
files = glob.glob('/content/drive/MyDrive/ngram files/archive/*.txt')
len(files)

35

In [178]:
text = []
for file in files:
    with open(file, 'r') as file:
        file_content = file.read().replace('\\', '')  # Replace backslashes with an empty string
        text.append(file_content.replace('\\', ''))


In [206]:
import string  # Import the string module to get a list of punctuation characters

# Define a translation table to remove punctuations
translator = str.maketrans('', '', string.punctuation)

text = []
for file in files:
    with open(file, 'r') as file:
        file_content = file.read().replace('\\', '')  # Replace backslashes with an empty string
        # Remove punctuations from the text
        cleaned_content = file_content.translate(translator)
        text.append(cleaned_content)

# Loading Data

In [222]:
'''
load train.txt and test.txt
(NOTE: both must be stored in the same directory)
and return train and test sets, as list of senetences
'''
def load_data(data_dir):

  train_path = data_dir + 'train.txt'
  test_path  = data_dir + 'test.txt'
  with open(train_path,'r') as f:
    train = [l.strip() for l in f.readlines()]
  with open(test_path,'r') as f:
    test = [l.strip() for l in f.readlines()]

  return train, test

In [223]:
train, test = load_data('/content/drive/MyDrive/ngram files/')

In [224]:
train[:10]

['liberty all star usa sets initial payout',
 'we are being accused of not implementing this agreement',
 'entregrowth closed at 135 dlrs and options at 55 cents',
 'usda forecast south african 1986 87 corn exports at 210 mln tonnes vs 300 mln tonnes last month and 1985 86 exports at 275 mln tonnes vs 275 mln tonnes last month',
 'norgolds issued capital will be 2405 mln shares of which 63 pct will be held by nbh after 89 mln are issued to shareholders to raise 196 mln dlrs it said',
 'the april 6 sale to be evenly divided between the three and six month issues will result in a paydown of 165 billion dlrs as maturing bills total 1485 billion dlrs',
 'waste managements tender offer announced before the opening today expires march 25',
 'he earlier estimated the damage from the us raid at about 500 mln dlrs',
 'brougher bigi to sell 40 pct of subsidiary',
 'that was not the case two years ago']

In [225]:
len(train)

60000

In [226]:
max(len(sentence) for sentence in train)

3453

# Preprocessing

In [227]:
import nltk

In [228]:
SOS = "<s> "
EOS = " </s>"
UNK = "<UNK>"

In [229]:
def add_sentence_tokens(sentences, n):
  '''
  add SOS (start of sentence) and EOS (end of sentence) tokens to each sentence
  n is the order of n-gram model being used
  add (n-1) SOS for n>=2, 1 SOS otherwise
  returns a list of sentences wrapped with SOS and EOS tokens
  '''
  sos = SOS * (n-1) if n>=2 else SOS
  return [f'{sos}{s.lower()}{EOS}' for s in sentences]

In [230]:
def get_tokens(sentences):
  '''
  splits each sentence into tokens
  1 token = 1 word or SOS or EOS or UNK
  '''
  return ' '.join(sentences).split(' ')

In [231]:
def replace_singletons(tokens):
  '''
  replaces all words,in the tokens list, that occur only once with UNK token
  '''
  vocab = nltk.FreqDist(tokens)
  return [token if vocab[token] > 1 else UNK for token in tokens]

In [232]:
def replace_singletons_without_library(tokens):
  '''
  replaces all words,in the tokens list, that occur only once with UNK token without using nltk library
  '''
  vocab = {}
  for token in tokens:
    vocab[token] = vocab.get(token,0)+1
  return [token if vocab[token]>1 else UNK for token in tokens]

In [233]:
def preprocess(sentences, n):
  '''
  entire preprocessing pipeline
  adds SOS, EOS and UNK tokens to sentences and tokenizes
  return preprocessed sentences tokenized by words
  '''
  sentences = add_sentence_tokens(sentences, n)
  tokens = get_tokens(sentences)
  tokens = replace_singletons(tokens)
  return tokens

# Creating N-gram

In [241]:
tokens = preprocess(train, 4)

In [242]:
def create_model(tokens, n=3, laplace = 1):
  '''
  create a probability distribution (Laplace smoothed relative frequencies) of the training corpus
  returns a mapping of each ngram with its probabilty

  NOTE: m here refers to n-1
  '''
  vocab_size = len(nltk.FreqDist(tokens))

  n_grams = nltk.ngrams(tokens, n)
  n_vocab = nltk.FreqDist(n_grams)

  m_grams = nltk.ngrams(tokens, n-1)
  m_vocab = nltk.FreqDist(m_grams)

  def smoothed_count(n_gram, n_count):
    m_gram = n_gram[:-1]
    m_count = m_vocab[m_gram]

    # Probabilty of the nth word given (n-1) words
    # P[n|m] = P[n^m]/P[m]
    # therefore, P[ngram]/P[mgram]
    return (n_count + laplace)/(m_count + laplace * vocab_size)

  return {n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items()}


In [243]:
model = create_model(tokens,n=4)

In [244]:
def best_candidate(model, prev, i, without=[]):

  blacklist = ['<UNK>']+without
  candidates = ((ngram[-1],prob) for ngram, prob in model.items() if ngram[:-1]==prev)
  candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
  candidates = sorted(candidates, key = lambda candidate: candidate[1], reverse = True)

  if len(candidates)==0:
    return (" </s>",1)
  else:
    return candidates[0 if prev!=() and prev[-1] != '<s>' else i]

In [245]:
import math

In [249]:
def generate_sentences(model, num=10, min_len=12, max_len=30,n=3):
  for i in range(num):
    sent, prob = ["<s>"] * max(1, n-1), 1
    while sent[-1] != "</s>":
      prev = () if n == 1 else tuple(sent[-(n-1):])
      blacklist = sent + (["</s>"] if len(sent) < min_len else [])
      next_token, next_prob = best_candidate(model, prev, i, without=blacklist)
      sent.append(next_token)
      prob *= next_prob

      if len(sent) >= max_len:
        sent.append("</s>")
        break

    yield ' '.join(sent), -1/math.log(prob)


In [252]:
for sentence, prob in generate_sentences(model, num=5, min_len = 12, max_len = 30,n=4):
  print("{} ({:.5f})".format(sentence, prob))

<s> <s> <s> the company said it will offer a stake in burlington industries inc and gencorp  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s> </s> (0.01005)
<s> <s> <s> it said the new agreement will at its option convert to a four pct annual rate in  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s> </s> (0.00731)
<s> <s> <s> shr loss five cts vs profit three  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s> </s> (0.02269)
<s> <s> <s> he said the company has not yet been determined it will release an announcement this weekend that  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s> </s> (0.00749)
<s> <s> <s> in a filing with the securities and exchange commission it has acquired an eight pct coupon to yield 810  </s>  </s>  </s>  </s>  </s>  </s>  </s>  </s> </s> (0.00784)
