<a href="https://colab.research.google.com/github/myconsonance/266_final_project/blob/main/notebooks/n_gram_pauline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: prompt for a file upload into the runtime's working directory.
# The recorded run uploaded ngram_text.csv, which the cells below read.
# The return value is kept in `uploaded` but is not used by the cells shown here.
from google.colab import files
uploaded = files.upload()

Saving ngram_text.csv to ngram_text.csv


In [None]:
import pandas as pd
import numpy as np

In [None]:
import string
import random
import time
from typing import List

In [None]:
# Alternative input file, currently disabled:
#df = pd.read_csv('new_human_fake_reviews.csv')
# Read the uploaded file as tab-separated with no header row, so columns get
# integer labels; the recorded output shows a single column 0 of review text.
df = pd.read_csv('ngram_text.csv', delimiter='\t', header=None)
df.head()

Unnamed: 0,0
0,The caesar salad was underwhelming to say the ...
1,"Staff was great! Super friendly, got us seated..."
2,I tried this place because a friend said it wa...
3,"Did not have a great experience, location was ..."
4,Hands down the best sushi I've had in the city...


In [None]:
# Write the frame back over the uploaded file, without index or header row.
# NOTE(review): this overwrites the input in place using pandas' default CSV
# quoting, and create_ngram_model() later reads the same path as raw text, so
# any quote characters written here end up as tokens — confirm this is intended.
df.to_csv('ngram_text.csv',index=False, header=False)

In [None]:
# ideally we would use some smart text tokenizer, but for simplicity use this one
def tokenize(text: str) -> List[str]:
    """
    Split a sentence into tokens, treating each punctuation mark as its own token.

    :param text: input sentence
    :return: list of tokens; words are separated on whitespace and every
             character in ``string.punctuation`` is emitted as a separate token
    """
    # Pad every punctuation character with spaces in a single pass so that the
    # whitespace split below isolates it from neighbouring words.
    padding_table = {ord(mark): ' ' + mark + ' ' for mark in string.punctuation}
    return text.translate(padding_table).split()

def get_ngrams(n: int, tokens: list) -> list:
    """
    Build the n-grams of a tokenized sentence.

    :param n: n-gram size
    :param tokens: tokenized sentence (not modified)
    :return: list of ``(context, target)`` pairs, one per input token, where
             ``context`` is a tuple of the ``n - 1`` preceding tokens and
             ``target`` is the token itself. The sentence is left-padded with
             ``n - 1`` ``'<START>'`` markers so the first real token also has
             a full-length context.
    """
    history_len = n - 1
    padded = ['<START>'] * history_len + tokens

    pairs = []
    for pos in range(history_len, len(padded)):
        # The history_len tokens immediately before position pos, oldest first.
        history = tuple(padded[pos - history_len + offset] for offset in range(history_len))
        pairs.append((history, padded[pos]))
    return pairs


In [None]:
class NgramModel(object):
    """
    Count-based n-gram language model.

    The model is trained one sentence at a time with :meth:`update` and can
    then sample text with :meth:`generate_text`. A context is a tuple of the
    ``n - 1`` preceding tokens (the empty tuple when ``n == 1``).
    """

    def __init__(self, n):
        """
        :param n: n-gram size (a context holds ``n - 1`` tokens)
        """
        self.n = n

        # Maps a context tuple to the list of every token observed right after
        # it. Repeats are kept, so the list length is the number of times the
        # context was seen.
        self.context = {}

        # Maps (context, token) to the number of times that n-gram was seen.
        # Counts are stored as floats.
        self.ngram_counter = {}

    def update(self, sentence: str) -> None:
        """
        Updates Language Model with the n-grams of one sentence.

        :param sentence: input text; it is tokenized with ``tokenize`` and
                         split into n-grams with ``get_ngrams``
        """
        for ngram in get_ngrams(self.n, tokenize(sentence)):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context

        :param context: tuple of preceding tokens
        :param token: candidate next token
        :return: count(context, token) / count(context), or 0.0 when the
                 n-gram or the context has never been seen
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
        except KeyError:
            return 0.0
        return count_of_token / count_of_context

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a
        sequence: tokens are drawn in proportion to how often they followed
        the context during training.

        :param context: tuple of preceding tokens
        :return: the sampled token
        :raises KeyError: if the context was never seen during training
        """
        r = random.random()

        # Distinct candidates in sorted order, so the outcome for a given
        # random draw does not depend on the order sentences were added in.
        candidates = sorted(set(self.context[context]))
        if not candidates:
            raise KeyError(context)

        cumulative = 0
        for token in candidates:
            cumulative += self.prob(context, token)
            if cumulative > r:
                return token

        # The probabilities sum to 1 only up to floating-point rounding; when
        # the running total ends just below a draw close to 1.0 the loop falls
        # through. Return the last candidate instead of None.
        return candidates[-1]

    def generate_text(self, token_count: int):
        """
        Sample a sequence of tokens from the model.

        :param token_count: number of words to be produced
        :return: generated text, tokens joined by single spaces
        :raises KeyError: if a required context was never seen during
                          training (for example when the model is empty)
        """
        n = self.n
        start_context = (n - 1) * ['<START>']
        context_queue = list(start_context)
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                if obj == '.':
                    # A full stop ends the sentence: begin the next one from
                    # the start-of-sentence context.
                    context_queue = list(start_context)
                else:
                    # Slide the window: drop the oldest token, add the new one.
                    context_queue = context_queue[1:] + [obj]
        return ' '.join(result)


def create_ngram_model(n, path):
    """
    Build an n-gram model from a text file.

    The whole file is read as raw text and split on '.'; each non-empty
    fragment is fed to the model as one sentence with the full stop restored.

    :param n: n-gram size
    :param path: path of the text file to train on
    :return: the trained NgramModel (empty if the file holds no tokens)
    """
    m = NgramModel(n)
    # Explicit encoding so the corpus is decoded the same way on every
    # platform instead of depending on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    for sentence in text.split('.'):
        # Splitting leaves empty fragments after the final full stop and
        # between the dots of an ellipsis; adding them would teach the model
        # that a sentence can consist of a lone '.'.
        if not sentence.strip():
            continue
        # add back the fullstop
        m.update(sentence + '.')
    return m


In [None]:
if __name__ == "__main__":
    # Build a trigram model from the corpus file and report the wall-clock time.
    start = time.time()
    m = create_ngram_model(3, 'ngram_text.csv')
    print(f'Language Model creating time: {time.time() - start}')

    start = time.time()
    # Fixed seed so the sample below is repeatable.
    random.seed(7)
    separator = "=" * 50
    print(f'{separator}\nGenerated text:')
    print(m.generate_text(20))
    print(f'{separator}')


Language Model creating time: 4.031937599182129
Generated text:
Did I mention Steve Benson does first Friday ' s . . I understand carding , but what it is


In [None]:
# How many synthetic reviews to produce and how many tokens each one gets.
N_FAKE_REVIEWS = 334
TOKENS_PER_REVIEW = 50

# Build the trigram model once. It depends only on the corpus file, so
# rebuilding it for every review (about 4 s each in the recorded run) only
# repeated the same work.
m = create_ngram_model(3, 'ngram_text.csv')

generated_text = []
for i in range(N_FAKE_REVIEWS):
    text = m.generate_text(TOKENS_PER_REVIEW)
    generated_text.append(text)


In [None]:
# Show full cell contents in DataFrame displays instead of truncating long text.
pd.options.display.max_colwidth = None

# One generated review per row, in a single column labelled 0.
txt = pd.DataFrame(generated_text)
txt.head()

Unnamed: 0,0
0,"seriously ! I definitely don ' t stop anyone from going . It ' s a mall . Christopher ' s and the price . Like any retail store , you ' re open really late . Servers walked by the food makes up for it on yelp and my"
1,""" "" Unless you go but here it ' s a bit . The best fast food play area toward the top of things . I finally was cooling down . 00 . I love it ! Best bang for your money . Not sure I would rather starve than"
2,"I guess I just can ' t care at all ! "" "" I ' ve had Chinese food . Good as usual even though it ' s behind the kitchen . "" This place has gone DOWNHILL , big time . All in all of the restaurants are excellent"
3,"There were some of the private dining room . "" "" I brought my toddler , who you talked to a book , alongside the buffalo sauce . Plus the "" "" regular "" "" Ok , technically this is yet another obligatory baklava was offered . . When we"
4,". They were pretty good . . "" Way overpriced . although it was still better than any dive shop before , I went here for lunch today . Would have been in , get cheap gas and did my research . Dolmades - tasted really fresh . A few"


In [None]:
# Sanity check on (rows, columns).
# NOTE(review): the recorded output is (49, 1) although the generation loop
# asks for 334 reviews — presumably that cell was interrupted or run with
# different settings; re-run the notebook top to bottom to confirm.
txt.shape

(49, 1)

In [None]:
# Colab-only: mount Google Drive under ./drive so results can be copied out
# of the runtime's working directory.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
# Save the generated reviews as CSV, one review per row, without the index
# column or a header row.
txt.to_csv('ngram_fake_reviews_pauline.csv',index=False, header=False)
# IPython shell escape: copy the file into the mounted Drive folder
# (requires the drive.mount('drive') cell to have run first).
!cp ngram_fake_reviews_pauline.csv "drive/My Drive/"

N-Gram with train.csv 
