<a href="https://colab.research.google.com/github/mdjamina/generate_reviews/blob/main/src/project_report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from collections import defaultdict

**Project Semester 1**

Estimating a Language Model to Generate Wine Reviews

To get the probability of the word $w_{i}$ given the two words that precede it, we use the conditional probability formula.
Conditional probability is the probability of an event occurring given that one or more other events have already occurred.

Here is the formula:

$p(w_{i}|w_{i-1},w_{i-2}) = \frac{p(w_{i},w_{i-1},w_{i-2})}{p(w_{i-1},w_{i-2})}$

## Tokenization

In [None]:
def tokenization(string):
  """
  Split a text into word and punctuation tokens.

  The text is split on whitespace. From each chunk, at most one leading
  opening mark and at most one trailing closing mark are detached into
  tokens of their own.

  Args:
  * string : text to split

  Returns:
  the list of tokens, in reading order (never contains an empty string)
  """
  # Normalise typographic apostrophes, then drop the space that precedes an
  # apostrophe so that it stays attached to the previous word.
  string = string.replace("’", "'").replace(" '", "'")
  pre_poncts = ('(', '[', '{', '"', '«')
  post_poncts = (')', ']', '}', '.', '?', ',', ';', '!', ':', '"', '»')

  line_tokens = []
  # split() without a separator never yields empty chunks, so every w holds
  # at least one character.
  for w in string.split():
    if len(w) > 1 and w[0] in pre_poncts:
      new_tokens = [w[0], w[1:]]
    else:
      new_tokens = [w]

    last = new_tokens[-1]
    # Something must remain in front of the closing mark: otherwise a chunk
    # such as "()" would leave an empty-string token behind.
    if len(last) > 1 and last[-1] in post_poncts:
      new_tokens[-1:] = [last[:-1], last[-1]]

    line_tokens += new_tokens
  return line_tokens

## Make trigrams

In [None]:
def make_trigrams(string):
  """
  Build every run of three consecutive tokens found in a text.

  Args:
    * string : text to tokenize and slide a three-token window over

  Returns:
    the list of (first, second, third) token tuples, in reading order;
    empty when the text holds fewer than three tokens
  """
  tokens = tokenization(string)
  # Zipping the token list with itself shifted by one and by two positions
  # stops at the shortest slice, so only complete triplets come out.
  return list(zip(tokens, tokens[1:], tokens[2:]))

In [None]:
# Sanity check: list the trigrams of a short example sentence.
sentence = "I love chocolate ice-cream."
make_trigrams(sentence)

[('I', 'love', 'chocolate'),
 ('love', 'chocolate', 'ice-cream'),
 ('chocolate', 'ice-cream', '.')]

### Trigrams counter


In [None]:

def trigrams_counter(corpus_path):
  """
  Build the trigram count table of a corpus file.

  The file is read line by line and each line is cut into trigrams with
  make_trigrams, so a trigram never spans two lines.

  Args:
    * corpus_path : path of the text file to read (decoded as UTF-8)

  Returns:
    a dict mapping each (first, second) word pair to a defaultdict(int)
    that gives, for every third word seen after that pair, its number of
    occurrences
  """
  dict_counter = {}
  # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
  # everywhere, and the tokenizer looks for non-ASCII characters.
  with open(corpus_path, 'r', encoding='utf-8') as corpus:
    for line in corpus:
      for first, second, third in make_trigrams(line):
        key = (first, second)
        if key not in dict_counter:
          dict_counter[key] = defaultdict(int)
        dict_counter[key][third] += 1
  return dict_counter

In [None]:
# Display the trigram count table built from the test corpus file.
trigrams_counter('./test_reviews.txt')

{('BEGIN', 'NOW'): defaultdict(int, {'I': 2}),
 ('I', 'do'): defaultdict(int, {'not': 1}),
 ('I', 'like'): defaultdict(int, {'chocolate': 1}),
 ('NOW', 'I'): defaultdict(int, {'do': 1, 'like': 1}),
 ('chocolate', 'ice-cream'): defaultdict(int, {'.': 1}),
 ('chocolate', 'pudding'): defaultdict(int, {'.': 1}),
 ('do', 'not'): defaultdict(int, {'like': 1}),
 ('ice-cream', '.'): defaultdict(int, {'END': 1}),
 ('like', 'chocolate'): defaultdict(int, {'ice-cream': 1, 'pudding': 1}),
 ('not', 'like'): defaultdict(int, {'chocolate': 1}),
 ('pudding', '.'): defaultdict(int, {'END': 1})}

## Estimate the probabilities

In [None]:
def make_conditional_probas(corpus_path):
  """
  Estimate, from a corpus file, the probability of each word given the
  two words that precede it.

  Args:
    * corpus_path : path of the corpus file, passed to trigrams_counter

  Returns:
    a dict mapping each (first, second) word pair to a
    {third_word: probability} dict; each probability is the count of the
    trigram divided by the total count of trigrams sharing that pair
  """
  probabilities = {}
  for history, followers in trigrams_counter(corpus_path).items():
    # Number of times this two-word history was followed by any word.
    total = sum(followers.values())
    probabilities[history] = {
        word: occurrences / total
        for word, occurrences in followers.items()
    }
  return probabilities

In [None]:
# Display the conditional probabilities estimated from the test corpus file.
make_conditional_probas('./test_reviews.txt')

{('BEGIN', 'NOW'): {'I': 1.0},
 ('I', 'do'): {'not': 1.0},
 ('I', 'like'): {'chocolate': 1.0},
 ('NOW', 'I'): {'do': 0.5, 'like': 0.5},
 ('chocolate', 'ice-cream'): {'.': 1.0},
 ('chocolate', 'pudding'): {'.': 1.0},
 ('do', 'not'): {'like': 1.0},
 ('ice-cream', '.'): {'END': 1.0},
 ('like', 'chocolate'): {'ice-cream': 0.5, 'pudding': 0.5},
 ('not', 'like'): {'chocolate': 1.0},
 ('pudding', '.'): {'END': 1.0}}

## 2. Generation

In [None]:
def sample_from_discrete_distrib(distrib):
  """
  Draw one key of a {outcome: weight} dict at random.

  The weights are rescaled so that they sum to one before being handed to
  np.random.choice, so they need not be normalised beforehand.

  Args:
    * distrib : non-empty dict mapping each outcome to its weight

  Returns:
    the outcome selected by np.random.choice
  """
  outcomes, weights = zip(*distrib.items())
  total = np.sum(weights)
  normalized = np.asarray(weights).astype('float64') / total
  return np.random.choice(outcomes, p=normalized)

### 1. initialize the history

To initialise the history, we must use the bigram ('BEGIN', 'NOW').

In [None]:
# Estimate the model from './wine2.txt'; co_probas is reused by the cells below.
co_probas = make_conditional_probas('./wine2.txt')

# Sample one word that follows the initial history ('BEGIN', 'NOW').
sample_from_discrete_distrib(co_probas[('BEGIN', 'NOW')])

'Focused'

### 2. The generate function implementation


In [None]:
def generate(conditional_probas):
  """
  Generate one sentence by walking the trigram model iteratively.

  Starting from the history ('BEGIN', 'NOW'), the next word is sampled
  from the distribution attached to the last two words, again and again,
  until the 'END' marker is drawn.

  Args:
    * conditional_probas : dict mapping a (word, word) history to a
      {next_word: probability} dict

  Returns:
    the generated sentence, without the 'END' marker; the empty string
    when 'END' is the very first word drawn

  Raises:
    KeyError when a history reached during generation has no entry in
    conditional_probas
  """
  # init history
  w_i2, w_i1 = 'BEGIN', 'NOW'
  pieces = []

  while True:
    w_i = sample_from_discrete_distrib(conditional_probas[(w_i2, w_i1)])
    # Stop before emitting the marker, including when it is drawn first
    # (the sentence is then empty rather than the literal 'END').
    if w_i == 'END':
      break

    # No space before "." or ",", nor before the first word, whose
    # predecessor is the 'NOW' marker.
    if (w_i in ".,") or w_i1 == 'NOW':
      ws = ''
    else:
      ws = ' '
    pieces.append(ws + w_i)

    # slide the two-word history forward
    w_i2, w_i1 = w_i1, w_i

  return ''.join(pieces)

In [None]:
# Generate one sentence with the iterative implementation.
generate(co_probas)

'4149 Daily Wine Picks found in this category.'

In [None]:
def generate(w_i2,w_i1,conditional_probas):
  """
  Generate the rest of a sentence recursively from a two-word history.

  One word is sampled from the distribution attached to (w_i2, w_i1); the
  function then calls itself with the history shifted by one word, until
  the 'END' marker is drawn.

  Args:
    * w_i2 : the word before last of the history
    * w_i1 : the last word of the history
    * conditional_probas : dict mapping a (word, word) history to a
      {next_word: probability} dict

  Returns:
    the generated text, without the 'END' marker; the empty string when
    'END' is drawn immediately
  """
  followers = conditional_probas[(w_i2, w_i1)]
  word = sample_from_discrete_distrib(followers)

  # Base case: the end marker closes the sentence and is not emitted.
  if word == 'END':
    return ''

  # No space before "." or ",", nor right after the 'NOW' marker.
  separator = '' if ((word in ".,") or w_i1 == 'NOW') else ' '

  return separator + word + generate(w_i1, word, conditional_probas)


In [None]:
# Generate one sentence with the recursive implementation, from the initial history.
generate('BEGIN','NOW',co_probas)

'4079 Daily Wine Picks found in this category.'

By a stroke of luck, both implementations produced almost the same sentence (only the leading number differs) ✌