In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import difflib                        # A library that computes the difference between two strings, unused (was for further exploration and info)

In [2]:
def look_changes(morphs, probs, threshold):
  """
  Naive candidate search: score each recorded change by the attributes it shares with `morphs`.

  For every change in `probs`, the values stored for the attributes listed in
  `morphs` are added up; values below `threshold` are ignored.
  :params:  * morphs: iterable of morphological attributes (typically from the test data set)
            * probs: dictionary mapping each change to a dictionary {attribute: value}
            * threshold: minimum value an attribute must reach to be counted
  :returns: A dictionary mapping every change that matched at least one attribute
  to the accumulated value of its matching attributes
  """
  accumulated = {}
  for change, attr_values in probs.items():
    for attribute in morphs:
      if attribute not in attr_values:
        continue
      value = attr_values[attribute]
      if not value >= threshold:
        continue
      # First hit creates the entry, later hits add to it
      accumulated[change] = accumulated[change] + value if change in accumulated else value
  return accumulated

In [3]:
def stringDif(s, c):
  """
  Derive the "prefix-suffix" change pattern that relates the form s to the lemma c.

  The longest prefix of c that occurs in s is looked up, and every occurrence
  of it in s is replaced with "-".
  :params: s for the form and c for the lemma
  :return: s with the matched lemma prefix replaced by "-"; s itself when no
  prefix of c occurs in s (this includes an empty lemma)
  """
  # Information is lost here (only a prefix of the lemma is matched); how to recover it is an open question.
  for end in range(len(c), 0, -1):
    stem = c[:end]
    if stem in s:
      # Longest matching prefix found: replace it and stop
      return s.replace(stem, '-')
  # No prefix matched, or c is empty: nothing to replace
  return s

In [4]:
def look_obvious_changes(morphs, probs):
  """
  Candidate search restricted to exact matches.

  A change is kept only when the set of attributes recorded for it is *exactly*
  the set of attributes in `morphs`.
  :params:  * morphs: (String) morphological attributes separated by ";"
            * probs: dictionary mapping each change to a dictionary keyed by attribute
  :returns: A dictionary mapping every exactly-matching change to 1; empty when
  no change matches (examples unseen in the training set are simply given up on)
  """
  return {
    change: 1
    for change, attrs in probs.items()
    if set(morphs.split(";")) == set(attrs.keys())
  }

def look_less_obvious_changes(morphs, probs):
  """
  Looser variant of the exact-match search.

  A change scores 2 when every attribute of `morphs` (a ";"-separated string)
  is recorded for it, 1 when at least one is, and is left out otherwise.
  These weights feed the probabilistic selection done later on.
  """
  scores = {}
  for change, attrs in probs.items():
    hits = [attribute in attrs for attribute in morphs.split(";")]
    if all(hits):
      # Every requested attribute was seen with this change: amplified weight
      weight = 2
    elif any(hits):
      # Only part of the attributes was seen: weaker evidence
      weight = 1
    else:
      continue
    scores[change] = scores.get(change, 0) + weight
  return scores

def select_less_obvious(lemma, morphs, corresp, corresp2, corresp3, dist):
  """
    A more advanced (but still naive) selection that uses information learned
    from the training set to pick one candidate change.
    :params:  * lemma: the lemma to inflect
              * morphs: (String) morphological attributes separated by ";"
              * corresp: maps each change "prefix-suffix" to a dictionary keyed by the attributes seen with it
              * corresp2: maps each change to its number of occurrences in the training data set
              * corresp3: maps each change to the list of training lemmas it was seen with
              * dist: callable dist(a, b) comparing two lemmas
    :return: The change with the highest score, where the score multiplies:
    * the attribute-match weight from look_less_obvious_changes (2 or 1)
    * the number of occurrences of the change in the training set
    * max(dists) - mean(dists), dists being dist(training lemma, lemma) over the lemmas of that change
    Returns '-' (the same fallback as selectMaxFromDict) when no change shares an attribute with `morphs`.
  """
  # Only the keys of corresp are inspected by look_less_obvious_changes, so it is
  # passed as-is: no need to renormalise it (and rewrite the caller's data) on every call.
  candidates = look_less_obvious_changes(morphs, corresp)
  if not candidates:
    # np.argmax would raise on an empty sequence
    return '-'
  scores = dict()
  for change, weight in candidates.items():
    dists = [dist(known, lemma) for known in corresp3[change]]
    scores[change] = weight * corresp2[change] * (np.max(dists) - np.mean(dists))
  best = np.argmax(list(scores.values()))
  return np.array(list(scores.keys()))[best]

In [5]:
def createTrainDatasetFromFD(fd):
  """
    Reads tab-separated lines "lemma<TAB>form<TAB>attributes" and builds the
    tuple (lemmas, forms, M.A.) together with the tuple (corresp, corresp2, corresp3):
              * corresp: maps each change "prefix-suffix" to {attribute: number of occurrences with that change}
              * corresp2: maps each change to its number of occurrences in the data set
              * corresp3: maps each change to the list of lemmas it was seen with
    Blank lines are skipped.
    :params: fd is an open text file object (anything providing readline())
    :raises: ValueError when a non-blank line does not have exactly three tab-separated fields
  """
  # Read the words
  ws, fs, rs = list(), list(), list()
  line = fd.readline()
  while line:
    # A blank line (e.g. a trailing newline at the end of the file) carries no sample
    if line.strip():
      w, f, r = [field.strip() for field in line.split('\t')]
      ws.append(w); fs.append(f); rs.append(r)
    line = fd.readline()
  # Group the samples by rule (prefix-suffix) and record attributes, counts and lemmas per rule
  corresp = dict()
  corresp2 = dict()
  corresp3 = dict()
  for w, f, r in zip(ws, fs, rs):
    rule = stringDif(f, w)
    attributes = r.split(";")
    if rule in corresp:
      corresp2[rule] += 1
      corresp3[rule].append(w)
      for k in attributes:
        corresp[rule][k] = corresp[rule].get(k, 0) + 1
    else:
      corresp[rule] = {key: 1 for key in attributes}
      corresp2[rule] = 1
      corresp3[rule] = [w]
  return (ws, fs, rs), (corresp, corresp2, corresp3)

def testDatasetFromFD(fd):
  """
    In the case of test datasets, the tuple (lemma, M.A) only is returned
  """
  l = fd.readline()
  ws, rs = list(), list()
  while l:
      w, r = list( map(lambda a : a.strip(), l.split('\t')) )
      ws.append(w); rs.append(r)
      l = fd.readline()
  return (ws, rs)

def fromCorrespToProbs(corresp):
  """
    Converts the occurrences in corresp to probabilities: each inner value is
    divided by the sum of the values of its inner dictionary.
    A new nested dictionary is returned; corresp itself is left untouched.
    :Example:
    > corresp = {"pre-é": {"V":7, "Noun":3}, "dé-é" : {"V":8, "Adj":2}}
    > probs   = {"pre-é": {"V":0.7, "Noun":0.3}, "dé-é" : {"V":0.8, "Adj":0.2}}
  """
  probs = dict()
  for change, counts in corresp.items():
    total = sum(counts.values())
    # Build a fresh inner dictionary: a shallow copy of corresp would share (and overwrite) the caller's counts
    probs[change] = {attribute: count / total for attribute, count in counts.items()}
  return probs

In [6]:
def selectMaxFromDict(dictio):
  """
    Selects the key that corresponds to the highest value in a dictionary.
    On ties the first such key (in dictionary order) is returned.
    :return: the selected key (as a numpy scalar), or '-' when the dictionary is empty
  """
  if not dictio:
    return '-'
  ks = np.array(list(dictio.keys()))
  vs = np.array(list(dictio.values()))
  # argmax gives the position of the largest value (an ascending argsort followed by [0] would give the smallest)
  return ks[np.argmax(vs)]

def selectFromDict(dictio, threshold):
  """
    Keeps the keys of a dictionary whose values are >= threshold.
    :return: numpy array of the retained keys, ordered by increasing value
  """
  keys = np.array(list(dictio.keys()))
  values = np.array(list(dictio.values()))
  keep = values >= threshold
  kept_keys, kept_values = keys[keep], values[keep]
  return kept_keys[np.argsort(kept_values)]
  

def lemmasAndChanges(ws, fs, rs):
  """
    From (lemma, form, M.A.) return (lemma, changes, M.A.), where each change is
    stringDif(form, lemma) for the form/lemma pair at the same position.
  """
  # Unpack each (form, lemma) pair explicitly: map() over zip() would hand the
  # two-argument callable a single tuple and raise TypeError
  cs = [stringDif(f, w) for f, w in zip(fs, ws)]
  return ws, cs, rs

def distanceFromLemma(lemma1, lemma2):
  """
    An example of a distance between two lemmas.

    The distance is the sum of the absolute code-point differences over the
    positions both lemmas share, plus the sum of the code points of the extra
    characters of the longer lemma. Empty lemmas are accepted.
  """
  shared = min(len(lemma1), len(lemma2))
  # Character-wise differences over the common length (zip stops at the shorter lemma)
  d = sum(abs(ord(a) - ord(b)) for a, b in zip(lemma1, lemma2))
  longer = lemma1 if len(lemma1) > len(lemma2) else lemma2
  # Add the code points of whatever the longer lemma has beyond the common length
  reste = sum(ord(ch) for ch in longer[shared:])
  return d + reste


def distanceFromLemma_(lemma1, lemma2):
  """
    Another example, which looks only at whole-word equality and at the first
    and last two characters (no character-wise distance).

    When both lemmas have at least two characters the result is
    0.5 * (lemmas equal) + 0.25 * (same first two chars) + 0.25 * (same last two chars);
    otherwise it is 1 if the lemmas are equal and 0 if not.
  """
  identical = lemma1 == lemma2
  if min(len(lemma1), len(lemma2)) < 2:
    return int(identical)
  same_head = lemma1[:2] == lemma2[:2]
  same_tail = lemma1[-2:] == lemma2[-2:]
  return 0.5 * identical + 0.25 * (same_head + same_tail)

In [9]:
def printAccuracies(fdTrain, fdTest):
  """
    Trains on fdTrain, evaluates on fdTest and prints the data set sizes and four accuracies.

    Both files are read with createTrainDatasetFromFD, so the test file must also
    hold (lemma, form, M.A.) lines. Two selection strategies are compared:
    select_less_obvious (lemma-aware) and the exact attribute match of
    look_obvious_changes, each scored on the full word and on the "prefix-suffix" change.
  """
  # Training data and dependencies
  (ws, fs, rs), (corresp, corresp2, corresp3) = createTrainDatasetFromFD(fdTrain)
  print("training=", len(ws))
  probs = fromCorrespToProbs(corresp)
  # Test data
  (ws_t, fs_t, rs_t), _ = createTrainDatasetFromFD(fdTest)
  print("Test=", len(ws_t))

  # Combined distance: character-wise distance minus the head/tail similarity bonus
  def dist(a, b):
    return distanceFromLemma(a, b) - distanceFromLemma_(a, b)

  acc, acc2, acc3, acc4 = [], [], [], []
  for lemma, form, attributes in zip(ws_t, fs_t, rs_t):
    advanced = select_less_obvious(lemma, attributes, corresp, corresp2, corresp3, dist)
    exact = selectMaxFromDict(look_obvious_changes(attributes, probs))
    # Change-level comparison: the lemma itself is not taken into account
    expected_change = stringDif(form, lemma)
    acc3.append(expected_change == advanced)
    acc4.append(expected_change == exact)
    # Word-level comparison: substitute the lemma back into the predicted change
    acc.append(form == advanced.replace("-", lemma))
    acc2.append(form == exact.replace("-", lemma))

  # NOTE(review): acc/acc3 come from select_less_obvious and acc2/acc4 from the exact
  # match, yet they are printed under "no additional info" / "additional info"
  # respectively - confirm the labels are the intended way round.
  print(
    "full-word accuracies are: \n * Using no additional info:", np.mean(acc),
    "\n* Using additional info:", np.mean(acc2),
    "\n Prefix-Suffix accuracies are: \n * Using no additional info:", np.mean(acc3),
    "\n * Using additional info", np.mean(acc4)
  )

In [10]:
# Swahili: the files are opened in a `with` block so they are closed once the evaluation is done
with open("swa.trn") as fdTrain, open("swa.tst") as fdTest:
  print("For Swahli")
  printAccuracies(fdTrain, fdTest)
# Same evaluation for the "hil" data set
with open("hil.trn") as fdTrain, open("hil.tst") as fdTest:
  print("For Hil")
  printAccuracies(fdTrain, fdTest)

For Swahli
training= 3374
Test= 910
full-word accuracies are: 
 * Using no additional info: 0.9263736263736264 
* Using additional info: 1.0 
 Prefix-Suffix accuracies are: 
 * Using no additional info: 0.9263736263736264 
 * Using additional info 1.0
For Hil
training= 859
Test= 238
full-word accuracies are: 
 * Using no additional info: 0.08823529411764706 
* Using additional info: 0.46638655462184875 
 Prefix-Suffix accuracies are: 
 * Using no additional info: 0.09663865546218488 
 * Using additional info 0.4831932773109244


In [12]:
# "mlg" data set: the files are opened in a `with` block so they are closed once the evaluation is done
with open("mlg.trn") as fdTrain, open("mlg.tst") as fdTest:
  print("For MLG")
  printAccuracies(fdTrain, fdTest)

# Same evaluation for the "lug" data set
with open("lug.trn") as fdTrain, open("lug.tst") as fdTest:
  print("For Lug")
  printAccuracies(fdTrain, fdTest)

For MLG
training= 447
Test= 127
full-word accuracies are: 
 * Using no additional info: 0.9763779527559056 
* Using additional info: 1.0 
 Prefix-Suffix accuracies are: 
 * Using no additional info: 0.9763779527559056 
 * Using additional info 1.0
For Lug
training= 3420
Test= 977
full-word accuracies are: 
 * Using no additional info: 0.3940634595701126 
* Using additional info: 0.49437052200614123 
 Prefix-Suffix accuracies are: 
 * Using no additional info: 0.3930399181166837 
 * Using additional info 0.5066530194472876


In [None]:
# "krl" data set: the files are opened in a `with` block so they are closed once the evaluation is done
with open("krl.trn") as fdTrain, open("krl.tst") as fdTest:
  print("For KRL")
  printAccuracies(fdTrain, fdTest)

# Same evaluation for the "isl" data set
with open("isl.trn") as fdTrain, open("isl.tst") as fdTest:
  print("For Isl")
  printAccuracies(fdTrain, fdTest)