In [None]:
import nltk
import pandas as pd
import csv
import string
from textblob_de import TextBlobDE as TBD
from textblob import TextBlob as TBE
import spacy
import language_check
from scipy.spatial.distance import cdist
from laserembeddings import Laser
import kiwi
import utils

# Load pre-trained nlp models
# spaCy pipelines: `sp_en` is applied to the 'src' column and `sp_de` to the
# 'tgt' column further down in this cell.
sp_en = spacy.load("en")
sp_de = spacy.load("de")
# LanguageTool checkers (language_check) for British English and German (Germany);
# used below to collect grammar-check results for 'src' and 'tgt' respectively.
en_checker = language_check.LanguageTool('en-GB')
ge_checker = language_check.LanguageTool('de-DE')

# Utils
def pos_parser(x, y):
    """Return the total absolute difference in ``tag_`` counts between x and y.

    Args:
        x: iterable of token-like objects exposing a ``tag_`` attribute.
        y: iterable of token-like objects exposing a ``tag_`` attribute.

    Returns:
        int: sum over every tag seen on either side of
        ``abs(count_in_x - count_in_y)``; a tag that is missing on one side
        counts as 0 there. Tags listed in ``blacklist`` are ignored.
    """
    blacklist = ['.']

    def tally(tokens):
        # Count occurrences of each non-blacklisted tag in a single pass.
        counts = {}
        for token in tokens:
            tag = token.tag_
            if tag not in blacklist:
                counts[tag] = counts.get(tag, 0) + 1
        return counts

    x_counts = tally(x)
    y_counts = tally(y)
    # Walk the union of tags so that tags present on only one side (whichever
    # side that is) still contribute to the difference.
    return sum(
        abs(x_counts.get(tag, 0) - y_counts.get(tag, 0))
        for tag in set(x_counts) | set(y_counts)
    )
def spacy_parser(x, y, mode='pos_'):
    """Return the total absolute difference in attribute counts between x and y.

    Args:
        x: with ``mode='ents'`` an object exposing ``.ents``; otherwise an
            iterable of objects exposing the attribute named by ``mode``.
        y: same kind of object as ``x`` for the other side.
        mode: attribute to compare. ``'ents'`` switches to the ``label_`` of
            each item in ``.ents``; any other value is read directly from the
            items of ``x`` and ``y`` (default ``'pos_'``).

    Returns:
        int: sum over every value seen on either side of
        ``abs(count_in_x - count_in_y)``; a value missing on one side counts
        as 0 there. Falsy attribute values (e.g. ``''``) are skipped.
    """
    # Models don't have the same entities
    whitelist = ['PER', 'PERSON', 'LOC', 'ORG']
    if mode in ['ents']:
        mode = 'label_'
        x = x.ents
        y = y.ents

    def tally(items):
        # Count occurrences of each non-empty attribute value in a single pass.
        counts = {}
        for item in items:
            value = getattr(item, mode)
            if value:
                counts[value] = counts.get(value, 0) + 1
        return counts

    x_counts = tally(x)
    y_counts = tally(y)

    if mode in ['label_']:
        for counts in (x_counts, y_counts):
            # Fold 'PERSON' into 'PER' on BOTH sides, adding to (not replacing)
            # any 'PER' count that is already there.
            if 'PERSON' in counts:
                counts['PER'] = counts.get('PER', 0) + counts.pop('PERSON')
        x_counts = {k: v for k, v in x_counts.items() if k in whitelist}
        y_counts = {k: v for k, v in y_counts.items() if k in whitelist}

    # Walk the union of keys so that values present on only one side
    # (whichever side that is) still contribute to the difference.
    return sum(
        abs(x_counts.get(key, 0) - y_counts.get(key, 0))
        for key in set(x_counts) | set(y_counts)
    )

# --- Load the data: one record per line, no quoting, no header ---
# NOTE(review): `error_bad_lines` is deprecated/removed in newer pandas releases —
# confirm against the pandas version this notebook is pinned to.
src = pd.read_csv('en-de/train.ende.src', sep="\n", error_bad_lines=False, quoting=csv.QUOTE_NONE, header=None)
target = pd.read_csv('en-de/train.ende.mt', sep="\n", error_bad_lines=False,quoting=csv.QUOTE_NONE, header=None)
scores = pd.read_csv('en-de/train.ende.scores', sep="\n", error_bad_lines=False,quoting=csv.QUOTE_NONE, header=None)
df = src.rename(columns={0:'src'})
df['tgt'] = target
# df = df.head(10)

# --- Length features on lower-cased, punctuation-free text ---
# Remove punctuation (translation table built once instead of once per cell)
punct_table = str.maketrans('', '', string.punctuation)
df[['src_p', 'tgt_p']] = df[['src', 'tgt']].applymap(lambda x: x.lower().translate(punct_table))
# split() rather than split(' '): stripping punctuation can leave consecutive
# spaces, and split(' ') would count the resulting empty strings as tokens.
df['src_len'] = df['src_p'].apply(lambda x: len(x.split()))
df['tgt_len'] = df['tgt_p'].apply(lambda x: len(x.split()))
# Two-row summary frame (index 'src' / 'tgt') holding the mean token count per side.
df1 = pd.DataFrame({'src':[], 'tgt':[]}).transpose()
df1['avg_tkn_len'] = df[['src_len', 'tgt_len']].mean().tolist()

# --- Punctuation counts on the raw text ---
def count(l1, l2):
    """Return how many items of l1 are members of l2."""
    return sum(1 for x in l1 if x in l2)

punct_chars = set(string.punctuation)
df['src_#punc'] = df['src'].apply(lambda x: count(x, punct_chars))
df['tgt_#punc'] = df['tgt'].apply(lambda x: count(x, punct_chars))

# --- Sentiment polarity (TextBlobDE for the target, TextBlob for the source) ---
df['tgt_polar'] = df['tgt'].apply(lambda x: TBD(x).sentiment.polarity)
df['src_polar'] = df['src'].apply(lambda x: TBE(x).sentiment.polarity)
df['polar_dff'] = (df['tgt_polar']-df['src_polar']).abs()

# --- spaCy parses, grammar-check results, and count-difference features ---
df['src_sp'] = df['src'].apply(lambda x: sp_en(x))
df['tgt_sp'] = df['tgt'].apply(lambda x: sp_de(x))
df['src_gram_err'] = df['src'].apply(lambda x: en_checker.check(x))
df['tgt_gram_err'] = df['tgt'].apply(lambda x: ge_checker.check(x))
df['sp_pos_diff'] = [spacy_parser(x,y, 'pos_') for x,y in zip(df['src_sp'], df['tgt_sp'])]
df['sp_ent_diff'] = [spacy_parser(x,y, 'ents') for x,y in zip(df['src_sp'], df['tgt_sp'])]

# --- Laser embeddings ---
# Instantiate the encoder here: previously `laser` was only created in a later
# cell, so this cell failed with NameError on a fresh kernel.
laser = Laser()
x = laser.embed_sentences(df['src'].tolist(), lang='en')
y = laser.embed_sentences(df['tgt'].tolist(), lang='de')
df['src_laser_embed'] = x.tolist()
df['tgt_laser_embed'] = y.tolist()
# Laser cosine distance, one (source, target) pair per row. Computed straight
# from the embedding arrays, so this cell no longer needs `np`, which is only
# imported in a later cell.
df['laser_embed_cdist'] = [
    cdist(src_vec.reshape(1,-1), tgt_vec.reshape(1,-1), 'cosine')[0][0]
    for src_vec, tgt_vec in zip(x, y)
]

# --- Openkiwi ---
OK_url = 'https://github.com/unbabel/KiwiCutter/releases/download/v1.0/estimator_en_de.torch.zip'
utils.download_kiwi(OK_url)
model = kiwi.load_model('trained_models/estimator_en_de.torch/estimator_en_de.torch')
examples = {'source': df['src'].tolist(),'target': df['tgt'].tolist()}
predictions = model.predict(examples)
df['openkiwi_score'] = predictions['sentence_scores']

# Attach the scores read from 'train.ende.scores' last.
df['scores'] = scores

In [None]:
# Reload `df` from the local pickle file 'df.pkl', replacing any `df` already in memory.
# NOTE(review): unpickling can execute arbitrary code — only load a df.pkl you created yourself.
df = pd.read_pickle('df.pkl')

In [None]:
import torch
# Load the 'xlmr.large' entry point from the 'pytorch/fairseq' torch.hub repository.
# NOTE(review): in the cells visible here `xlmr` is only referenced from
# commented-out code — confirm this load is still needed.
xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')


In [None]:
import numpy as np


In [None]:
# df.to_pickle('df.pkl')
# dist = cdist(x,y, 'cosine')

# for i in df.index:
#     df.loc[i, 'src_laser_embed'] = x[i]
# x.tolist()

In [None]:
# Cosine distance between the stored source and target LASER embeddings,
# computed one row at a time and written to 'laser_embed_cdist'.
df['laser_embed_cdist'] = [
    cdist(
        np.array(src_vec).reshape(1,-1),
        np.array(tgt_vec).reshape(1,-1),
        'cosine')[0][0]
    for src_vec, tgt_vec in zip(df['src_laser_embed'], df['tgt_laser_embed'])
]

In [None]:
# Absolute value of the LASER cosine-distance column.
# NOTE(review): in the df.corr() output shown further down, this column correlates
# 1.0 with 'laser_embed_cdist' and has identical correlations with every other
# feature, so it looks redundant — confirm before keeping it as a feature.
df['abs_laser_embed_cdist'] = df['laser_embed_cdist'].abs()

In [6]:
import pandas as pd
# Reload `df` from the local pickle file 'df.pkl'.
# NOTE(review): unpickling can execute arbitrary code — only load a df.pkl you created yourself.
df = pd.read_pickle('df.pkl')
# x = np.array(xlmr.encode('banana').tolist()).reshape(1,-1)
# y = np.array(xlmr.encode('vegetable').tolist()).reshape(1,-1)
# print(x, y)

In [None]:
from scipy.spatial.distance import cdist
from laserembeddings import Laser
# LASER demo: embed one English and one French word with a per-sentence
# language list.
laser = Laser()
embeddings = laser.embed_sentences(
    # 'légume' was stored as the mojibake 'l√©gume' (UTF-8 bytes decoded with
    # the wrong codec), which would embed garbage instead of the French word.
    ['apple', 'légume'],
    lang=['en', 'fr'])
# cdist(x, y, 'cosine')
# Display the shape of a single sentence embedding.
embeddings[0].shape

[nltk_data] Downloading package punkt to /Users/ludo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Getting filename
Checking if file already downloaded
Downloading
estimator_en_de.torch.zip: 360MB [01:40, 3.57MB/s]                               
Download has finished.
Extracting trained_models/estimator_en_de.torch.zip
Done extracting


In [None]:
# Distance between the two demo embeddings, each reshaped to a single-row matrix.
# NOTE(review): no metric is passed here, so scipy's default metric applies,
# unlike the explicit 'cosine' calls elsewhere in this notebook — confirm this is intended.
cdist(embeddings[0].reshape(1,-1), embeddings[1].reshape(1,-1))
# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')


In [8]:
# Pairwise correlation matrix of the feature columns; the captured output below
# lists the length, punctuation, polarity, spaCy-diff, LASER, OpenKiwi and score columns.
df.corr()

Unnamed: 0,src_len,tgt_len,src_#punc,tgt_#punc,tgt_polar,src_polar,sp_pos_diff,sp_ent_diff,scores,laser_embed_cdist,abs_laser_embed_cdist,polar_dff,openkiwi_score
src_len,1.0,0.930897,0.251329,0.282172,0.015372,0.034175,0.470674,0.234507,-0.048007,-0.163272,-0.163272,0.12623,0.15697
tgt_len,0.930897,1.0,0.261039,0.301643,0.000708,0.021773,0.497385,0.23348,-0.045259,-0.177125,-0.177125,0.145774,0.225158
src_#punc,0.251329,0.261039,1.0,0.836125,-0.020326,0.005937,0.229136,0.216247,-0.019488,-0.192414,-0.192414,-0.008147,0.112684
tgt_#punc,0.282172,0.301643,0.836125,1.0,-0.014515,0.001927,0.25375,0.186801,-0.026848,-0.196261,-0.196261,0.004103,0.133771
tgt_polar,0.015372,0.000708,-0.020326,-0.014515,1.0,0.213635,0.026687,0.012901,-0.010209,0.004593,0.004593,-0.091737,-0.023598
src_polar,0.034175,0.021773,0.005937,0.001927,0.213635,1.0,0.033873,-0.008617,0.028852,-0.004833,-0.004833,0.129515,-0.002234
sp_pos_diff,0.470674,0.497385,0.229136,0.25375,0.026687,0.033873,1.0,0.180311,-0.034278,-0.074396,-0.074396,0.043962,0.203629
sp_ent_diff,0.234507,0.23348,0.216247,0.186801,0.012901,-0.008617,0.180311,1.0,-0.014452,-0.192739,-0.192739,-0.034608,0.084963
scores,-0.048007,-0.045259,-0.019488,-0.026848,-0.010209,0.028852,-0.034278,-0.014452,1.0,-0.054626,-0.054626,-0.021223,-0.022113
laser_embed_cdist,-0.163272,-0.177125,-0.192414,-0.196261,0.004593,-0.004833,-0.074396,-0.192739,-0.054626,1.0,1.0,0.045329,-0.015807


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER polarity scores for the first element of `eng`.
# NOTE(review): `eng` is not assigned in any visible cell — the only assignment
# (`eng = preprocessing(src)`) is commented out — so this cell raises NameError
# on a fresh kernel. Confirm which text it should score.
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(eng[0])

# Features

Sentiment analysis
Tense (future, past, present)
Grammatical sanity
Possessive features (like Ludo<'s>)


In [None]:
Do we assume the English text to be accurate?
Approx same number of rules
English
Variants for: Australian, Canadian, GB, New Zealand, South African, US	2790
German
Variants for: Austria, Germany, Swiss	2894
http://wiki.languagetool.org/development-overview#toc0
    
Preprocess by cleaning both texts of grammar errors.
import string
def preprocessing(lst):
    """Return a list holding the first element (``item[0]``) of every item in ``lst``.

    No punctuation stripping, lower-casing or stop-word removal is applied;
    each item is only indexed at position 0.
    """
    first_elements = []
    for item in lst:
        first_elements.append(item[0])
    return first_elements
# eng = preprocessing(src)
# ger = preprocessing(target)

In [None]:
# Downloading language-check requires installing Java 8 and configuring permissions; see:
https://stackoverflow.com/questions/24342886/how-to-install-java-8-on-mac
https://github.com/myint/language-check/issues/59
https://github.com/myint/language-check/issues/31
https://stackoverflow.com/questions/40684543/how-to-make-python-use-ca-certificates-from-mac-os-truststore