# Data Processing pipeline

- index of sentences
- full english sentence (without preprocessing)
- full german sentence  (without preprocessing)
- english sentence without stop words or punctuation
- german sentence without stop words or punctuation
- score
- verbs in english (separated by a space and lemmatized)
- verbs in german (separated by a space and lemmatized)
- adjectives in english (separated by a space and lemmatized)
- adjectives in german (separated by a space and lemmatized)
- common nouns in english (separated by a space and lemmatized)
- common nouns in german (separated by a space and lemmatized)
- Nouns of persons
- Entities or organizations


In [3]:
import pandas as pd
import spacy
import os

In [7]:
# nlp_en = spacy.load("en_core_web_md")
# nlp_ge = spacy.load("de_core_news_md")

import en_core_web_md
import de_core_news_md
# Load the English (en_core_web_md) and German (de_core_news_md) spaCy
# pipelines from their installed model packages.
nlp_en = en_core_web_md.load()
nlp_ge = de_core_news_md.load()

def spacy_analysis(sentence, nlp):
    """Print one line of token attributes for every token in ``sentence``.

    Args:
        sentence: Text passed to ``nlp``.
        nlp: Callable that turns the text into an iterable of tokens.

    Each printed line holds, separated by spaces: text, lemma, coarse POS,
    fine-grained tag, dependency label, shape, ``is_alpha``, ``is_stop`` and
    the shape of the token vector. Nothing is returned.
    """
    for tok in nlp(sentence):
        attributes = (
            tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
            tok.shape_, tok.is_alpha, tok.is_stop, tok.vector.shape,
        )
        print(*attributes)

In [8]:
def flatten(l):
    """Concatenate an iterable of iterables into one flat list (one level deep)."""
    return [item for sublist in l for item in sublist]


# some helper functions
def prepare_data(filename, encoding="utf-8"):
    """Read a whitespace-tokenised corpus and return its vocabulary and sentences.

    Args:
        filename: Path of a text file with one sentence per line.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters are read the same way on every platform.

    Returns:
        A ``(vocab, data)`` tuple. ``data`` is a list with one token list per
        non-blank line, each terminated by the ``'</s>'`` marker; ``vocab`` is
        the set of all tokens occurring in ``data`` (including ``'</s>'``).
    """
    # The context manager closes the file even if tokenisation fails.
    with open(filename, encoding=encoding) as handle:
        data = [line.split() + ['</s>'] for line in handle if line.strip()]
    corpus = flatten(data)
    vocab = set(corpus)
    return vocab, data

In [9]:
def extract_sentences(filename, lower=False, encoding="utf-8"):
    """Return the non-blank lines of a text file, stripped of outer whitespace.

    Args:
        filename: Path of a text file with one sentence per line.
        lower: When true, lower-case every line before stripping it.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters are read the same way on every platform.

    Returns:
        A list of strings, one per line that contains non-whitespace
        characters, in file order.
    """
    # The context manager closes the file as soon as the lines are read.
    with open(filename, encoding=encoding) as handle:
        return [
            (line.lower() if lower else line).strip()
            for line in handle
            if line.strip()
        ]

### Load Dataset

In [10]:
# sentences_en = pd.DataFrame(extract_sentences('../data/en_de/train.ende.src'),columns = ['sentences_en'])
# sentences_ge = pd.DataFrame(extract_sentences('../data/en_de/train.ende.mt'),columns = ['sentences_ge'])
# scores = pd.read_csv('../data/en_de/train.ende.scores',header=None)
# scores = scores.rename(columns={0:"scores"})

In [11]:
# dataset = pd.merge(sentences_en,sentences_ge,left_index=True,right_index=True)
# dataset = pd.merge(dataset,scores,left_index=True,right_index=True)

### Datasets (training and dev/validation splits)

In [43]:
# All en-de files are looked up in <parent of the working directory>/data/en-de.
_ende_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'en-de')

# --- training split: source sentences, machine translations, scores ---
path_train_en = os.path.join(_ende_dir, 'train.ende.src')
path_train_ge = os.path.join(_ende_dir, 'train.ende.mt')
path_train_scores = os.path.join(_ende_dir, 'train.ende.scores')

train_sentences_en = pd.DataFrame(extract_sentences(path_train_en), columns=['sentences_en'])
train_sentences_ge = pd.DataFrame(extract_sentences(path_train_ge), columns=['sentences_ge'])
train_scores = pd.read_csv(path_train_scores, header=None).rename(columns={0: "scores"})

# Rows are aligned purely by position (the default integer index).
train_dataset = (
    train_sentences_en
    .merge(train_sentences_ge, left_index=True, right_index=True)
    .merge(train_scores, left_index=True, right_index=True)
)

# --- dev split: same three files, same alignment ---
path_dev_en = os.path.join(_ende_dir, 'dev.ende.src')
path_dev_ge = os.path.join(_ende_dir, 'dev.ende.mt')
path_dev_scores = os.path.join(_ende_dir, 'dev.ende.scores')

dev_sentences_en = pd.DataFrame(extract_sentences(path_dev_en), columns=['sentences_en'])
dev_sentences_ge = pd.DataFrame(extract_sentences(path_dev_ge), columns=['sentences_ge'])
dev_scores = pd.read_csv(path_dev_scores, header=None).rename(columns={0: "scores"})

dev_dataset = (
    dev_sentences_en
    .merge(dev_sentences_ge, left_index=True, right_index=True)
    .merge(dev_scores, left_index=True, right_index=True)
)

# Stack train on top of dev and renumber the rows 0..n-1.
dataset = pd.concat([train_dataset, dev_dataset]).reset_index(drop=True)

### Sentence length

In [46]:
def sentence_length(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``.

    Splitting on any run of whitespace (rather than on single spaces) means an
    empty string has length 0 and repeated or leading/trailing spaces do not
    produce phantom empty tokens.

    Args:
        sentence: The text to measure.

    Returns:
        The token count as an ``int``.
    """
    return len(sentence.split())

In [None]:
def difference_length(a, b):
    """Return the absolute difference between the token counts of two strings.

    Tokens are whitespace-separated. An empty string counts as zero tokens, so
    comparing an empty word list with a one-word list yields 1 rather than 0.

    Args:
        a: First space-separated string.
        b: Second space-separated string.

    Returns:
        ``abs(number of tokens in a - number of tokens in b)`` as an ``int``.
    """
    return abs(len(a.split()) - len(b.split()))

In [47]:
# Per-row result of sentence_length applied to the English and German sentence columns.
dataset['english_sentence_length'] = dataset['sentences_en'].apply(sentence_length)
dataset['german_sentence_length'] = dataset['sentences_ge'].apply(sentence_length)

In [48]:
# Signed difference (English minus German), so the value is negative when the
# German sentence is the longer one; this is not an absolute difference.
dataset['sentence_length_difference'] = dataset['english_sentence_length'] - dataset['german_sentence_length']

### POS extraction

In [49]:
def extract_pos(pos, sentence, nlp):
    """Collect the surface forms of all tokens tagged with a given coarse POS.

    Args:
        pos: POS label compared against each token's ``pos_`` (e.g. ``"VERB"``).
        sentence: Text passed to ``nlp``.
        nlp: Callable that turns the text into an iterable of tokens.

    Returns:
        The matching tokens' ``text`` joined by single spaces, with outer
        whitespace stripped; an empty string when nothing matches.
    """
    matches = [tok.text for tok in nlp(sentence) if tok.pos_ == pos]
    return " ".join(matches).strip()

#### Add verbs

In [50]:
# dataset["german_verbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("VERB",x,nlp_ge))
# dataset["english_verbs"] = dataset.sentences_en.apply(lambda x:extract_pos("VERB",x,nlp_en))

In [114]:
# dataset['verbs_diff'] = dataset.apply(lambda row: difference_length(row['english_verbs'], row['german_verbs']),axis=1)

#### Add adjectives

In [51]:
# dataset["german_adjectives"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADJ",x,nlp_ge))
# dataset["english_adjectives"] = dataset.sentences_en.apply(lambda x:extract_pos("ADJ",x,nlp_en))

In [117]:
# dataset['adjectives_diff'] = dataset.apply(lambda row: difference_length(row['english_adjectives'], row['german_adjectives']),axis=1)

#### Add adverbs

In [52]:
# dataset["german_adverbs"] = dataset.sentences_ge.apply(lambda x:extract_pos("ADV",x,nlp_ge))
# dataset["english_adverbs"] = dataset.sentences_en.apply(lambda x:extract_pos("ADV",x,nlp_en))

In [118]:
# dataset['adverbs_diff'] = dataset.apply(lambda row: difference_length(row['english_adverbs'], row['german_adverbs']),axis=1)

#### Add nouns

In [53]:
# dataset["german_nouns"] = dataset.sentences_ge.apply(lambda x:extract_pos("NOUN",x,nlp_ge))
# dataset["english_nouns"] = dataset.sentences_en.apply(lambda x:extract_pos("NOUN",x,nlp_en))

In [119]:
# dataset['nouns_diff'] = dataset.apply(lambda row: difference_length(row['english_nouns'], row['german_nouns']),axis=1)

### Remove punctuation

In [54]:
def remove_punct(sentence, nlp):
    """Return ``sentence`` with every punctuation token dropped.

    Args:
        sentence: Text passed to ``nlp``.
        nlp: Callable that turns the text into an iterable of tokens.

    Returns:
        The ``text`` of all tokens whose ``is_punct`` flag is false, joined by
        single spaces and stripped of outer whitespace.
    """
    kept = [tok.text for tok in nlp(sentence) if not tok.is_punct]
    return " ".join(kept).strip()

In [55]:
# dataset['english_no_punctuation'] = dataset['sentences_en'].apply(lambda x: remove_punct(x, nlp_en))
# dataset['german_no_punctuation'] = dataset['sentences_ge'].apply(lambda x: remove_punct(x, nlp_ge))

### Remove stopwords

In [29]:
from spacy.lang.en.stop_words import STOP_WORDS as sw_en
from spacy.lang.de.stop_words import STOP_WORDS as sw_ge

In [30]:
def remove_sw(sentence, nlp):
    """Return ``sentence`` with the stop words of the pipeline's language removed.

    Args:
        sentence: Text to filter.
        nlp: Either the module-level ``nlp_en`` or ``nlp_ge`` pipeline; it
            selects the stop-word list (``sw_en`` or ``sw_ge``) and tokenises
            the sentence.

    Returns:
        The ``text`` of the remaining tokens joined by single spaces and
        stripped of outer whitespace. For any other ``nlp`` the message
        ``'Use valid language: en or ge'`` is returned instead.
    """
    # Pick the stop-word list before parsing so that an unsupported pipeline
    # does not cost a full parse. Identity is what distinguishes the pipelines.
    if nlp is nlp_en:
        stop_words = sw_en
    elif nlp is nlp_ge:
        stop_words = sw_ge
    else:
        # NOTE(review): returning the message keeps the original contract, but
        # with DataFrame.apply it ends up silently stored as a cell value.
        return('Use valid language: en or ge')

    # spaCy's STOP_WORDS entries are lower-case, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" would be kept.
    kept = [
        token.text
        for token in nlp(sentence)
        if token.text.lower() not in stop_words
    ]
    return " ".join(kept).strip()

In [56]:
# dataset['english_no_stop_words'] = dataset['sentences_en'].apply(lambda x: remove_sw(x, nlp_en))
# dataset['german_no_stop_words'] = dataset['sentences_ge'].apply(lambda x: remove_sw(x, nlp_ge))

### Lemmatize

In [32]:
def lemmatizer(sentence, nlp):
    """Return ``sentence`` with every token replaced by its lemma.

    Args:
        sentence: Text passed to ``nlp``.
        nlp: Callable that turns the text into an iterable of tokens.

    Returns:
        The ``lemma_`` of every token, joined by single spaces and stripped of
        outer whitespace.
    """
    lemmas = [tok.lemma_ for tok in nlp(sentence)]
    return " ".join(lemmas).strip()

In [57]:
# dataset['english_lemma'] = dataset['sentences_en'].apply (lambda x: lemmatizer(x, nlp_en))
# dataset['german_lemma'] = dataset['sentences_ge'].apply (lambda x: lemmatizer(x, nlp_ge))

### Sentiment Analysis

#### Sentences

In [34]:
from textblob import TextBlob as textblob_en
from textblob_de import TextBlobDE as textblob_ge

In [35]:
def sentiment(sentence, textblob):
    """Return the sentiment polarity that ``textblob`` assigns to ``sentence``.

    Args:
        sentence: Text to score.
        textblob: Callable/class that wraps the text in an object exposing
            ``.sentiment.polarity``.

    Returns:
        The value of ``textblob(sentence).sentiment.polarity``.
    """
    return textblob(sentence).sentiment.polarity

In [58]:
# dataset['english_sentence_sentiment'] = dataset['sentences_en'].apply(lambda x: sentiment(x, textblob_en))
# dataset['german_sentence_sentiment'] = dataset['sentences_ge'].apply(lambda x: sentiment(x, textblob_ge))

In [60]:
# dataset['english_sentence_lemma_sentiment'] = dataset['english_lemma'].apply(lambda x: sentiment(x, textblob_en))
# dataset['german_sentence_lemma_sentiment'] = dataset['german_lemma'].apply(lambda x: sentiment(x, textblob_ge))

In [37]:
def stdz(data):
    """Standardise ``data``: subtract its mean, then divide by its standard deviation.

    Args:
        data: Object providing ``mean()`` and ``std()`` and supporting
            arithmetic with their results.

    Returns:
        ``(data - data.mean()) / data.std()``.
    """
    centre = data.mean()
    spread = data.std()
    return (data - centre) / spread

In [59]:
# dataset['std_english_sentence_sentiment']= stdz(dataset['english_sentence_sentiment'])
# dataset['std_german_sentence_sentiment']= stdz(dataset['german_sentence_sentiment'])

#### Get the maximal Sentiment per sentence

In [96]:
# mask_lemm_english = abs(dataset['english_sentence_lemma_sentiment'])>abs(dataset['english_sentence_sentiment'])
# mask_lemm_german = abs(dataset['german_sentence_lemma_sentiment'])>abs(dataset['german_sentence_sentiment'])

In [97]:
# dataset['max_sentiment_english'] = dataset['english_sentence_sentiment']
# dataset['max_sentiment_english'].at[mask_lemm_english] = dataset['english_sentence_lemma_sentiment'][mask_lemm_english]

# dataset['max_sentiment_german']=dataset['german_sentence_sentiment']
# dataset['max_sentiment_german'].at[mask_lemm_german] = dataset['german_sentence_lemma_sentiment'][mask_lemm_german]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [102]:
# dataset['std_max_english_sentiment']= stdz(dataset['max_sentiment_english'])
# dataset['std_max_german_sentiment']= stdz(dataset['max_sentiment_german'])

#### Per POS

#### Store/load dataset

In [8]:
import pandas as pd
# dataset.to_pickle('../data/dataset_v1.pickle')
# Rebind `dataset` to the previously pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
dataset = pd.read_pickle("../data/dataset_v1.pickle")

### Extra Work on the correlations dataset 

In [15]:
# Load the pickled correlations DataFrame; the cells below read its length_ge,
# length_en, non_translated_words, correlation and scores columns.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
dataset_em = pd.read_pickle('../data/dataset_correlations_v1.pickle')

In [128]:
# dataset_em['std_correlations'] = stdz(dataset_em['correlation'])

In [None]:
# dataset_em['english_em_verbs'] = dataset_em['sentences_en_final'].apply(lambda x:extract_pos("VERB",x,nlp_en))
# dataset_em["german_verbs"] = dataset_em['sentences_ge_final'].apply(lambda x:extract_pos("VERB",x,nlp_ge))

In [None]:
# dataset_em["german_adjectives"] = dataset_em.sentences_ge.apply(lambda x:extract_pos("ADJ",x,nlp_ge))
# dataset_em["english_adjectives"] = dataset_em.sentences_en.apply(lambda x:extract_pos("ADJ",x,nlp_en))

In [None]:
# dataset_em["german_adverbs"] = dataset_em.sentences_ge.apply(lambda x:extract_pos("ADV",x,nlp_ge))
# dataset_em["english_adverbs"] = dataset_em.sentences_en.apply(lambda x:extract_pos("ADV",x,nlp_en))

In [None]:
# dataset_em["german_nouns"] = dataset_em.sentences_ge.apply(lambda x:extract_pos("NOUN",x,nlp_ge))
# dataset_em["english_nouns"] = dataset_em.sentences_en.apply(lambda x:extract_pos("NOUN",x,nlp_en))

In [55]:
# Row-wise larger and smaller of the two length columns.
_matched_lengths = dataset_em[['length_ge', 'length_en']]
dataset_em['embedded_words_matched_max'] = _matched_lengths.max(axis=1)
dataset_em['embedded_words_matched_min'] = _matched_lengths.min(axis=1)

In [56]:
# weight = min matched length / (max matched length + non_translated_words)
_weight_denominator = (
    dataset_em['embedded_words_matched_max'] + dataset_em['non_translated_words']
)
dataset_em['weights'] = dataset_em['embedded_words_matched_min'] / _weight_denominator

In [57]:
# Missing weights (e.g. NaN from a 0/0 division when both lengths and the
# non_translated_words count are zero) are treated as zero weight.
dataset_em['weights']=dataset_em['weights'].fillna(0)

In [58]:
# Scale each row's correlation value by its weight.
dataset_em['weighted_corr'] = dataset_em['weights']*dataset_em['correlation']

In [59]:
# Correlation (Series.corr, default method) between the weighted correlation and the scores.
dataset_em['weighted_corr'].corr(dataset_em['scores'])

0.06959344364193254

In [60]:
# Same check for the unweighted correlation column, for comparison.
dataset_em['correlation'].corr(dataset_em['scores'])

0.08979190053718233

In [61]:
# Display the 10 rows with the lowest values in the scores column.
dataset_em.nsmallest(10, 'scores')

Unnamed: 0,sentences_en,sentences_ge,scores,person,sentences_en_no_propnouns,sentences_ge_no_propnouns,sentences_en_clean,sentences_ge_clean,non_translated_words,sentences_en_cleaner,...,sentences_en_final,sentences_ge_final,length_ge,length_en,distance,correlation,embedded_words_matched_max,embedded_words_matched_min,weights,weighted_corr
3917,"The Mummy, A Handbook of Egyptian Funerary Arc...","The Mummy, A Handbook of Egyptian Funerary Arc...",-8.140713,[],"The Mummy, A Handbook of Egyptian Funerary Arc...","The Mummy, A Handbook of Egyptian Funerary Arc...",the mummy a handbook of egyptian funerary arch...,the mummy a handbook of egyptian funerary arch...,5,,...,,,0,0,0,0.615833,0,0,0.0,0.0
6975,He vacated the WWA Cruiserweight title in Apri...,Nach seiner Rückkehr zum WWF verließ er im Apr...,-7.481519,[WWF],He vacated the WWA Cruiserweight title in Apri...,Nach seiner Rückkehr zum verließ er im April ...,he vacated the wwa cruiserweight title in apri...,nach seiner rückkehr zum verließ er im april d...,2,vacated title april returning,...,vacated title april returning,rückkehr verließ april titel,4,4,0,0.587837,4,4,0.666667,0.391891
2080,"Haroun and S.A. Mourad, Proceedings of Interna...","Haroun and S.A. Mourad, Proceedings of Interna...",-7.148534,"[Haroun, S.A. Mourad, Cairo, Egypt]","and , Proceedings of International Colloquium...","and , Proceedings of International Colloquium...",and proceedings of international colloquium on...,and proceedings of international colloquium on...,1,proceedings international structural engineeri...,...,proceedings international structural engineeri...,proceedings international engineering april pp,5,6,1,0.630766,6,5,0.714286,0.450547
6229,Boone Trails Grand Towers Gravois Trail New Ho...,Boone Trails Grand Towers Gravois Trail New Ho...,-7.028717,[Boone Trails Grand Towers Gravois Trail New H...,North Star Osage Trailblazers Trails Thunde...,North Star Osage Trailblazers Trails Thunde...,north star osage trailblazers trails thunderbird,north star osage trailblazers trails thunderbird,3,north star trails bird,...,north star trails bird,north star trails bird,4,4,0,0.570327,4,4,0.571429,0.325901
7033,World Encyclopedia of Police Forces and Penal ...,World Encyclopedia of Police Forces and Straal...,-7.028717,[],World Encyclopedia of Police Forces and Penal ...,World Encyclopedia of Police Forces and Straal...,world encyclopedia of police forces and penal ...,world encyclopedia of police forces and straal...,1,world encyclopedia police penal systems,...,world encyclopedia police penal systems,world encyclopedia police systems,4,5,1,0.569677,5,4,0.666667,0.379785
1846,"Faerie Apocalypse, Jason Franks, IFWG Publishi...","Faerie Apocalypse, Jason Franks, IFWG Publishi...",-6.776486,"[Faerie Apocalypse, Jason Franks, IFWG Publish...",", , .",", , .",,,0,,...,,,0,0,0,0.615833,0,0,0.0,0.0
2163,"Pollock, Griselda, Generations and Geographies...","Pollock, Griselda, Generations and Geographies...",-6.776486,"[Pollock, Griselda, Generations and, the Visua...","Geographies in , , , 1996.","Geographies in , , , 1996.",geographies in,geographies in,1,,...,,,0,0,0,0.615833,0,0,0.0,0.0
2214,Illustrated Encyclopedia of Woodworking Handto...,Illustrated Encyclopedia of Woodworking Handto...,-6.776486,"[Woodworking Handtools, Instruments & Devices]","Illustrated Encyclopedia of , .","Illustrated Encyclopedia of , .",illustrated encyclopedia of,illustrated encyclopedia of,1,encyclopedia,...,encyclopedia,encyclopedia,1,1,0,0.524171,1,1,0.5,0.262085
2576,"""The Great Reform Act of 1832 and British Demo...","""The Great Reform Act of 1832 and British Demo...",-6.776486,[],"""The Great Reform Act of 1832 and British Demo...","""The Great Reform Act of 1832 and British Demo...",the great reform act of and british democratiz...,the great reform act of and british democracy,0,great reform act british democratization,...,great reform act british democratization,great reform act british,4,5,1,0.661122,5,4,0.8,0.528898
2500,"""Radio Birdman"", in The Trouser Press Record G...","""Radio Birdman"", in The Hose Press Record Guid...",-6.629883,[ed],"""Radio Birdman"", in The Trouser Press Record G...","""Radio Birdman"", in The Hose Press Record Guid...",radio birdman in the trouser press record guide,radio birdman in the hose press record guide,2,radio trouser press guide,...,radio trouser press guide,radio hose press guide,4,4,0,0.661558,4,4,0.666667,0.441038


In [63]:
# Save the DataFrame, including the columns added above, under a new v2 file name.
dataset_em.to_pickle('../data/dataset_correlations_v2.pickle')