In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr

### Getting the data

In [11]:
# dataset_embeddings = pd.read_pickle('../data/dataset_correlations_v1.pickle')
dataset_embeddings = pd.read_pickle('../data/dataset_correlations_v2.pickle')
dataset_baseline = pd.read_pickle('../data/dataset_v1.pickle')
dataset_laser = pd.read_pickle('../data/dataset_corrleations_laser.pickle')

In [12]:
dataset_embeddings.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'person',
       'sentences_en_no_propnouns', 'sentences_ge_no_propnouns',
       'sentences_en_clean', 'sentences_ge_clean', 'non_translated_words',
       'sentences_en_cleaner', 'sentences_ge_cleaner', 'sentences_en_final',
       'sentences_ge_final', 'length_ge', 'length_en', 'distance',
       'correlation', 'embedded_words_matched_max',
       'embedded_words_matched_min', 'weights', 'weighted_corr'],
      dtype='object')

In [13]:
dataset_embeddings.head()

Unnamed: 0,sentences_en,sentences_ge,scores,person,sentences_en_no_propnouns,sentences_ge_no_propnouns,sentences_en_clean,sentences_ge_clean,non_translated_words,sentences_en_cleaner,...,sentences_en_final,sentences_ge_final,length_ge,length_en,distance,correlation,embedded_words_matched_max,embedded_words_matched_min,weights,weighted_corr
0,José Ortega y Gasset visited Husserl at Freibu...,1934 besuchte José Ortega y Gasset Husserl in ...,1.101697,"[José Ortega y Gasset, Husserl, Freiburg]",visited at in 1934.,1934 besuchte in .,visited at in,besuchte in,0,visited,...,visited,besuchte,1,1,0,0.518761,1,1,1.0,0.518761
1,"However, a disappointing ninth in China meant ...",Eine enttäuschende Neunte in China bedeutete j...,-0.516656,[China],"However, a disappointing ninth in meant that ...",Eine enttäuschende Neunte in bedeutete jedoch...,however a disappointing ninth in meant that he...,eine enttäuschende neunte in bedeutete jedoch ...,0,however disappointing ninth meant dropped back...,...,however disappointing ninth meant dropped back...,enttäuschende neunte bedeutete jedoch gesamtwe...,8,8,0,0.619618,8,8,1.0,0.619618
2,"In his diary, Chase wrote that the release of ...","In seinem Tagebuch, Chase schrieb, dass die Ve...",-2.226388,"[Chase, Mason, Slidell]","In his diary, wrote that the release of and ...","In seinem Tagebuch, schrieb, dass die Veröffe...",in his diary wrote that the release of and was...,in seinem tagebuch schrieb dass die veröffentl...,0,diary wrote release like gall wormwood,...,diary wrote release like gall wormwood,tagebuch schrieb veröffentlichung galle wermut,5,6,1,0.63308,6,5,0.833333,0.527567
3,Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,-0.827379,[],Heavy arquebuses mounted on wagons were called...,Schwere Arquebuses auf Waggons montiert wurden...,heavy arquebuses mounted on wagons were called...,schwere arquebuses auf waggons montiert wurden...,4,heavy mounted wagons called,...,heavy mounted wagons called,schwere waggons montiert wurden genannt,5,4,-1,0.626568,5,4,0.444444,0.278475
4,Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,0.364695,[],Once North Pacific salmon die off after spawni...,Sobald der nordpazifische Lachs nach dem Laich...,once north pacific salmon die off after spawni...,sobald der nordpazifische lachs nach dem laich...,0,north pacific salmon die spawning usually loca...,...,north pacific salmon die spawning usually loca...,sobald lachs laichen abstirbt fressen regel lo...,11,14,3,0.58308,14,11,0.785714,0.458134


In [14]:
dataset_embeddings_features = dataset_embeddings[['non_translated_words',
                                                 'distance',
                                                 'correlation',
                                                 'weights']]

In [15]:
dataset_baseline.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'english_sentence_length',
       'german_sentence_length', 'sentence_length_difference', 'german_verbs',
       'english_verbs', 'german_adjectives', 'english_adjectives',
       'german_adverbs', 'english_adverbs', 'german_nouns', 'english_nouns',
       'english_no_punctuation', 'german_no_punctuation',
       'english_no_stop_words', 'german_no_stop_words', 'english_lemma',
       'german_lemma', 'english_sentence_sentiment',
       'german_sentence_sentiment', 'std_english_sentence_sentiment',
       'std_german_sentence_sentiment', 'english_sentence_lemma_sentiment',
       'german_sentence_lemma_sentiment', 'max_sentiment_english',
       'max_sentiment_german', 'std_max_english_sentiment',
       'std_max_german_sentiment', 'verbs_diff', 'adjectives_diff',
       'adverbs_diff', 'nouns_diff'],
      dtype='object')

In [16]:
pd.options.display.width=None
dataset_baseline['german_adjectives'].sample(40)

4717                                                     
5761                                                leere
471                                                Zivile
3540                                                     
2860    Tatsächlich breiten populistischen amerikanischer
1015                                        Zentrale kurz
3716                                          demontierte
3630                                              weitere
4957                                                     
3928                                       amerikanischer
2368                                          nordöstlich
392                                                      
4980                                                     
1964                                                     
3701                                   fünfzehnte rechten
478                                                      
7747                                                     
3227          

In [17]:
dataset_baseline_features = dataset_baseline[['english_sentence_length',
                                              'std_max_english_sentiment',
                                              'german_sentence_length',
                                              'std_max_german_sentiment',
                                              'max_sentiment_english',
                                              'max_sentiment_german',
                                              'sentence_length_difference',
                                              'verbs_diff', 
                                              'adjectives_diff',
                                              'adverbs_diff',
                                              'nouns_diff'
                                             ]]

In [18]:
dataset_laser.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'person',
       'sentences_en_no_propnouns', 'sentences_ge_no_propnouns',
       'sentences_en_clean', 'sentences_ge_clean', 'non_translated_words',
       'sentences_en_cleaner', 'sentences_ge_cleaner', 'sentences_en_final',
       'sentences_ge_final', 'length_ge', 'length_en', 'distance',
       'correlation', 'std_correlations', 'sentence_correlation'],
      dtype='object')

In [19]:
dataset_laser_features = dataset_laser[['sentence_correlation']]

In [20]:
dataset_features = pd.concat((dataset_baseline_features, dataset_embeddings_features,
                              dataset_laser_features),axis=1)

In [40]:
dataset_features = dataset_features[['correlation','non_translated_words']]

In [41]:
dataset_features

Unnamed: 0,correlation,non_translated_words
0,0.518761,0
1,0.619618,0
2,0.633080,0
3,0.626568,4
4,0.583080,0
...,...,...
7995,0.582558,2
7996,0.684928,0
7997,0.585751,3
7998,0.659042,0


In [42]:
dataset_features_list = list(dataset_features.columns)
dataset_features_arr = np.array(dataset_features)

In [43]:
dataset_labels = dataset_embeddings['scores']
dataset_labels_arr = np.array(dataset_labels)

### Splitting Train and Validation

In [44]:
# train_features = dataset_features_arr[:7000]
# train_labels = dataset_labels_arr[:7000]

# val_features = dataset_features_arr[7000:]
# val_labels = dataset_labels_arr[7000:]

train_features = np.concatenate((dataset_features_arr[:6000], dataset_features_arr[7000:]))
train_labels = np.concatenate((dataset_labels_arr[:6000], dataset_labels_arr[7000:]))

val_features = dataset_features_arr[6000:7000]
val_labels = dataset_labels_arr[6000:7000]

In [45]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', val_features.shape)
print('Testing Labels Shape:', val_labels.shape)

Training Features Shape: (7000, 2)
Training Labels Shape: (7000,)
Testing Features Shape: (1000, 2)
Testing Labels Shape: (1000,)


In [46]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

### Random Forest

In [48]:
for max_depth in range(1,8):
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 666, max_depth=max_depth)
    rf.fit(train_features, train_labels);
    predictions = rf.predict(val_features)
    pearson = pearsonr(val_labels, predictions)
    errors = abs(predictions - val_labels)
    print('Max Depth', max_depth)
    print('RMSE:', rmse(predictions,val_labels))
    print(f"Pearson {pearson[0]}")
    print('Mean Absolute Error:', round(np.mean(errors), 4))
    print()

Max Depth 6
RMSE: 0.8589019911175123
Pearson 0.10972793723320023
Mean Absolute Error: 0.5194

Max Depth 7
RMSE: 0.8597558268113434
Pearson 0.10490388283102567
Mean Absolute Error: 0.5203

Max Depth 8
RMSE: 0.860927066160044
Pearson 0.09938762746855294
Mean Absolute Error: 0.5214

Max Depth 9
RMSE: 0.8628274311378396
Pearson 0.0917707464901201
Mean Absolute Error: 0.5231



In [28]:
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(dataset_features_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: sentence_correlation Importance: 0.27
Variable: correlation          Importance: 0.22
Variable: non_translated_words Importance: 0.12
Variable: english_sentence_length Importance: 0.07
Variable: german_sentence_length Importance: 0.05
Variable: std_max_english_sentiment Importance: 0.04
Variable: max_sentiment_english Importance: 0.04
Variable: nouns_diff           Importance: 0.04
Variable: distance             Importance: 0.04
Variable: weights              Importance: 0.04
Variable: verbs_diff           Importance: 0.02
Variable: std_max_german_sentiment Importance: 0.01
Variable: max_sentiment_german Importance: 0.01
Variable: sentence_length_difference Importance: 0.01
Variable: adjectives_diff      Importance: 0.01
Variable: adverbs_diff         Importance: 0.0


### SVM

In [29]:
for k in ['linear','poly','rbf','sigmoid']:
    clf_t = SVR(kernel=k)
    clf_t.fit(train_features, train_labels)
    print(k)
    predictions = clf_t.predict(val_features)
    pearson = pearsonr(val_labels, predictions)
    errors = abs(predictions - val_labels)
    print(f'RMSE: {rmse(predictions,val_labels)}')
    print(f'Pearson {pearson[0]}')
    print('Mean Absolute Error:', round(np.mean(errors), 4))
    print()

linear
RMSE: 0.872707092652995
Pearson 0.1485700358409105
Mean Absolute Error: 0.4844

poly
RMSE: 0.8770801165456373
Pearson 0.09819970342325901
Mean Absolute Error: 0.4881

rbf
RMSE: 0.8770824454598339
Pearson 0.1019492145778241
Mean Absolute Error: 0.4872

sigmoid
RMSE: 117.05570695945646
Pearson -0.07324938173141567
Mean Absolute Error: 83.2121



### Neural Network

In [30]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import torch.optim as optim

In [31]:
class Net(nn.Module):

    def __init__(self, x):
        super(Net, self).__init__()
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(x.shape[1], 4) 
        nn.init.xavier_uniform_(self.fc1.weight, gain=1.0)
        self.fc2 = nn.Linear(4, 2)
        nn.init.xavier_uniform_(self.fc2.weight, gain=1.0)
        self.fc3 = nn.Linear(2, 1)
        nn.init.xavier_uniform_(self.fc2.weight, gain=1.0)

    def forward(self, x):
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net(train_features)
print(net)

Net(
  (fc1): Linear(in_features=16, out_features=4, bias=True)
  (fc2): Linear(in_features=4, out_features=2, bias=True)
  (fc3): Linear(in_features=2, out_features=1, bias=True)
)


In [32]:
train_features_ten, train_labels_ten = Tensor(train_features), Tensor(train_labels.reshape(-1,1))

In [33]:
means = train_features_ten.mean(dim=0, keepdim=True)
stds = train_features_ten.std(dim=0, keepdim=True)
normalized_train = (train_features_ten - means) / stds

In [34]:
val_features_ten = (Tensor(val_features)- means) / stds
test_labels = Tensor((val_labels).reshape(-1, 1))

In [35]:
train_data = TensorDataset(normalized_train, train_labels_ten)
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)

In [36]:
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3, betas=(0.5, 0.999))

In [37]:
for epoch in range(40):
    running_loss = 0.0
    for batch_idx, data in enumerate(train_loader):
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
    with torch.no_grad():
        predictions = net(val_features_ten)
        test_loss = criterion(predictions, test_labels).item()
        pearson = pearsonr(val_labels, predictions.numpy().reshape(-1,))[0]
        print('Test Epoch: ', epoch, " ", pearson)
        
        #train correlation
        predictions = net(normalized_train)
        pearson = pearsonr(train_labels, predictions.numpy().reshape(-1,))[0]
        print('Train Epoch: ', epoch, " ", pearson)
        
#     print('Train loss', epoch, running_loss)
#     print('Test loss', epoch, test_loss)
print('Finished Training')

Test Epoch:  0   -0.035912439776535826
Train Epoch:  0   -0.009848891028516292
Test Epoch:  1   -0.02448051079446147
Train Epoch:  1   -0.0028065159247310473
Test Epoch:  2   -0.006077330332305691
Train Epoch:  2   0.01265311443772594
Test Epoch:  3   0.01549981453290309
Train Epoch:  3   0.033033111672348354
Test Epoch:  4   0.04045530992625532
Train Epoch:  4   0.05473110857656334
Test Epoch:  5   0.06007420435868921
Train Epoch:  5   0.07010967271360326
Test Epoch:  6   0.08049852225648294
Train Epoch:  6   0.08465015512908337
Test Epoch:  7   0.08669786421597145
Train Epoch:  7   0.09410430353974464
Test Epoch:  8   0.09606216354017541
Train Epoch:  8   0.1024164968229119
Test Epoch:  9   0.09808805085023252
Train Epoch:  9   0.10934727057539548
Test Epoch:  10   0.1026454269717271
Train Epoch:  10   0.11638492884978263
Test Epoch:  11   0.1016423847069589
Train Epoch:  11   0.12113895371286013
Test Epoch:  12   0.10013760398659262
Train Epoch:  12   0.12500286515840692
Test Epoch:

In [38]:
net.eval()
predictions = net(val_features_ten).detach().numpy().reshape(-1,)

In [39]:
pearson = pearsonr(val_labels, predictions)
errors = abs(predictions - val_labels)
print(f'RMSE: {rmse(predictions,val_labels)}')
print(f'Pearson {pearson[0]}')
print('Mean Absolute Error:', round(np.mean(errors), 4))
print()

RMSE: 0.8615368467116323
Pearson 0.09564195685774426
Mean Absolute Error: 0.5158

