In [1]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import os

## 1. Getting the data

This section shows the way the built features and the scores are extracted after the sentences have been preprocessed through the 'feature_engineering_pipeline' code.

In [2]:
dataset = pd.read_pickle('data/datasets/dataset_with_features.pickle')

In [3]:
# Selecting the numerical features created
features = dataset[['max_sentiment_english',
                    'max_sentiment_german',
                    'std_max_english_sentiment',
                    'std_max_german_sentiment',
                    'german_sentence_length',
                    'english_sentence_length',
                    'sentence_length_difference',
                    'verbs_diff',
                    'adjectives_diff',
                    'adverbs_diff',
                    'nouns_diff',
                    'non_translated_words',
                    'correlation',
                    'sentence_correlation',
                    'non_match_correlation',
                    'non_translated_ratio']]

In [4]:
# Splitting between the Train/Validation dataset and the test
train_val = features[:8000]
test = features[8000:]

In [5]:
# Setting the features to a list (used for random forest important features)
dataset_features_list = list(train_val.columns)
dataset_features_arr = np.array(train_val)
test_features_arr = np.array(test)

In [6]:
# Collecting the scores for the train and validation datasets
path_train_scores = os.path.join(os.getcwd(), 'data', 'en-de', 'train.ende.scores')
train_scores = pd.read_csv(path_train_scores,header=None)
train_scores = train_scores.rename(columns={0:"scores"})
path_val_scores = os.path.join(os.getcwd(), 'data', 'en-de', 'dev.ende.scores')
val_scores = pd.read_csv(path_val_scores,header=None)
val_scores = val_scores.rename(columns={0:"scores"})
scores = pd.concat([train_scores, val_scores])

dataset_labels_arr = np.array(scores)

## 2. Cross Validation and results

This section shows the two step cross validation for each of the chosen algorithms: PLS, Ridge, SVM, Random Forest, Neural Networks.
The first step consists of a randomized grid search cross validation to provide the best model parameters for each model.
The second step consists of running 500 randomly split cross validations to estimate the distribution in scores of each of the best models. This allows to seelect the best model and then pass it through section 3 to make the test predictions.

In [7]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV

In [None]:
# Lines to standardize the features if desired

# normalised = (dataset_features_arr - dataset_features_arr.min(axis=0))/ \
#              (dataset_features_arr.max(axis=0) - dataset_features_arr.min(axis=0))
# normalised.var(axis=0)
# sel = VarianceThreshold(threshold=0.01)
# dataset_features_arr = sel.fit_transform(normalised)

### 2.1 PLS

In [None]:
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Performing a randomized grid search and cross validation to find the best PLSRegressor model 

n_components = [int(x) for x in np.arange(1,12)]
scale = [True, False]
max_iter = [int(x) for x in np.arange(300,1000)]

random_grid = {'n_components': n_components,
               'scale': scale, 
               'max_iter':max_iter}

pls = PLSRegression()
pls_random = RandomizedSearchCV(estimator = pls, param_distributions = random_grid, 
                                n_iter = 100, cv = 8, random_state=42, scoring='neg_mean_squared_error')
pls_random.fit(dataset_features_arr, dataset_labels_arr)
print(pls_random.best_params_)

#### In the next cell, a cross validation over many randomly created splits are run on each of the best model to provide estimates of the score distribution. 

In [None]:
rs = ShuffleSplit(n_splits=500, test_size=.25, random_state=0)
cv = rs.split(dataset_features_arr)

correlation_pls = []
mae_pls = []
X = dataset_features_arr
y = dataset_labels_arr.reshape(-1)

for train, test in cv:
    pls = PLSRegression(**pls_random.best_params_)
    pls.fit(X[train], y[train])
    predictions = pls.predict(X[test]).reshape(-1)
    pearson = pearsonr(y[test], predictions)[0]
    error = abs(predictions - y[test])
    correlation_pls.append(pearson)
    mae_pls.append(error)
    
print(np.array(correlation_pls).mean())
print(np.array(mae_pls).mean())
print(np.array(correlation_pls).std())

In [None]:
# Create predictions directory if don't exist
predictions_dir = os.path.join(os.getcwd(), 'data', 'imgs')
if not os.path.exists(predictions_dir):
    os.mkdir(predictions_dir)

In [None]:
sns.distplot(correlation_pls)
plt.xlabel('Pearson Correlation')
plt.ylabel('Counts')
plt.savefig('data/imgs/pls.svg', bbox_inches='tight')

### 2.2 Linear Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Performing a randomized grid search and cross validation to find the best Ridge Linear Regression model 

alpha = [x for x in np.linspace(0,1,10)]
normalize = [True, False]
max_iter = [int(x) for x in np.arange(300,1500)]

random_grid = {'alpha': alpha,
               'normalize': normalize, 
               'max_iter':max_iter}

lr = Ridge()
lr_random = RandomizedSearchCV(estimator = lr, param_distributions = random_grid, 
                                n_iter = 100, cv = 8, random_state=42, scoring='neg_mean_squared_error')
lr_random.fit(dataset_features_arr, dataset_labels_arr)
print(lr_random.best_params_)

#### In the next cell, a cross validation over many randomly created splits are run on each of the best model to provide estimates of the score distribution. 

In [None]:
# Making a cross validation over 500 different splits to compute an estimate of the score distribution

rs = ShuffleSplit(n_splits=500, test_size=.25, random_state=0)
cv = rs.split(dataset_features_arr)

correlation_ridge = []
mae_ridge = []
X = dataset_features_arr
y = dataset_labels_arr.reshape(-1)

# for all different split
for train, test in cv:
    rl = Ridge(**lr_random.best_params_)
    rl.fit(X[train], y[train])
    predictions = rl.predict(X[test]).reshape(-1)
    pearson = pearsonr(y[test], predictions)[0]
    error = abs(predictions - y[test])
    correlation_ridge.append(pearson)
    mae_ridge.append(error)

# Print the score of interest
print(np.array(correlation_ridge).mean())
print(np.array(mae_ridge).mean())
print(np.array(correlation_ridge).std())

In [None]:
# Plot and save the distribution
sns.distplot(correlation_ridge)
plt.xlabel('Pearson Correlation')
plt.ylabel('Counts')
plt.savefig('data/imgs/ridge.svg', bbox_inches='tight')

### 2. 3 SVM

In [None]:
from sklearn.svm import SVR

In [None]:
# Performing a randomized grid search and cross validatino to find the best SVM model 

kernel = ('linear', 'poly', 'rbf')
C = [int(x) for x in np.linspace(1, 150, 10)]

random_grid = {'kernel': kernel,
               'C': C}
svm = SVR()
svm_random = RandomizedSearchCV(estimator = svm, param_distributions = random_grid, 
                                n_iter = 10, cv = 2, scoring='neg_mean_squared_error')
svm_random.fit(dataset_features_arr, dataset_labels_arr.ravel())
print(svm_random.best_params_)

#### In the next cell, a cross validation over many randomly created splits are run on each of the best model to provide estimates of the score distribution. 

In [None]:
rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=0)
cv = rs.split(dataset_features_arr)

correlation_svm = []
mae_svm = []
X = dataset_features_arr
y = dataset_labels_arr.reshape(-1)

for train, test in cv
    svm = SVR(**svm_random.best_params_)
    svm.fit(X[train], y[train])
    predictions = svm.predict(X[test]).reshape(-1)
    pearson = pearsonr(y[test], predictions)[0]
    error = abs(predictions - y[test])
    correlation_svm.append(pearson)
    mae_svm.append(error)
    
print(np.array(correlation_svm).mean())
print(np.array(mae_svm).mean())
print(np.array(correlation_svm).std())

sns.distplot(correlation_svm)
plt.xlabel('Pearson Correlation')
plt.ylabel('Counts')
plt.savefig('data/imgs/svr.svg', bbox_inches='tight')

### 2.4 Random Forests

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Performing a randomized grid search and cross validation to find the best random forest model 
# Setting the different parameters

n_estimators = [500, 1000]
max_depth = [1,2]
bootstrap = [True]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'bootstrap':bootstrap}

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 2, cv = 2, random_state=42, scoring='neg_mean_squared_error')
rf_random.fit(dataset_features_arr, dataset_labels_arr.ravel())
print(rf_random.best_params_)
print(rf_random.best_score_)

In [None]:
# Highlighting the most important features of the model
rf = RandomForestRegressor(**{'n_estimators': 1000, 'max_depth': 2, 'bootstrap': True})
rf.fit(dataset_features_arr, dataset_labels_arr.ravel())

importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(dataset_features_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

#### In the next cell, a cross validation over many randomly created splits are run on each of the best model to provide estimates of the score distribution. 

In [None]:
rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=0)
cv = rs.split(dataset_features_arr)

correlation_rf = []
mae_rf = []
X = dataset_features_arr
y = dataset_labels_arr.reshape(-1)

for train, test in cv
    rf = RandomForestRegressor(**rf_random.best_params_)
    rf.fit(X[train], y[train])
    predictions = rf.predict(X[test]).reshape(-1)
    pearson = pearsonr(y[test], predictions)[0]
    error = abs(predictions - y[test])
    correlation_rf.append(pearson)
    mae_rf.append(error)
    
print(np.array(correlation_rf).mean())
print(np.array(mae_rf).mean())
print(np.array(correlation_rf).std())

sns.distplot(correlation_rf)
plt.xlabel('Pearson Correlation')
plt.ylabel('Counts')
plt.savefig('data/imgs/rf.svg', bbox_inches='tight')

### 2.5 Nerual Network

The final model tested were neural networks. The following provides the implementation of the network using Pytorch, as well as a cross validation on the tested architecture. In this section, the full LASER embedding 1024 dimension vectors were included in the feartures.

In [8]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import torch.optim as optim

In [9]:
# Load the laser embeddings daatsets
english_laser = np.load('data/laser_embeddings/laser_1024_english.npy')
german_laser = np.load('data/laser_embeddings/laser_1024_german.npy')

In [10]:
# Select the train and validation features
english_laser_train = english_laser[:8000,:]
german_laser_train = german_laser[:8000,:]

In [11]:
# Get arrays from pandas columns
english_laser_train = np.array(english_laser_train)
german_laser_train = np.array(german_laser_train)

In [12]:
# Get the total features
laser_features = np.concatenate((english_laser_train, german_laser_train), axis=1)
train_val_features = np.concatenate((laser_features, dataset_features_arr), axis = 1)

In [23]:
# Constructing the model

class Net(nn.Module):

    def __init__(self, x):
        super(Net, self).__init__()
        
        x_laser = x[:, :2048]
        x_baseline = x[:, 2048:]
        
        encoding_size = 15
        self.encoder1 = nn.Linear(x_laser.shape[1], 10)
        nn.init.xavier_uniform_(self.encoder1.weight, gain=1.0)
        self.dropout_encoder1 = nn.Dropout(p=0.5)
        
        self.encoder2 = nn.Linear(10, encoding_size)
        nn.init.xavier_uniform_(self.encoder1.weight, gain=1.0)
        self.dropout_encoder2 = nn.Dropout(p=0.5)
        self.bn_encoder = nn.BatchNorm1d(num_features=encoding_size)
        
        x = torch.zeros(x_laser.shape[0], encoding_size + x_baseline.shape[1])
        
        self.fc1 = nn.Linear(x.shape[1], 6) 
        nn.init.xavier_uniform_(self.fc1.weight, gain=1.0)
        self.dropout_fc1 = nn.Dropout(p=0.5)
        
        self.fc2 = nn.Linear(6, 1)
        nn.init.xavier_uniform_(self.fc2.weight, gain=1.0)
        self.dropout_fc2 = nn.Dropout(p=0.5)
        
#         self.fc3 = nn.Linear(3, 1)
#         nn.init.xavier_uniform_(self.fc2.weight, gain=1.0)

    def forward(self, x):
        
        x_laser = x[:, :2048]
        x_baseline = x[:, 2048:]
        
        x_laser = x_laser.view(-1, self.num_flat_features(x_laser))
        x_baseline = x_baseline.view(-1, self.num_flat_features(x_baseline))
        
        x_laser = self.encoder1(x_laser)
        x_laser = self.dropout_encoder1(x_laser)
        x_laser = F.relu(x_laser)
        
        x_laser = self.encoder2(x_laser)
#         x_laser = self.dropout_encoder2(x_laser)
        x_laser = F.relu(x_laser)
        x_laser = self.bn_encoder(x_laser)
        
        
        x = torch.cat((x_laser, x_baseline), dim=1)
        x = x.view(-1, self.num_flat_features(x))
        
        x = self.fc1(x)
#         x = self.dropout_fc1(x)
        x = F.relu(x)

        x = self.fc2(x)
#         x = self.dropout_fc2(x)
#         x = F.relu(x)
        
#         x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net(train_val_features)
print(net)

Net(
  (encoder1): Linear(in_features=2048, out_features=10, bias=True)
  (dropout_encoder1): Dropout(p=0.5, inplace=False)
  (encoder2): Linear(in_features=10, out_features=15, bias=True)
  (dropout_encoder2): Dropout(p=0.5, inplace=False)
  (bn_encoder): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=31, out_features=6, bias=True)
  (dropout_fc1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=6, out_features=1, bias=True)
  (dropout_fc2): Dropout(p=0.5, inplace=False)
)


In [24]:
# Performing cross validation over random splits of the data, and printing the best pearson correlation 
# coefficents obtained during on the test data

rs = ShuffleSplit(n_splits=2, test_size=.25, random_state=0)
cv = rs.split(dataset_features_arr)

for train_idx, val_idx in cv:
    
    # Rededining the model to restars learning
    net = Net(train_val_features)
    
    train_features = train_val_features[train_idx]
    train_labels = dataset_labels_arr[train_idx].reshape(-1)
    
    val_features = train_val_features[val_idx]
    val_labels = dataset_labels_arr[val_idx].reshape(-1)
    
    # Making to tensors as required by Pytorch to propagate the gradients
    train_features_ten, train_labels_ten = Tensor(train_features), Tensor(train_labels.reshape(-1,1))

    # Standardizing the inputs
    means = train_features_ten.mean(dim=0, keepdim=True)
    stds = train_features_ten.std(dim=0, keepdim=True)
    normalized_train = (train_features_ten - means) / stds

    val_features_ten = (Tensor(val_features)- means) / stds
    test_labels = Tensor((val_labels).reshape(-1, 1))

    # Creating the dataloaders to create the batches and iterate over epochs
    train_data = TensorDataset(normalized_train, train_labels_ten)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=32)

#     criterion = nn.MSELoss()
    criterion = nn.SmoothL1Loss()
    optimizer = optim.Adam(net.parameters(), lr=1e-4, betas=(0.5, 0.999))

    test_pearson=[]
    train_pearson=[]

    
    for epoch in range(300):
        running_loss = 0.0
        for batch_idx, data in enumerate(train_loader):
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()

        # No grads to prevent from propagating the gradient from the test data
        with torch.no_grad():
            # Getting the test correlation
            predictions = net(val_features_ten)
            test_loss = criterion(predictions, test_labels).item()
            pearson_test = pearsonr(val_labels, predictions.numpy().reshape(-1,))[0]
            test_pearson.append(pearson_test)

            #train correlation
            predictions = net(normalized_train)
            pearson_train = pearsonr(train_labels, predictions.numpy().reshape(-1,))[0]
            train_pearson.append(pearson_train)
            
            print('Epoch ', epoch, pearson_train, pearson_test)

    print('Test: ', test_pearson.index(max(test_pearson)), max(test_pearson))
    print('Train: ', train_pearson.index(max(train_pearson)), max(train_pearson))
    plt.plot(test_pearson)
    print('Finished Training')

Epoch  0 -0.02944079262970075 -0.025213594014865774
Epoch  1 -0.01175644548889364 -0.03250766463941739
Epoch  2 -0.006735831321859406 -0.016802109737217727
Epoch  3 0.003431444878047189 -0.03207921840964783
Epoch  4 0.02903109773281274 0.011315651638896293
Epoch  5 0.029358323820659355 -0.010061562697622137
Epoch  6 0.038649121502183754 -0.0035826851149515643
Epoch  7 0.04815560239985098 0.019335264359989904
Epoch  8 0.0634790487078665 0.017149663101207052
Epoch  9 0.08135475774907965 0.026436420166071842
Epoch  10 0.08667350717247588 0.041879370720633136
Epoch  11 0.09691185100045482 0.016732541252108935
Epoch  12 0.09752580486872553 0.027291230143670506
Epoch  13 0.09204197968928536 0.049863918965120824
Epoch  14 0.11034144044088914 0.07703609161333533
Epoch  15 0.12526958019262224 0.052577350257110154
Epoch  16 0.12015184313609692 0.03829942497726205
Epoch  17 0.14281559986787365 0.06829813972356705
Epoch  18 0.16183759180127516 0.027158155523315172
Epoch  19 0.14695642149270888 0.0

KeyboardInterrupt: 

## 3. Predictions

This section provides provides the code which trains our best models (ie. the Ridge linear regression and PLS regrssion using the best found hyper parameters) on the full train and validation dataset to predict the test scores.

In [None]:
# Make predictions with the best Ridge regression 
lr_test = Ridge(**{'normalize': True, 'max_iter': 595, 'alpha': 0.2222222222222222})
lr_test.fit(dataset_features_arr, dataset_labels_arr)
predictions = lr_test.predict(test_features_arr).reshape(-1,)

In [None]:
# Making predictions with the best PLS regression 
pls_test = PLSRegression(**{'scale': True, 'n_components': 2, 'max_iter': 339})
pls_test.fit(dataset_features_arr, dataset_labels_arr)
predictions = pls_test.predict(test_features_arr).reshape(-1,)

In [None]:
# Create predictions directory if don't exist
predictions_dir = os.path.join(os.getcwd(), 'data', 'predictions')
if not os.path.exists(predictions_dir):
    os.mkdir(predictions_dir)

In [None]:
# Writing the scores to be tested on Codalab

def writeScores(method_name,scores):
    fn = "data/predictions/predictions.txt"
    print("")
    with open(fn, 'w') as output_file:
        for idx,x in enumerate(scores):
            #out =  metrics[idx]+":"+str("{0:.2f}".format(x))+"\n"
            #print(out)
            output_file.write(f"{x}\n")

writeScores("model_name",predictions)