In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr

### Getting the data

In [32]:
# Load the three precomputed datasets from ../data.
# pd.read_pickle can execute arbitrary code while unpickling, so these paths
# must only ever point at trusted, locally produced files.
dataset_baseline = pd.read_pickle('../data/dataset_v1.pickle')
# An alternative v1 correlations file ('../data/dataset_correlations_v1.pickle')
# is left disabled; the v2 file is the one loaded here.
dataset_embeddings = pd.read_pickle('../data/dataset_correlations_v2.pickle')
dataset_laser = pd.read_pickle('../data/dataset_corrleations_laser.pickle')

In [33]:
# Inspect which columns the embeddings dataset provides before selecting features.
dataset_embeddings.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'person',
       'sentences_en_no_propnouns', 'sentences_ge_no_propnouns',
       'sentences_en_clean', 'sentences_ge_clean', 'non_translated_words',
       'sentences_en_cleaner', 'sentences_ge_cleaner', 'sentences_en_final',
       'sentences_ge_final', 'length_ge', 'length_en', 'distance',
       'correlation', 'embedded_words_matched_max',
       'embedded_words_matched_min', 'weights', 'weighted_corr'],
      dtype='object')

In [69]:
# Embedding-derived feature columns. Each label is listed exactly once: a
# repeated label would produce two identically named, identical columns, which
# makes dataset_embeddings_features['weighted_corr'] return a DataFrame instead
# of a Series and feeds the same feature to the models twice.
dataset_embeddings_features = dataset_embeddings[['non_translated_words',
                                                 'distance',
                                                 'weighted_corr',
                                                 'correlation']]

In [70]:
# Inspect which columns the baseline dataset provides before selecting features.
dataset_baseline.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'english_sentence_length',
       'german_sentence_length', 'sentence_length_difference', 'german_verbs',
       'english_verbs', 'german_adjectives', 'english_adjectives',
       'german_adverbs', 'english_adverbs', 'german_nouns', 'english_nouns',
       'english_no_punctuation', 'german_no_punctuation',
       'english_no_stop_words', 'german_no_stop_words', 'english_lemma',
       'german_lemma', 'english_sentence_sentiment',
       'german_sentence_sentiment', 'std_english_sentence_sentiment',
       'std_german_sentence_sentiment', 'english_sentence_lemma_sentiment',
       'german_sentence_lemma_sentiment', 'max_sentiment_english',
       'max_sentiment_german', 'std_max_english_sentiment',
       'std_max_german_sentiment', 'verbs_diff', 'adjectives_diff',
       'adverbs_diff', 'nouns_diff'],
      dtype='object')

In [71]:
# Hand-picked baseline columns: by name, these are per-language sentiment
# statistics plus sentence-length and part-of-speech count differences.
baseline_feature_columns = [
    'std_max_english_sentiment', 'std_max_german_sentiment',
    'max_sentiment_english', 'max_sentiment_german',
    'sentence_length_difference',
    'verbs_diff', 'adjectives_diff', 'adverbs_diff', 'nouns_diff',
]
dataset_baseline_features = dataset_baseline[baseline_feature_columns]

In [72]:
# Inspect which columns the LASER dataset provides before selecting features.
dataset_laser.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'person',
       'sentences_en_no_propnouns', 'sentences_ge_no_propnouns',
       'sentences_en_clean', 'sentences_ge_clean', 'non_translated_words',
       'sentences_en_cleaner', 'sentences_ge_cleaner', 'sentences_en_final',
       'sentences_ge_final', 'length_ge', 'length_en', 'distance',
       'correlation', 'std_correlations', 'sentence_correlation'],
      dtype='object')

In [73]:
# Keep the single LASER-derived column as a one-column DataFrame (not a
# Series) so it can be concatenated column-wise with the other feature frames.
dataset_laser_features = dataset_laser.loc[:, ['sentence_correlation']]

In [88]:
# Place the three feature groups side by side; pd.concat matches rows on the index.
feature_blocks = [
    dataset_baseline_features,
    dataset_embeddings_features,
    dataset_laser_features,
]
dataset_features = pd.concat(feature_blocks, axis='columns')

In [89]:
# Narrow the combined frame to the columns that are actually passed to the models.
selected_columns = ['correlation', 'non_translated_words', 'sentence_correlation']
dataset_features = dataset_features.loc[:, selected_columns]

In [76]:
# Show the assembled feature frame.
# NOTE(review): the recorded output lists 'weighted_corr' columns, which does
# not match the ['correlation', 'non_translated_words', 'sentence_correlation']
# selection made just before this cell; re-run the notebook top to bottom to refresh it.
dataset_features

Unnamed: 0,weighted_corr,weighted_corr.1,non_translated_words,sentence_correlation
0,0.518761,0.518761,0,0.938219
1,0.619618,0.619618,0,0.910342
2,0.527567,0.527567,0,0.932145
3,0.278475,0.278475,4,0.934417
4,0.458134,0.458134,0,0.907760
...,...,...,...,...
7995,0.485465,0.485465,2,0.966300
7996,0.587081,0.587081,0,0.914410
7997,0.376554,0.376554,3,0.926388
7998,0.659042,0.659042,0,0.960786


In [77]:
# Keep a NumPy copy of the feature values for the estimators, and the column
# names in the same order so per-feature results can be labelled.
dataset_features_arr = dataset_features.to_numpy(copy=True)
dataset_features_list = dataset_features.columns.tolist()

In [78]:
# Regression targets come from the 'scores' column of the embeddings dataset.
# NOTE(review): assumes its rows line up one-to-one with the feature rows — confirm.
dataset_labels = dataset_embeddings.loc[:, 'scores']
dataset_labels_arr = dataset_labels.to_numpy(copy=True)

### Splitting Train and Validation

In [79]:
# Positional hold-out split (no shuffling): the first 7000 rows are used for
# training and everything after them for validation.
train_features, val_features = np.split(dataset_features_arr, [7000])
train_labels, val_labels = np.split(dataset_labels_arr, [7000])

# Alternative fold, left disabled: validate on rows 6000-6999 and train on
# the remaining rows (0-5999 concatenated with 7000 onwards).

In [80]:
# Sanity-check the shapes produced by the split.
for split_name, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', val_features),
    ('Testing Labels Shape:', val_labels),
):
    print(split_name, split_array.shape)

Training Features Shape: (7000, 4)
Training Labels Shape: (7000,)
Testing Features Shape: (1000, 4)
Testing Labels Shape: (1000,)


In [81]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both inputs are converted to float64 arrays before they are subtracted.
    This lets plain Python sequences be passed in, and prevents narrow or
    unsigned integer arrays from wrapping around when the differences are
    taken and squared (which would silently produce a wrong error value).
    Values are compared positionally, with normal NumPy broadcasting.

    Parameters
    ----------
    predictions : array-like
        Predicted values.
    targets : array-like
        Reference values, broadcastable against ``predictions``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((predictions - targets) ** 2))``; ``nan`` for empty input.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    return np.sqrt(np.mean((predictions - targets) ** 2))

### Random Forest

In [82]:
# Fit a forest of 1000 depth-2 trees with a fixed seed, then score it on the
# held-out rows with RMSE, Pearson correlation and mean absolute error.
rf = RandomForestRegressor(n_estimators=1000, max_depth=2, random_state=666)
rf.fit(train_features, train_labels)

predictions = rf.predict(val_features)
errors = np.abs(predictions - val_labels)
pearson = pearsonr(val_labels, predictions)

print(f'RMSE: {rmse(predictions, val_labels)}')
print('Pearson', pearson[0])
print(f'Mean Absolute Error: {round(np.mean(errors), 4)}')

RMSE: 0.8590193766171741
Pearson 0.10737972547221139
Mean Absolute Error: 0.5213


In [83]:
# Report the forest's feature importances, most important first.
importances = list(rf.feature_importances_)
# Rank on the raw importances and round only for display: sorting values that
# were already rounded to 2 decimals would order near-equal features arbitrarily.
ranked_importances = sorted(
    zip(dataset_features_list, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
# Same shape as before: a list of (feature name, importance rounded to 2 dp).
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_importances]
# A plain loop instead of a list comprehension used only for its print side effect.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: sentence_correlation Importance: 0.62
Variable: non_translated_words Importance: 0.17
Variable: weighted_corr        Importance: 0.11
Variable: weighted_corr        Importance: 0.1


### SVM

In [84]:
# Fit one SVR per kernel (default hyper-parameters otherwise) and print the
# same three validation metrics for each so the kernels can be compared.
for k in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf_t = SVR(kernel=k)
    clf_t.fit(train_features, train_labels)
    print(k)

    predictions = clf_t.predict(val_features)
    errors = np.abs(predictions - val_labels)
    pearson = pearsonr(val_labels, predictions)

    print('RMSE:', rmse(predictions, val_labels))
    print('Pearson', pearson[0])
    print(f'Mean Absolute Error: {round(np.mean(errors), 4)}')
    print()

linear
RMSE: 0.8781714603894802
Pearson 0.09125672746481546
Mean Absolute Error: 0.4906

poly
RMSE: 0.8759406511439161
Pearson 0.12691700084650853
Mean Absolute Error: 0.4891

rbf
RMSE: 0.8767300579884044
Pearson 0.10712159566033594
Mean Absolute Error: 0.4896

sigmoid
RMSE: 14.054784759725278
Pearson -0.059415335134824276
Mean Absolute Error: 10.4566

