In [72]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr

### Getting the data

In [221]:
# Read both pickled frames in a single statement: the v2 correlations table
# and the v1 baseline table.
# (Previously used alternative: '../data/dataset_correlations_v1.pickle'.)
dataset_embeddings, dataset_baseline = (
    pd.read_pickle('../data/dataset_correlations_v2.pickle'),
    pd.read_pickle("../data/dataset_v1.pickle"),
)

In [222]:
# Display the column names available in the embeddings frame.
dataset_embeddings.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'person',
       'sentences_en_no_propnouns', 'sentences_ge_no_propnouns',
       'sentences_en_clean', 'sentences_ge_clean', 'non_translated_words',
       'sentences_en_cleaner', 'sentences_ge_cleaner', 'sentences_en_final',
       'sentences_ge_final', 'length_ge', 'length_en', 'distance',
       'correlation', 'std_correlations'],
      dtype='object')

In [225]:
# Keep only the three embedding-derived columns that are fed to the models.
dataset_embeddings_features = dataset_embeddings.loc[
    :,
    ['non_translated_words', 'distance', 'std_correlations'],
]

In [226]:
# Display the column names available in the baseline frame.
dataset_baseline.columns

Index(['sentences_en', 'sentences_ge', 'scores', 'english_sentence_length',
       'german_sentence_length', 'sentence_length_difference', 'german_verbs',
       'english_verbs', 'german_adjectives', 'english_adjectives',
       'german_adverbs', 'english_adverbs', 'german_nouns', 'english_nouns',
       'english_no_punctuation', 'german_no_punctuation',
       'english_no_stop_words', 'german_no_stop_words', 'english_lemma',
       'german_lemma', 'english_sentence_sentiment',
       'german_sentence_sentiment', 'std_english_sentence_sentiment',
       'std_german_sentence_sentiment', 'english_sentence_lemma_sentiment',
       'german_sentence_lemma_sentiment', 'max_sentiment_english',
       'max_sentiment_german', 'std_max_english_sentiment',
       'std_max_german_sentiment', 'verbs_diff', 'adjectives_diff',
       'adverbs_diff', 'nouns_diff'],
      dtype='object')

In [227]:
# Pick the baseline columns used as model inputs: sentiment statistics,
# the sentence-length difference and the per-POS count differences.
dataset_baseline_features = dataset_baseline.loc[
    :,
    [
        'std_max_english_sentiment',
        'std_max_german_sentiment',
        'max_sentiment_english',
        'max_sentiment_german',
        'sentence_length_difference',
        'verbs_diff',
        'adjectives_diff',
        'adverbs_diff',
        'nouns_diff',
    ],
]

In [228]:
# Place the baseline and embedding feature columns side by side.
# NOTE(review): column-wise concat aligns rows on index labels; this assumes
# both frames share the same index — TODO confirm.
dataset_features = pd.concat(
    [dataset_baseline_features, dataset_embeddings_features],
    axis="columns",
)

In [229]:
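# Disabled experiment: restrict the model inputs to two features.
# NOTE: 'correlation' is not among the columns concatenated into
# dataset_features (only 'std_correlations' is), so the line below would
# raise a KeyError if re-enabled as written.
# dataset_features = dataset_features[['correlation','non_translated_words']]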

In [230]:
# Display the combined feature frame (the rendered output is truncated by pandas).
dataset_features

Unnamed: 0,std_max_english_sentiment,std_max_german_sentiment,max_sentiment_english,max_sentiment_german,sentence_length_difference,verbs_diff,adjectives_diff,adverbs_diff,nouns_diff,non_translated_words,distance,std_correlations
0,-0.186145,-0.007065,0.000000,0.0,1,0,0,0,0,0,0,-1.120192
1,-1.661481,-2.422978,-0.304167,-1.0,-1,0,0,1,1,0,0,0.043681
2,-0.186145,-0.007065,0.000000,0.0,0,0,0,0,1,0,1,0.199028
3,-1.156229,-0.007065,-0.200000,0.0,0,0,0,0,0,4,-1,0.123885
4,-0.186145,-0.007065,0.000000,0.0,0,1,0,1,1,0,3,-0.377963
...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.186145,-0.007065,0.000000,0.0,0,1,2,0,3,2,0,-0.383984
7996,-0.186145,-0.007065,0.000000,0.0,1,0,1,0,1,0,1,0.797347
7997,-1.520010,-1.698204,-0.275000,-0.7,-2,2,0,0,2,3,2,-0.347143
7998,-0.186145,-0.007065,0.000000,0.0,1,0,0,0,1,0,0,0.498618


In [231]:
# Dense array of the feature values for scikit-learn, plus the matching
# column names (same order) for reporting feature importances later.
dataset_features_arr = np.array(dataset_features)
dataset_features_list = [column for column in dataset_features.columns]

In [232]:
# Regression target: the 'scores' column of the embeddings frame, kept both
# as a Series and as a plain NumPy array.
dataset_labels = dataset_embeddings.loc[:, 'scores']
dataset_labels_arr = np.array(dataset_labels)

### Splitting Train and Validation

In [233]:
# Positional hold-out split: the first 7000 rows train the models, the
# remaining rows are used for validation. Features and labels are cut at the
# same position so rows stay paired.
train_features, val_features = dataset_features_arr[:7000], dataset_features_arr[7000:]
train_labels, val_labels = dataset_labels_arr[:7000], dataset_labels_arr[7000:]

# Alternative fold (disabled): validate on rows 6000:7000, train on the rest.
# train_features = np.concatenate((dataset_features_arr[:6000], dataset_features_arr[7000:]))
# train_labels = np.concatenate((dataset_labels_arr[:6000], dataset_labels_arr[7000:]))
# val_features = dataset_features_arr[6000:7000]
# val_labels = dataset_labels_arr[6000:7000]

In [234]:
# Sanity check: report the shape of every array produced by the split.
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', val_features),
    ('Testing Labels Shape:', val_labels),
):
    print(caption, array.shape)

Training Features Shape: (7000, 12)
Training Labels Shape: (7000,)
Testing Features Shape: (1000, 12)
Testing Labels Shape: (1000,)


In [235]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Parameters
    ----------
    predictions, targets : array-like
        Numeric values of matching (or broadcastable) shape. NumPy arrays
        as well as plain Python sequences (lists, tuples) are accepted.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((predictions - targets) ** 2))``. The result is NaN
        when the inputs are empty.
    """
    # Coerce to float arrays first: ``list - list`` raises TypeError, and
    # unsigned / small integer arrays would wrap around on subtraction or
    # when squared.
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))

### Random Forest

In [236]:
# Shallow random forest (1000 trees, depth 2) with a fixed seed; fit() returns
# the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(
    max_depth=2,
    n_estimators=1000,
    random_state=666,
).fit(train_features, train_labels)

# Evaluate on the validation slice.
predictions = rf.predict(val_features)
errors = np.abs(predictions - val_labels)
pearson = pearsonr(val_labels, predictions)

print('RMSE:', rmse(predictions,val_labels))
print(f"Pearson {pearson[0]}")
print('Mean Absolute Error:', round(errors.mean(), 4))

RMSE: 0.8603439155704071
Pearson 0.0909183812729866
Mean Absolute Error: 0.5216


In [237]:
# Pair each feature name with its forest importance (rounded to 2 decimals),
# order the pairs from most to least important, and print one line per feature.
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((name, round(weight, 2)) for name, weight in zip(dataset_features_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: std_correlations     Importance: 0.67
Variable: non_translated_words Importance: 0.14
Variable: distance             Importance: 0.08
Variable: nouns_diff           Importance: 0.05
Variable: std_max_english_sentiment Importance: 0.02
Variable: max_sentiment_english Importance: 0.02
Variable: sentence_length_difference Importance: 0.01
Variable: verbs_diff           Importance: 0.01
Variable: std_max_german_sentiment Importance: 0.0
Variable: max_sentiment_german Importance: 0.0
Variable: adjectives_diff      Importance: 0.0
Variable: adverbs_diff         Importance: 0.0


### SVM

In [238]:
# Fit one SVR per kernel and report the same three validation metrics as for
# the random forest.
# NOTE(review): the features are passed to SVR unscaled; the recorded sigmoid
# run has an RMSE of ~120, far off the other kernels — consider standardising
# the inputs before fitting.
for k in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf_t = SVR(kernel=k).fit(train_features, train_labels)
    print(k)
    predictions = clf_t.predict(val_features)
    errors = np.abs(predictions - val_labels)
    pearson = pearsonr(val_labels, predictions)
    print(f'RMSE: {rmse(predictions,val_labels)}')
    print(f'Pearson {pearson[0]}')
    print('Mean Absolute Error:', round(errors.mean(), 4))
    print()

linear
RMSE: 0.8765473527399004
Pearson 0.1300191766974358
Mean Absolute Error: 0.4876

poly
RMSE: 0.8839171969860865
Pearson 0.03526143522827664
Mean Absolute Error: 0.4972

rbf
RMSE: 0.8826129098984249
Pearson 0.05280393944950446
Mean Absolute Error: 0.5009

sigmoid
RMSE: 120.4722536598051
Pearson 0.02583690688614004
Mean Absolute Error: 54.2065

