In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import keras

Using TensorFlow backend.


In [2]:
# Reading data

tweets_xlnet = pd.read_csv('../XLNet/tweets_with_pred_label_xlnet.csv', encoding="utf-8")
tweets_elmo = pd.read_csv('../ELMo/tweets_with_pred_label_elmo.csv', encoding="utf-8")
tweets_bert = pd.read_csv('../BERT/tweets_with_pred_label_bert.csv', encoding="utf-8")
tweets = pd.read_csv('../Data to predict/tweets.csv', encoding="utf-8")

xlnet_labels = list(tweets_xlnet['predicted_sentiment'])
elmo_labels = list(tweets_elmo['predicted_sentiment'])
bert_labels = list(tweets_bert['predicted_sentiment'])

#in this project BERT model drops first instance so we have to drop first instance of tweets, elmo prediction, and xlnet prediction.
tweets_xlnet.drop(tweets_xlnet.index[0], inplace=True)
tweets_elmo.drop(tweets_elmo.index[0], inplace=True)
tweets.drop(tweets.index[0], inplace=True)

xlnet_labels_np = np.asanyarray(xlnet_labels)
elmo_labels_np = np.asanyarray(elmo_labels)
bert_labels_np = np.asanyarray(bert_labels)

print(tweets_elmo.shape)

(749999, 4)


In [3]:
# Defining one-hot-ecoder and decoder

le = preprocessing.LabelEncoder()
le.fit(xlnet_labels)

def encode(labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def decode(one_hot):
    dec = np.argmax(one_hot, axis=1)
    return le.inverse_transform(dec)

In [4]:
# Prioritizing models

xlnet_labels_one_hot = encode(xlnet_labels) * 0.55
elmo_labels_one_hot = encode(elmo_labels) * 0.6
bert_labels_one_hot = encode(bert_labels) * 0.65

In [5]:
# Ensembling

ensembled_result = xlnet_labels_one_hot + elmo_labels_one_hot + bert_labels_one_hot
temp = np.argmax(ensembled_result, axis=1)
ensembled_results = le.inverse_transform(temp)

In [6]:
# Calculate the similarity between result of seperate models and ensembled model

same_in_bert_and_ensembeled = (ensembled_results == bert_labels_np).sum()/ensembled_results.shape[0]
print(f"Bert model labels are {round(same_in_bert_and_ensembeled*100,2)}% similar with ensembled model lables")

same_in_elmo_and_ensembeled = (ensembled_results == elmo_labels_np).sum()/ensembled_results.shape[0]
print(f"Elmo model labels are {round(same_in_elmo_and_ensembeled*100,2)}% similar with ensembled model lables")

same_in_xlnet_and_ensembeled = (ensembled_results == xlnet_labels_np).sum()/ensembled_results.shape[0]
print(f"xlnet model labels are {round(same_in_xlnet_and_ensembeled*100,2)}% similar with ensembled model lables")

Bert model labels are 86.71% similar with ensembled model lables
Elmo model labels are 50.4% similar with ensembled model lables
xlnet model labels are 45.64% similar with ensembled model lables


In [None]:
# Saving predicted labels

tweets['predicted_sentiment'] = ensembled_results
tweets.to_csv('../Data to predict/ensemble_predicted_tweets.csv', index=False, header=True, encoding="utf-8")