### 10 Emoji Prediction

The task of this project is to make a system that would automatically fill the text with the appropriate emoticons. This can be done in two steps. First, for each position within the text a prediction is made whether an emoticon should be placed there. Second, an appropriate emoticon is chosen from a list of available emoticons. Both these tasks can be set up as supervised classification problems.

Competition website:
https://competitions.codalab.org/competitions/17344

Dataset:
https://competitions.codalab.org/competitions/17344

Entry point:
https://arxiv.org/pdf/1702.07285.pdf (Barbieri, Francesco, Miguel Ballesteros, and Horacio Saggion. Are Emojis Predictable?)

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Define paths to folders containing data and results.

In [2]:
# NUMBER_OF_TWEETS = 100000

MAX_WORDS_PER_TWEET = 30
DATA_LOCATION = "./train/data/"
RESULT_LOCATION = "./result/"
TWEET_FILE_NAME = "tweet_by_ID_28_4_2018__03_20_05" + "_"

if NUMBER_OF_TWEETS is not None:
    TWEET_FILE_NAME += str(NUMBER_OF_TWEETS)
else:
    TWEET_FILE_NAME += "ALL"

### Load the data
Tweets are loaded in two ways: list of strings (for the TF-IDF vectorizer) and a list of list of words (for feature extraction). Labels are read as a numpy array of N * MAX_WORDS_PER_TWEET dimensions.

In [3]:
base_file_name = DATA_LOCATION + TWEET_FILE_NAME

text_lines = []
text_lines_split = []

with open(base_file_name + ".text", 'r', encoding="utf-8") as out_text:
    for line in out_text:
        text_lines.append(line[:-1])
        text_lines_split.append(line[:-1].split())
        
loc_lines = []
with open(base_file_name + ".loclabels", 'r') as loc_labels:
    for line in loc_labels:
        loc_line = []
        for c in line[:-1]:
            loc_line.append(int(c))
        loc_lines.append(loc_line)

loc_lines = np.asarray(loc_lines)

# full_text = open(base_file_name + ".full", 'r')
# emoji_labels = open(base_file_name + ".emolabels", 'r')
# emoji_ids = open(base_file_name + ".ids", 'r')

print(f"example of tweet texts:\n{text_lines[:5]}\n")
print(f"example of labels (emoji locations):\n{loc_lines[:5]}")

example of tweet texts:
['lol west covina california', 'things got a little festive at the office christmas2016 redrock', 'step out and explore ellis island cafe', 'rupauls drag race bingo fun drag queens be sexy rupaulsdragrace user abwyman la', 'just light makeup blueeyes lupusgirl photography modelingagency modeling smiling']

example of labels (emoji locations):
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### Feature Extraction
TF-IDF is computed on the collection of tweets. Then for every position between words a new example is generated: a 2 * k array containing the k left and k right tfidf values of words. Labels are taken as 1 or 0 wether an emoji was there in the original tweet.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import gensim

tfidf_model = TfidfVectorizer(input="content", analyzer="word", stop_words="english")
X_tfidf = tfidf_model.fit_transform(text_lines)

word_to_tfidf_index_dict = {}
for i, word in enumerate(tfidf_model.get_feature_names()):
    word_to_tfidf_index_dict[word] = i



print(f"X shape {X_tfidf.shape}")
print(f"Y shape {loc_lines.shape}")
print(f"some tf-idf values\n{X_tfidf[0]}\n")

def neighbor_features(tweets, labels, k, func):
    N = len(tweets)
    X = []
    y = []
    
    for tweet_index, (tweet, label) in enumerate(zip(tweets, labels)):
        for pos in range(len(tweet) + 1):
            
#             if label[pos] == 0 and random.random() > 0.20:
#                 continue
                
            x = []
            for i in range(pos - k, pos + k):
                if i < 0 or i >= len(tweet):
                    x.append(0.0)
                else:
                    x.append(func(tweet_index, tweet[i]))
            X.append(x)
            y.append(label[pos])
            
    return np.asarray(X), np.asarray(y)

def word_to_tfidf(tweet_index, word):
    if word in word_to_tfidf_index_dict:
        return X_tfidf[tweet_index, word_to_tfidf_index_dict[word]]
    else:
        return 0.0

X, y = neighbor_features(text_lines_split, loc_lines, 3, word_to_tfidf)

emoji_num = np.count_nonzero(y)
class_freq_ratio = emoji_num / (X.shape[0] * X.shape[1])

print("after feature extraction:")
print(f"X shape {X.shape}")
print(f"y shape {y.shape}")
print(f"some X values\n{X[:5]}")
print(f"some y values\n{y[:5]}")
print(f"non zero elements (1 in label) in y {emoji_num}")
print(f"class frequency ratio {class_freq_ratio}\n")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


X shape (100000, 101169)
Y shape (100000, 31)
non zero elements (1 in label) in y 100893
class frequency ratio 9.972718915873439e-06

some tf-idf values
  (0, 53222)	0.4278228183080142
  (0, 97113)	0.43669316514848056
  (0, 21796)	0.7050588046593335
  (0, 15621)	0.35938669649827265

after feature extraction:
X shape (1187039, 6)
y shape (1187039,)
some X values
[[0.         0.         0.         0.42782282 0.43669317 0.7050588 ]
 [0.         0.         0.42782282 0.43669317 0.7050588  0.3593867 ]
 [0.         0.42782282 0.43669317 0.7050588  0.3593867  0.        ]
 [0.42782282 0.43669317 0.7050588  0.3593867  0.         0.        ]
 [0.43669317 0.7050588  0.3593867  0.         0.         0.        ]]
some y values
[0 1 0 0 0]


### Build data sets
Dataset is randomly split into train and test subsets.

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_train[30:50])
print(y_train[30:50])

(949631, 6)
(949631,)
[[0.         0.21002416 0.46831285 0.         0.         0.35150307]
 [0.31119785 0.         0.32296707 0.23357677 0.27625077 0.3067234 ]
 [0.31842575 0.3052548  0.14581719 0.23862097 0.3573096  0.37329836]
 [0.34882237 0.27353302 0.4324465  0.29457364 0.26902211 0.        ]
 [0.         0.51469825 0.3309578  0.51469825 0.11501215 0.        ]
 [0.         0.         0.         0.21093655 0.         0.        ]
 [0.         0.46152608 0.31650171 0.58426773 0.33550733 0.        ]
 [0.16048372 0.2187436  0.47048244 0.47048244 0.47048244 0.40641934]
 [0.37097258 0.         0.54391678 0.40979628 0.         0.        ]
 [0.         0.         0.         0.48248596 0.         0.        ]
 [0.         0.25451798 0.23668705 0.         0.14669033 0.25408373]
 [0.5359691  0.         0.21991171 0.         0.22324085 0.        ]
 [0.32315036 0.         0.         0.24319833 0.         0.        ]
 [0.17264715 0.31196645 0.20075195 0.3607291  0.         0.        ]
 [0.3417698 

### Baselines

Make sure to check should_train flags when training/testing.

In [36]:
from sklearn.externals import joblib

should_train_global = True
should_train_linear_svm = True
should_train_bagging_svm = False
should_train_random_forest = True
should_train_adaboost = True

linear_svm_model_file_name = "linear_svm.pkl"
bagging_model_file_name = "bagging_svm.pkl"
random_forest_file_name = "random_forest.pkl"
adaboost_file_name = "adaboost.pkl"

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def print_scores(y_pred, y_test):
    accuracy_result = accuracy_score(y_pred, y_test)
    precision_result = precision_score(y_pred, y_test)
    recall_result = recall_score(y_pred, y_test)
    f1_result = f1_score(y_pred, y_test)
    positives_ratio = np.count_nonzero(y_pred) / y_pred.shape[0]

    print(f"Accuracy: {accuracy_result}")
    print(f"Precision: {precision_result}")
    print(f"Recall: {recall_result}")
    print(f"F1: {f1_result}")
    print(f"ratio of positive/negative predictions {positives_ratio}")
    
def train_and_save_model(model, X_train, y_train, model_file_name):
    model.fit(X_train, y_train)
    joblib.dump(model, model_file_name)
    
def load_and_test_model(model_file_name, X_test, y_test):
    clf_loaded = joblib.load(model_file_name)
    y_pred = clf_loaded.predict(X_test)
    print_scores(y_pred, y_test)

### Linear SVM Baseline

Linear SVM is based on the liblinear library and is faster on large datasets. Not using dual optimization problem makes the train extremely fast.

In [31]:
from sklearn.svm import LinearSVC

if should_train_global and should_train_svm:
    svm_clf = LinearSVC(class_weight="balanced", dual=False)
    train_and_save_model(svm_clf, X_train, y_train, linear_svm_model_file_name)
    
load_and_test_model(linear_svm_model_file_name, X_test, y_test)

Accuracy: 0.6857308936514355
Precision: 0.6767808047988003
Recall: 0.16575052336471482
F1: 0.2662851073873024


### Bagging SVM

Warning! SVM is based on the libsvm library and it scales poorly with large datasets. That is why an ensemble (bagging) is used. Each classifier is trained on a portion of the data which greatly reduces training times and gives similar (if not better) results. Using 10 estimators it still takes a few hours to train. Results are just a bit better than a single linear SVM.

In [25]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

n_estimators = 10

if should_train_global and should_train_bagging_svm:
    bagging_svm_clf = BaggingClassifier(SVC(kernel='linear', class_weight='balanced'), max_samples=1.0 / n_estimators,
                                        n_estimators=n_estimators, bootstrap=False)
    train_and_save_model(bagging_svm_clf, X_train, y_train, bagging_model_file_name)
    
load_and_test_model(bagging_model_file_name, X_test, y_test)    

Accuracy: 0.6922597385092331
Precision: 0.6711322169457635
Recall: 0.16802032362621547
F1: 0.268756505725038


### Random Forest Baseline

Random forests are pretty fast, but the results are generally worse (F1). High accuracy and recall with low precision.

In [35]:
from sklearn.ensemble import RandomForestClassifier

if should_train_global and should_train_random_forest:
    random_forest_clf = RandomForestClassifier(min_samples_leaf=2)
    train_and_save_model(random_forest_clf, X_train, y_train, random_forest_file_name)
    
load_and_test_model(random_forest_file_name, X_test, y_test)

Accuracy: 0.9147627712629735
Precision: 0.12786803299175206
Recall: 0.4783991023003553
F1: 0.20179867466077625


### AdaBoost Baseline



In [45]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

estimator_num_list = [1, 2, 4]

if should_train_global and should_train_adaboost:
    for n_estimators in estimator_num_list:
        adaboost_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=n_estimators, class_weight="balanced"), n_estimators=200)
        adaboost_clf.fit(X_train, y_train)
        y_pred = adaboost_clf.predict(X_test)
        print_scores(y_pred, y_test)
        
#         train_and_save_model(adaboost_clf, X_train, y_train, adaboost_file_name)
        
#         load_and_test_model(adaboost_file_name, X_test, y_test)

KeyboardInterrupt: 