### 10 Emoji Prediction

The task of this project is to make a system that would automatically fill the text with the appropriate emoticons. This can be done in two steps. First, for each position within the text a prediction is made whether an emoticon should be placed there. Second, an appropriate emoticon is chosen from a list of available emoticons. Both these tasks can be set up as supervised classification problems.

Competition website:
https://competitions.codalab.org/competitions/17344

Dataset:
https://competitions.codalab.org/competitions/17344

Entry point:
https://arxiv.org/pdf/1702.07285.pdf (Barbieri, Francesco, Miguel Ballesteros, and Horacio Saggion. Are Emojis Predictable?)

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Define paths to folders containing data and results.

In [2]:
# Notebook-wide configuration: edit these to point at a different crawl.

# Upper bound on words per tweet. Not referenced by the code visible in this
# notebook -- presumably it mirrors the row width used when the .loclabels
# file was generated (TODO confirm against the crawler).
MAX_WORDS_PER_TWEET = 30
# Folder holding the crawled tweet files; joined with TWEET_FILE_NAME below
# by plain string concatenation, so the trailing slash is required.
DATA_LOCATION = "./train/crawler/data/"
# Intended output folder; no visible cell writes to it yet.
RESULT_LOCATION = "./result/"
# Common file-name stem: the loader appends ".text" / ".loclabels" to it.
TWEET_FILE_NAME = "tweet_by_ID_28_4_2018__03_20_05.txt"

### Load the data
Tweets are loaded in two forms: a list of strings (for the TF-IDF vectorizer) and a list of lists of words (for feature extraction). Labels are read into a numpy array with one row per tweet and one 0/1 entry per candidate emoji position (N * MAX_WORDS_PER_TWEET dimensions).

In [3]:
# Every data file of one crawl shares this stem and differs only by suffix.
base_file_name = DATA_LOCATION + TWEET_FILE_NAME

# text_lines: one raw string per tweet (input for the TF-IDF vectorizer).
# text_lines_split: the same tweets as whitespace-separated word lists.
text_lines = []
text_lines_split = []

with open(base_file_name + ".text", 'r', encoding="utf-8") as text_file:
    for line in text_file:
        # Strip only the line terminator. Slicing with [:-1] would also eat
        # the last real character when the final line has no trailing newline.
        tweet = line.rstrip("\n")
        text_lines.append(tweet)
        text_lines_split.append(tweet.split())

# Each .loclabels line is a run of single-digit labels, one per position.
with open(base_file_name + ".loclabels", 'r') as loc_labels:
    loc_lines = [[int(c) for c in line.rstrip("\n")] for line in loc_labels]

loc_lines = np.asarray(loc_lines)

# full_text = open(base_file_name + ".full", 'r')
# emoji_labels = open(base_file_name + ".emolabels", 'r')
# emoji_ids = open(base_file_name + ".ids", 'r')

print(text_lines[:5])
print(loc_lines[:5])

['lol west covina california', 'things got a little festive at the office christmas2016 redrock', 'step out and explore ellis island cafe', 'rupauls drag race bingo fun drag queens be sexy rupaulsdragrace user abwyman la', 'just light makeup blueeyes lupusgirl photography modelingagency modeling smiling']
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### Feature Extraction
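TF-IDF is computed on the collection of tweets. Then, for every candidate position in a tweet (before the first word, between consecutive words, and after the last word), a new example is generated: an array of length 2 * k containing the TF-IDF values of the k words to the left and the k words to the right of that position, padded with zeros at the tweet boundaries. The label is 1 if an emoji was at that position in the original tweet and 0 otherwise.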

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

# Fit TF-IDF on the raw tweet strings; English stop words are excluded from
# the vocabulary, so they have no column in X_tfidf.
tfidf_model = TfidfVectorizer(input="content", analyzer="word", stop_words="english")
X_tfidf = tfidf_model.fit_transform(text_lines)

# Map each vocabulary term to its column index in X_tfidf. The fitted
# `vocabulary_` attribute already is that mapping, so reading it avoids
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed
# in 1.2. int() normalises any numpy integer indices to plain Python ints.
word_to_tfidf_index_dict = {
    word: int(column) for word, column in tfidf_model.vocabulary_.items()
}

print(X_tfidf.shape)
print(X_tfidf[0])

def neighbor_features(tweets, labels, k, func):
    """Build one fixed-width example per candidate emoji position.

    A tweet of ``n`` words has ``n + 1`` candidate positions: before the first
    word, between each pair of consecutive words, and after the last word.
    The example for position ``pos`` contains ``func(tweet_index, word)`` for
    the ``k`` words left of the position (indices ``pos - k .. pos - 1``)
    followed by the ``k`` words right of it (indices ``pos .. pos + k - 1``).
    Slots that fall outside the tweet are filled with ``0.0``.

    Parameters
    ----------
    tweets : iterable of sequences of str
        Tokenised tweets. Only iterated once; need not support ``len()``.
    labels : iterable of indexable
        ``labels[t][pos]`` is the target for position ``pos`` of tweet ``t``.
        Paired with ``tweets`` via ``zip``, so extra entries are ignored.
    k : int
        Number of neighbouring words taken on each side of a position.
    func : callable
        ``func(tweet_index, word)`` returning the feature value of a word.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has one row of ``2 * k`` values per position
        and ``y`` holds the matching labels.

    Raises
    ------
    IndexError
        If a label row has fewer entries than the tweet has positions.
    """
    X = []
    y = []

    for tweet_index, (tweet, label) in enumerate(zip(tweets, labels)):
        n_words = len(tweet)
        for pos in range(n_words + 1):
            # Window covers k words on each side of the gap at `pos`.
            X.append([
                func(tweet_index, tweet[i]) if 0 <= i < n_words else 0.0
                for i in range(pos - k, pos + k)
            ])
            y.append(label[pos])

    return np.asarray(X), np.asarray(y)

def word_to_tfidf(tweet_index, word):
    """Return the TF-IDF weight of ``word`` in the tweet at ``tweet_index``.

    The word's column is looked up in ``word_to_tfidf_index_dict`` and the
    value is read from ``X_tfidf[tweet_index, column]``. Words that have no
    entry in the mapping yield ``0.0``.
    """
    column = word_to_tfidf_index_dict.get(word)
    if column is None:
        # Word is not part of the fitted vocabulary.
        return 0.0
    return X_tfidf[tweet_index, column]

# One example per candidate position, using 3 words of TF-IDF context on
# each side (6 features per example).
X, y = neighbor_features(
    text_lines_split,
    loc_lines,
    k=3,
    func=word_to_tfidf,
)

# Show the first few examples and their labels as a sanity check.
preview_count = 5
print(X[:preview_count])
print(y[:preview_count])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(100000, 101169)
  (0, 53222)	0.4278228183080142
  (0, 97113)	0.43669316514848056
  (0, 21796)	0.7050588046593335
  (0, 15621)	0.35938669649827265
[[0.         0.         0.         0.42782282 0.43669317 0.7050588 ]
 [0.         0.         0.42782282 0.43669317 0.7050588  0.3593867 ]
 [0.         0.42782282 0.43669317 0.7050588  0.3593867  0.        ]
 [0.42782282 0.43669317 0.7050588  0.3593867  0.         0.        ]
 [0.43669317 0.7050588  0.3593867  0.         0.         0.        ]]
[0 1 0 0 0]


### Build data sets
The dataset is randomly split into train (80%) and test (20%) subsets.

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Inspect an arbitrary window of the shuffled training split.
preview_rows = slice(30, 50)
print(X_train[preview_rows])
print(y_train[preview_rows])

[[0.         0.21002416 0.46831285 0.         0.         0.35150307]
 [0.31119785 0.         0.32296707 0.23357677 0.27625077 0.3067234 ]
 [0.31842575 0.3052548  0.14581719 0.23862097 0.3573096  0.37329836]
 [0.34882237 0.27353302 0.4324465  0.29457364 0.26902211 0.        ]
 [0.         0.51469825 0.3309578  0.51469825 0.11501215 0.        ]
 [0.         0.         0.         0.21093655 0.         0.        ]
 [0.         0.46152608 0.31650171 0.58426773 0.33550733 0.        ]
 [0.16048372 0.2187436  0.47048244 0.47048244 0.47048244 0.40641934]
 [0.37097258 0.         0.54391678 0.40979628 0.         0.        ]
 [0.         0.         0.         0.48248596 0.         0.        ]
 [0.         0.25451798 0.23668705 0.         0.14669033 0.25408373]
 [0.5359691  0.         0.21991171 0.         0.22324085 0.        ]
 [0.32315036 0.         0.         0.24319833 0.         0.        ]
 [0.17264715 0.31196645 0.20075195 0.3607291  0.         0.        ]
 [0.3417698  0.         0.26336237

### SVM Baseline

In [None]:
from sklearn.svm import SVC
from numpy.linalg import norm

# `sklearn.externals.joblib` was removed in scikit-learn 0.23; the standalone
# `joblib` package (a scikit-learn dependency) is the supported replacement.
# The fallback keeps the cell working on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The fitted model is persisted so the evaluation cell can reload it without
# retraining.
model_file_name = "svm.pkl"

# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples, so training on the full feature
# matrix may take very long -- consider subsampling or LinearSVC if this cell
# does not finish.
svm_clf = SVC(kernel="rbf")
svm_clf.fit(X_train, y_train)
joblib.dump(svm_clf, model_file_name)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Reload the persisted classifier and predict labels for the held-out split.
svm_clf = joblib.load(model_file_name)
y_pred = svm_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and F1
# are symmetric, but precision and recall are not: with the arguments swapped
# the value reported as precision is actually recall, and vice versa.
accuracy_result = accuracy_score(y_test, y_pred)
precision_result = precision_score(y_test, y_pred)
recall_result = recall_score(y_test, y_pred)
f1_result = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy_result}")
print(f"Precision: {precision_result}")
print(f"Recall: {recall_result}")
print(f"F1: {f1_result}")

In [None]:
# Number of test positions for which the classifier predicted a non-zero
# label, i.e. an emoji.
predicted_positive_count = np.count_nonzero(y_pred)
print(predicted_positive_count)