This notebook assumes that the project's shared Google Drive folders are mounted (see the first code cell).

# Catch all for repository setup

In [None]:
# Mount Google Drive at /content/drive; the shared-drive paths used by the
# following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone model repository and download the weights for the pretrained DeepMoji model
% cd /
! unzip -n "/content/drive/Shareddrives/EECS 545/dev.zip" -d /content
! unzip -n "/content/drive/Shareddrives/EECS 545/train.zip" -d /content
! unzip -n "/content/drive/Shareddrives/EECS 545/test.zip" -d /content

% cd "/content/drive/Shareddrives/EECS 545/DeepMoji"
! git clone https://github.com/huggingface/torchMoji.git
! pip install emoji
! pip install unidecode

# Uncomment if weights need to be downloaded

In [None]:
# Download weights from DropBox
# Need to say yes here
% cd "/content/drive/Shareddrives/EECS 545/DeepMoji/torchMoji"
# python scripts/download_weights.py

# Imports and Dataset setup

In [None]:
% cd "/content/drive/Shareddrives/EECS 545/DeepMoji/torchMoji"
%load_ext autoreload
%autoreload 2
import numpy as np
import emoji
import pandas as pd
import json
import os
from torchmoji.global_variables import NB_TOKENS, PRETRAINED_PATH, VOCAB_PATH, ROOT_PATH
import re

# These can be replaced by NLTK/glove/BERT for us in the future
from torchmoji.word_generator import TweetWordGenerator, WordGenerator # Takes words and splits them
from torchmoji.sentence_tokenizer import SentenceTokenizer # Tokenizes via vocab
from torchmoji.create_vocab import VocabBuilder # Buils vocab for corpus

from torchmoji.model_def import torchmoji_emojis # Model for pretrained

In [None]:
# The emoji aliases DeepMoji predicts out of the box.
# Order matters: position k in this list is the emoji for model output index k.
EMOJIS = (
    ":joy: :unamused: :weary: :sob: :heart_eyes: "
    ":pensive: :ok_hand: :blush: :heart: :smirk: "
    ":grin: :notes: :flushed: :100: :sleeping: "
    ":relieved: :relaxed: :raised_hands: :two_hearts: :expressionless: "
    ":sweat_smile: :pray: :confused: :kissing_heart: :heartbeat: "
    ":neutral_face: :information_desk_person: :disappointed: :see_no_evil: :tired_face: "
    ":v: :sunglasses: :rage: :thumbsup: :cry: "
    ":sleepy: :yum: :triumph: :hand: :mask: "
    ":clap: :eyes: :gun: :persevere: :smiling_imp: "
    ":sweat: :broken_heart: :yellow_heart: :musical_note: :speak_no_evil: "
    ":wink: :skull: :confounded: :smile: :stuck_out_tongue_winking_eye: "
    ":angry: :no_good: :muscle: :facepunch: :purple_heart: "
    ":sparkling_heart: :blue_heart: :grimacing: :sparkles:"
).split(' ')

# Same aliases with the surrounding colons removed, for fast membership tests.
EMOJIS_set = {alias[1:-1] for alias in EMOJIS}

In [None]:
# CSV shards of each split. os.listdir() returns names in arbitrary order, so
# the lists are sorted: the order in which shards are concatenated decides the
# order in which emojis are first seen, and therefore the integer id each emoji
# gets later on. Sorting keeps those ids reproducible between runs.
train_files = sorted(f for f in os.listdir("/content/train") if os.path.splitext(f)[1] == ".csv")
test_files = sorted(f for f in os.listdir("/content/test") if os.path.splitext(f)[1] == ".csv")
valid_files = sorted(f for f in os.listdir("/content/dev") if os.path.splitext(f)[1] == ".csv")
train_files

In [None]:
from pandas import DataFrame
from unidecode import unidecode


def _load_split(directory, filenames):
  """Read the headerless two-column CSVs `filenames` found in `directory` into one frame.

  The columns are named "Tweet" and "Emoji". Rows with a missing tweet or a
  missing label are dropped *before* any text processing (the cleaning below
  cannot handle NaN), and the index is renumbered 0..n-1 so that positional and
  label-based indexing agree in later cells.
  """
  columns = ["Tweet", "Emoji"]
  frames = [pd.read_csv(os.path.join(directory, name), names=columns) for name in filenames]
  if not frames:
    # No shard found: return an empty frame that still has the expected columns.
    return DataFrame(columns=columns)
  # One concat for all shards instead of growing the frame shard by shard.
  return pd.concat(frames, ignore_index=True).dropna(how='any').reset_index(drop=True)


def _clean_tweet(text):
  """Remove leftover emoji characters, transliterate to ASCII and strip whitespace."""
  return str(unidecode(emoji.replace_emoji(text, replace='')).strip())


# Added files manually from zip for now
train_tweet_df = _load_split("/content/train", train_files)
test_tweet_df = _load_split("/content/test", test_files)
valid_tweet_df = _load_split("/content/dev", valid_files)

# Every distinct emoji label across the three splits, in order of first
# appearance. The position of a label in this array is its integer id.
# Computed after dropping incomplete rows so NaN never becomes a class.
emoji_to_number = pd.unique(pd.concat([train_tweet_df["Emoji"],
                                      test_tweet_df["Emoji"],
                                      valid_tweet_df["Emoji"]],
                                      axis = 0))

# Testing leftover emoji removal on dataset, comment out later?
for split_df in (train_tweet_df, test_tweet_df, valid_tweet_df):
  split_df['Tweet'] = split_df['Tweet'].apply(_clean_tweet)

# Sizes of the cleaned splits (rows, columns)
print(train_tweet_df.shape,test_tweet_df.shape,valid_tweet_df.shape)

# Common functions

In [None]:
# Verbose makes the function print the results for the first `verbose` rows.
# If 0 we only print the metrics at the end.
from sklearn.metrics import f1_score

# Top-1 predictions / ground-truth labels of the most recent evaluate_results()
# call. They are overwritten (not extended) on every call, so the F-1 score of
# one evaluation is never mixed with rows from an earlier evaluation.
y_pred = []
y_true = []

def evaluate_results(X_test, y_test, results,verbose=0):
  """Print top-5 accuracy, top-1 accuracy and weighted F-1 for DeepMoji scores.

  Args:
    X_test: indexable collection of tweet texts; only used for the printout.
    y_test: indexable collection of emoji names (no surrounding colons),
      aligned with X_test. Labels are lower-cased before comparison.
    results: indexable collection of score vectors aligned with X_test;
      position k of a vector is the score of EMOJIS[k].
    verbose: number of leading rows to print with their predictions. 0 prints
      only the metrics; a negative value prints every row.

  Returns:
    None. The metrics are printed, and the module-level y_pred / y_true lists
    are replaced with this call's top-1 predictions and labels.
  """
  n_rows = len(X_test)
  if n_rows == 0:
    # Nothing to score; avoids a division by zero below.
    print("No samples to evaluate")
    return

  top5_hits = 0
  top1_hits = 0
  predicted = []
  actual = []
  for i in range(n_rows):
    scores = np.asarray(results[i])
    # Indices of the five highest scores (in no particular order).
    top5_ids = np.argpartition(scores, -5)[-5:]
    top1_name = EMOJIS[int(np.argmax(scores))][1:-1]
    # Use the same lower-cased label for every metric; previously only the
    # top-5 check lower-cased it.
    label = y_test[i].lower()

    top5_hits += 1 if label in [EMOJIS[k][1:-1] for k in top5_ids] else 0
    # Counted exactly once per row (it used to be added twice).
    top1_hits += 1 if label == top1_name else 0

    predicted.append(top1_name)
    actual.append(label)

    if verbose != 0:
      # Only build the (comparatively expensive) printout for rows that are shown.
      ret_string = "{},\nActual emoji was :{}:\nPrediction(s):".format(X_test[i],
                                                                y_test[i]
                                                                )
      ret_string += "".join(EMOJIS[k] for k in top5_ids)
      ret_string += "\n"
      # `language='alias'` is the replacement for the former `use_aliases=True`.
      print(emoji.emojize(ret_string, language='alias'))
      verbose-=1

  # Publish this call's labels through the module-level lists.
  y_pred[:] = predicted
  y_true[:] = actual

  print("Top 5: ", top5_hits/n_rows*100, "%")
  print("Top 1:", top1_hits/n_rows*100, "%")
  print("F-1: ", f1_score(actual, predicted, average = 'weighted'))
  return

In [None]:
def evaluate_results_custom(X_test, y_test, results, mapping,verbose=0):
  """Print top-3 accuracy, top-1 accuracy and weighted F-1 for a custom label set.

  Args:
    X_test: indexable collection of tweet texts; only used for the printout.
    y_test: indexable collection of emoji names (no surrounding colons),
      aligned with X_test.
    results: indexable collection of score vectors aligned with X_test;
      position k of a vector is the score of class k.
    mapping: indexable class-id -> emoji alias lookup (e.g. a list or array).
      Entries may be written with or without surrounding colons.
    verbose: number of leading rows to print with their predictions. 0 prints
      only the metrics; a negative value prints every row.

  Returns:
    None. The metrics are printed.
  """
  n_rows = len(X_test)
  if n_rows == 0:
    # Nothing to score; avoids a division by zero below.
    print("No samples to evaluate")
    return

  accuracy = 0
  top1_accuracy = 0  # was never initialised before being incremented
  y_pred = []
  y_true = []
  for i in range(n_rows):
    scores = np.asarray(results[i])
    # Indices of the three highest scores (in no particular order).
    emoji_id = np.argpartition(scores, -3)[-3:]
    # `mapping` is indexed, not called; colons are stripped so that both
    # ":joy:" and "joy" style entries compare equal to the plain label.
    top3_names = [str(mapping[k]).strip(':') for k in emoji_id]
    top1_name = str(mapping[int(np.argmax(scores))]).strip(':')

    emoji_label = y_test[i]
    accuracy += 1 if emoji_label in top3_names else 0
    top1_accuracy += 1 if emoji_label == top1_name else 0

    # Predictions are stored as names so they live in the same label space as
    # y_true (storing the integer id made the F-1 comparison meaningless).
    y_pred.append(top1_name)
    y_true.append(emoji_label)

    if verbose !=0:
      ret_string = "{}\nActual emoji was :{}:\nPrediction(s):".format(X_test[i],
                                                                emoji_label
                                                                )
      ret_string += "".join(":{}:".format(name) for name in top3_names)
      # `language='alias'` is the replacement for the former `use_aliases=True`.
      print(emoji.emojize(ret_string, language='alias'))
      verbose-=1

  # The hit rate above is computed over the three best classes.
  print("Top 3: ", accuracy/n_rows*100, "%")
  print("Top 1:", top1_accuracy/n_rows*100, "%")
  # Multiclass labels need an explicit averaging mode; 'weighted' matches
  # evaluate_results.
  print("F-1: ", f1_score(y_true, y_pred, average = 'weighted'))
  return

# Testing new dataset on torchmoji

## Generating Output on Test Data Using Pretrained Weights and Vocabulary

In [None]:
# Evaluate on the whole test split (no subsampling is applied here)
test_tweet_df_temp = test_tweet_df

# Tweet texts and their emoji labels as aligned numpy arrays
X_test = test_tweet_df_temp["Tweet"].to_numpy()
y_test = test_tweet_df_temp["Emoji"].to_numpy()

# Character length of the longest tweet; passed to the tokenizer as its
# maximum sequence length
maxlen_test = max(len(tweet) for tweet in X_test)

In [None]:
# Tokenize the test tweets with the vocabulary shipped with the pretrained model

with open(VOCAB_PATH, 'r') as f:
  vocabulary = json.loads(f.read())

# The local copy of the torchMoji word generator was updated so that it saves
# the list of rows it ignored (Control+f "New addition" in the respective file
# to inspect). The assertion that the number of input sentences equals the
# number of sentences found was removed as well.
tokenizer = SentenceTokenizer(vocabulary, maxlen_test)

# Note, line 119 was changed for below
tokenized_X_test = tokenizer.tokenize_sentences(X_test)[0]
print(tokenizer.ignored_rows)

In [None]:
# Drop the rows the tokenizer skipped from the raw texts and labels
ignored_rows = tokenizer.ignored_rows
X_test = np.delete(X_test, ignored_rows)
y_test = np.delete(y_test, ignored_rows)

# The tokenized matrix is filled in order while skipping bad rows, so the
# unused rows sit at the end; cut them off.
n_tokenized_rows = len(tokenized_X_test) - len(ignored_rows)
tokenized_X_test = tokenized_X_test[:n_tokenized_rows]

In [None]:
# Score the tokenized tweets with the raw pretrained DeepMoji model
m_batches = 5000
model = torchmoji_emojis(PRETRAINED_PATH)
token_X_batches = np.array_split(tokenized_X_test, m_batches)

# Run the model piece by piece to save RAM
# (the single-call alternative would be: model(tokenized_X_test))
results = []
for curr_batch in token_X_batches:
  results.append(model(curr_batch))
len(results)

In [None]:
# Merge the per-batch outputs into one list with one score vector per tweet
results_flat = [row for batch in results for row in batch]
len(results_flat)

In [None]:
# Score the pretrained model on the full test split; the first 10 rows are
# printed together with their predictions.
evaluate_results(X_test, y_test, results_flat,verbose=10)

## Generating Output on Test Data Using Pretrained Weights and Vocabulary while restricting input to originally trained emojis

In [None]:
# Same preparation as in the previous section, but only the samples whose
# (lower-cased) emoji is one of the emojis DeepMoji was trained on are kept
uses_deepmoji_emoji = test_tweet_df["Emoji"].str.lower().isin(EMOJIS_set)
test_tweet_df_temp = test_tweet_df[uses_deepmoji_emoji]

X_test = test_tweet_df_temp["Tweet"].to_numpy()
y_test = test_tweet_df_temp["Emoji"].to_numpy()

# Character length of the longest remaining tweet
maxlen_test = max(len(tweet) for tweet in X_test)
print(len(test_tweet_df_temp), len(test_tweet_df))

In [None]:
# Tokenize the filtered tweets with the pretrained vocabulary
with open(VOCAB_PATH, 'r') as f:
  vocabulary = json.loads(f.read())

tokenizer = SentenceTokenizer(vocabulary, maxlen_test)

# The local copy of the torchMoji word generator was updated so that it saves
# the list of rows it ignored. The assertion that the number of input
# sentences equals the number of sentences found was removed as well.
tokenized_X_test = tokenizer.tokenize_sentences(X_test)[0]
print(tokenizer.ignored_rows)

In [None]:
# Drop the rows the tokenizer skipped from the raw texts and labels
ignored_rows = tokenizer.ignored_rows
X_test = np.delete(X_test, ignored_rows)
y_test = np.delete(y_test, ignored_rows)

# The tokenized matrix is filled in order while skipping bad rows, so the
# unused rows sit at the end; cut them off.
n_tokenized_rows = len(tokenized_X_test) - len(ignored_rows)
tokenized_X_test = tokenized_X_test[:n_tokenized_rows]

In [None]:
# Score the filtered tweets with the raw pretrained DeepMoji model.
# Labels and tokenized rows must have the same length at this point.
print(len(y_test),len(tokenized_X_test))

m_batches = 5000
model = torchmoji_emojis(PRETRAINED_PATH)
token_X_batches = np.array_split(tokenized_X_test, m_batches)

# Run the model piece by piece to save RAM
results = []
for curr_batch in token_X_batches:
  results.append(model(curr_batch))
len(results)

In [None]:
# Flatten all batches of THIS (filtered) run first ...
results_flat = [row for batch in results for row in batch]

# ... and only then persist them. Saving before flattening would write the
# `results_flat` left over from the previous, unfiltered evaluation into the
# "post_remove" file.
np.save("/content/drive/Shareddrives/EECS 545/deepmoji_res_post_remove.npy", results_flat, allow_pickle=True)
len(results_flat)

In [None]:
# Score the pretrained model on the DeepMoji-emoji-only subset; with the
# default verbose=0 only the metrics are printed.
evaluate_results(X_test, y_test, results_flat)

# Generating output by retraining for our data

In [None]:
from torchmoji.create_vocab import extend_vocab
from torchmoji.finetuning import finetune, load_benchmark, finetune
from torchmoji.model_def import torchmoji_transfer

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Emoji name -> integer id; the id is the emoji's position in emoji_to_number.
emoji_ids = {label: idx for idx, label in enumerate(emoji_to_number)}


def _prepare_split(split_df):
  """Return a copy of `split_df` without blank tweets and with integer emoji ids.

  Rows whose tweet is missing or only whitespace are removed as whole rows (so
  texts and labels stay aligned) and the index is renumbered 0..n-1. Emoji
  values that are already ids, e.g. when this cell is re-run, are left as is.
  """
  has_text = split_df["Tweet"].fillna("").astype(str).str.strip() != ""
  kept = split_df[has_text].reset_index(drop=True)
  return kept.assign(Emoji=kept["Emoji"].map(lambda label: emoji_ids.get(label, label)))


# Map Emojis to numbers and drop blank tweets
train_tweet_df = _prepare_split(train_tweet_df)
test_tweet_df = _prepare_split(test_tweet_df)
valid_tweet_df = _prepare_split(valid_tweet_df)

# One binarizer fitted on every class id, shared by all splits: column k of the
# one-hot matrices then means the same emoji in train, validation and test,
# even when a split does not contain every emoji.
label_binarizer = LabelBinarizer().fit(np.arange(len(emoji_to_number)))

# Split into text and emoji.
# Labels are one-hot encoded, which is what the F-1 based training needs.
# For the accuracy metric use the plain ids instead, e.g.
#   y_train = train_tweet_df["Emoji"].to_numpy()
X_train = train_tweet_df["Tweet"]
y_train = label_binarizer.transform(train_tweet_df["Emoji"])

X_test = test_tweet_df["Tweet"]
y_test = label_binarizer.transform(test_tweet_df["Emoji"])

X_valid = valid_tweet_df["Tweet"]
y_valid = label_binarizer.transform(valid_tweet_df["Emoji"])

In [None]:
# Rebuild the vocabulary from the training tweets
ALT_VOCAB_PATH = f'{ROOT_PATH}/model/vocabulary_alt.json'
word_gen = WordGenerator(X_train)
vocab_builder = VocabBuilder(word_gen)

# Notes on the local changes made to the word generator file:
# - Line 95 in word_gen was changed to ignore unicode letters instead of the
#   whole sentence. The function itself seems to be adding unicode, not
#   actually checking for it.
# - Line 107 needs to become an ascii check. Something is strange though:
#   unidecode returns an ascii string, but the ascii check is for the string
#   class? My guess is to remove line 137-138.
vocab_builder.count_all_words()

# Fix below must be applied for collab on line 50, if using save_vocab
# np_dict = np.array([(i[0],i[1]) for i in self.word_counts.items()], dtype=dtype)

In [None]:
# Load pretrained vocabulary to tokenize sentences based on it
with open(VOCAB_PATH, 'r') as f:
  vocabulary = json.load(f)

# Extends vocabulary and stores the number of new tokens in num_added
# Expected to be 0 for partial dataset
num_added = extend_vocab(vocabulary,vocab_builder, max_tokens=10000)
num_emojis = len(emoji_to_number)
# num_added counts vocabulary tokens that were added, not data points removed.
print("Number of tokens added to the vocabulary: ", num_added)


def _longest_text(texts):
  """Character length of the longest string in `texts`; non-string entries (e.g. NaN) are skipped."""
  return max((len(text) for text in texts if isinstance(text, str)), default=0)


maxlen_test = _longest_text(X_test)
maxlen_train = _longest_text(X_train)
maxlen_valid = _longest_text(X_valid)
max_len_overall = max([maxlen_train, maxlen_test, maxlen_valid])

print(max_len_overall)
# One tokenizer per split, because each tokenizer keeps its own ignored_rows.
tokenizer_train = SentenceTokenizer(vocabulary, max_len_overall)
tokenizer_test = SentenceTokenizer(vocabulary,max_len_overall)
tokenizer_valid = SentenceTokenizer(vocabulary,max_len_overall)

# Same error fix in file, the tokenizer here aggressively preprocesses?
# Change line 120 to:
# assert len(sentences)-2 == next_insert or len(sentences)-1 == next_insert or len(sentences) == next_insert

# Tokenize each of the datasets as needed by torchmoji
tokenized_X_train = tokenizer_train.tokenize_sentences(X_train)[0]
tokenized_X_test = tokenizer_test.tokenize_sentences(X_test)[0]
tokenized_X_valid = tokenizer_valid.tokenize_sentences(X_valid)[0]

In [None]:
# Realign all data: remove the labels of the rows each tokenizer skipped and
# cut the unused trailing rows off the tokenized matrices.

# ignored_rows holds row positions, so the Series must be indexed by position
# (.iloc); plain X_train[...] would look the numbers up as index labels.
print(X_train.iloc[tokenizer_train.ignored_rows])
print("Ignored rows for training",tokenizer_train.ignored_rows)
y_train = np.delete(y_train,tokenizer_train.ignored_rows,0)
tokenized_X_train = tokenized_X_train[:len(tokenized_X_train)-len(tokenizer_train.ignored_rows)]

print("Ignored rows for validating",tokenizer_valid.ignored_rows)
y_valid = np.delete(y_valid,tokenizer_valid.ignored_rows,0)
tokenized_X_valid = tokenized_X_valid[:len(tokenized_X_valid)-len(tokenizer_valid.ignored_rows)]

print("Ignored rows for testing",tokenizer_test.ignored_rows)
y_test = np.delete(y_test,tokenizer_test.ignored_rows,0)
tokenized_X_test = tokenized_X_test[:len(tokenized_X_test)-len(tokenizer_test.ignored_rows)]

In [None]:
# Convert into format for finetune function in torchMoji:
# two parallel lists ordered [train, validation, test] - the tokenized tweets
# and the matching label arrays.
input_text =  [tokenized_X_train,tokenized_X_valid,tokenized_X_test]
output_label = [y_train,y_valid,y_test]
# Sanity check: both shapes should have the same number of rows.
print(y_train.shape, tokenized_X_train.shape)

In [None]:
# Generates model for transfer learning provided by authors
from torchmoji.class_avg_finetuning import class_avg_finetune
from torchmoji.finetuning import finetune

# Number of distinct emoji labels in the dataset
num_emojis = len(emoji_to_number)

# NOTE(review): the transfer model is created with 2 output classes while
# class_avg_finetune below receives num_emojis - confirm this is intended.
model = torchmoji_transfer(2, 
                           weight_path=PRETRAINED_PATH,
                           extend_embedding=num_added,
                           )

# finetuning.py:line 526 .numpy()[0] changed
# to numpy() due to version change causing 
# this to directly be a scalar

# tested accuracy variant as well as different configurations.
# No useful results.
# model_def.py:line 249-250 commented out
# Update: Above confirmed https://github.com/huggingface/torchMoji/issues/21
# Changed line 610 and 611 in finetuning.py for mem
# NOTE(review): this prints the complete label arrays of all three splits.
print(output_label)
# Positional arguments after num_emojis: presumably the batch size (32) and the
# finetuning method ('last') - TODO confirm against the class_avg_finetune
# signature of the locally patched torchMoji.
model, score =  class_avg_finetune(model,
                          input_text,
                          output_label, 
                          num_emojis, 
                          32,
                          'last',
                          )

# Score the whole tokenized test split with the finetuned model in one call
# (unlike the batched evaluation used for the pretrained model above).
results = model(tokenized_X_test)

In [None]:
  # Evaluate the finetuned model; the first 100 rows are printed.
  # NOTE(review): at this point y_test is the one-hot matrix produced by
  # LabelBinarizer and X_test is still the complete Series (the realignment
  # cell trims y_test and tokenized_X_test but not X_test), while
  # evaluate_results_custom compares y_test[i] with emoji names - confirm the
  # labels are decoded and X_test is trimmed before relying on these numbers.
  evaluate_results_custom(X_test, y_test, results, emoji_to_number,verbose=100)