In [0]:
import pandas as pd
import re
import spacy
import numpy as np

In [0]:
# Load spaCy's English pipeline once; lemmatize_text() uses this module-level object.
# NOTE(review): the 'en' shortcut name presumably requires spaCy 2.x — confirm the installed version.
nlp = spacy.load('en')

In [0]:
def load_train_data(
    data_path="/content/drive/My Drive/Meme Analysis Challenge/train_data.csv",
    label_path="/content/drive/My Drive/Meme Analysis Challenge/train_label.csv",
):
    """Load the training texts and their labels as one DataFrame.

    Args:
        data_path: CSV file with the training texts; must contain an ``id`` column.
        label_path: CSV file with the training labels; must contain an ``id`` column.

    Returns:
        The inner join of both files on ``id``, with every missing value
        (in any column) replaced by the string ``"No text"``.
    """
    raw_train_data = pd.read_csv(data_path)
    train_labels = pd.read_csv(label_path)
    # fillna without inplace returns a new frame, so the function stays a single expression chain.
    return pd.merge(raw_train_data, train_labels, on="id").fillna("No text")

In [0]:
# Training texts merged with their labels; missing values are already replaced by "No text".
full_df = load_train_data()

In [0]:
def clean_text(text):
    """Strip URLs, e-mail-like tokens, stray single letters and punctuation from `text`.

    Steps, in order:
      1. Every URL-like match (``http(s):`` prefixes and ``*.com/.net/.org`` hosts)
         is removed, together with every occurrence of the part of the match
         before its first dot (e.g. the site name of a watermark).
      2. Whitespace-separated tokens containing ``.com`` or ``@`` are removed.
      3. Whitespace runs are collapsed to a single space and a leading ``b ``
         is dropped.
      4. Single ASCII letters surrounded by whitespace are removed.
      5. All remaining punctuation (anything that is not a word character or
         whitespace) is removed.

    Args:
        text: The string to clean. Callers in this notebook pass lowercased text.

    Returns:
        The cleaned string. Leading/trailing spaces may remain.
    """
    url_pattern = r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org)))"
    # Handle every match, not just the first one: a second ".net"/".org" host
    # would otherwise survive and be glued together by the punctuation step.
    for url in re.findall(url_pattern, text):
        text = text.replace(url, "")
        site_name = url.split(".")[0]
        if site_name:
            text = text.replace(site_name, "")

    for w in text.split():
        if ".com" in w or "@" in w:
            text = text.replace(w, "")

    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^b\s+', '', text)
    # Remove all single characters that sit between whitespace. The trailing
    # whitespace is only looked at, not consumed, so it can also serve as the
    # leading whitespace of the next single character ("x y z" is fully removed).
    text = re.sub(r'\s+[a-zA-Z](?=\s)', '', text)
    # For now we remove all punctuation, but maybe for memes stuff like ! and ? important, we can test later.
    text = re.sub(r'[^\w\s]', '', text)

    return text


In [0]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by single spaces.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    whose lemma is the placeholder ``"-PRON-"`` keeps its original text.
    """
    # TODO: batch this through a spaCy pipe later instead of one call per text.
    pieces = []
    for token in nlp(text):
        if token.lemma_ == "-PRON-":
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return " ".join(pieces)

In [0]:
def add_relevant_columns(df, lemmatize=True):
    """Return a copy of `df` with the derived text columns added.

    Args:
        df: DataFrame with a ``Corrected_text`` column of strings.
        lemmatize: When true, also add ``lemmatized_text``.

    Returns:
        A new DataFrame containing all columns of `df` plus
        ``lowercased_text`` (``Corrected_text`` lowercased),
        ``cleaned_text`` (``clean_text`` applied to the lowercased text) and,
        if `lemmatize` is true, ``lemmatized_text`` (``lemmatize_text`` applied
        to the cleaned text).
    """
    # Work on a copy so the caller's frame is not silently modified; re-running
    # a cell that calls this function then always starts from the same input.
    enriched = df.copy()
    enriched["lowercased_text"] = [t.lower() for t in enriched.Corrected_text]
    enriched["cleaned_text"] = enriched.lowercased_text.apply(clean_text)
    if lemmatize:
        enriched["lemmatized_text"] = enriched.cleaned_text.apply(lemmatize_text)
    return enriched
    

In [0]:
# Training frame with the lowercased_text, cleaned_text and lemmatized_text columns added.
prepared_train = add_relevant_columns(full_df, lemmatize=True)

In [0]:
# Eyeball five random rows to check the derived text columns.
prepared_train.sample(5)

Now switch to Flair: first pooled document embeddings for the train and test texts, then a text classifier.

In [0]:
from flair.data import Corpus
from flair.datasets import TREC_6
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentEmbeddings, DocumentPoolEmbeddings, BertEmbeddings, BytePairEmbeddings, StackedEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.models import SequenceTagger

In [0]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
import numpy as np
from flair.data import Sentence

In [0]:
# initialize the word embeddings: GloVe plus the forward and backward Flair news models
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings, take the average of the embeddings
# (presumably the default pooling of DocumentPoolEmbeddings — TODO confirm)
document_embeddings = DocumentPoolEmbeddings([glove_embedding,
                                              flair_embedding_backward,
                                              flair_embedding_forward])

In [0]:
# Wrap the raw Corrected_text strings (not the cleaned/lemmatized columns) as Flair sentences.
sentences = [Sentence(s) for s in full_df.Corrected_text]

In [0]:
# Collect one document-level embedding per training sentence.
train_embeddings = []
for s in sentences:
    # embed the sentence with our document embedding
    document_embeddings.embed(s)

    # read back the document-level embedding that embed() attached to the sentence
    embedding = s.get_embedding()
    train_embeddings.append(embedding)

In [0]:
# Move each embedding to the CPU, detach it, convert it to NumPy and stack everything into one array.
train_embeddings_np = np.array([t.cpu().detach().numpy() for t in train_embeddings])

In [0]:
# np.save("flair_embeddings_train.npy", train_embeddings_np)

In [0]:
# Check the shape of the array built just above. The test array is only created
# further down, so referring to it here raises NameError on a fresh Run All.
train_embeddings_np.shape

In [0]:
# Load the test texts and replace missing values by "No text".
test_data = pd.read_csv("/content/drive/My Drive/Meme Analysis Challenge/test_data.csv")
test_data.fillna("No text", inplace=True)
# Wrap the raw Corrected_text strings as Flair sentences for embedding.
test_sentences = [Sentence(s) for s in test_data.Corrected_text]

In [0]:
# Collect one document-level embedding per test sentence, already converted to NumPy.
test_embeddings = []
for s in test_sentences:
    # embed the sentence with our document embedding
    document_embeddings.embed(s)

    # read back the document-level embedding and store it as a NumPy array on the CPU
    embedding = s.get_embedding()
    test_embeddings.append(embedding.cpu().detach().numpy())

# Stack the per-sentence arrays into a single array.
test_embeddings_np = np.array(test_embeddings)

In [0]:
# np.save("flair_embeddings_test.npy", test_embeddings_np)

In [0]:
# Shuffle the rows, then split them 60% / 20% / 20% into train / validation / test.
# Plain positional slices are used instead of np.split, which on a DataFrame
# depends on the deprecated DataFrame.swapaxes.
# NOTE(review): sample() is called without random_state, so the split differs between runs.
shuffled_train = prepared_train.sample(frac=1)
train_end = int(.6 * len(shuffled_train))
validate_end = int(.8 * len(shuffled_train))
train = shuffled_train.iloc[:train_end]
validate = shuffled_train.iloc[train_end:validate_end]
test = shuffled_train.iloc[validate_end:]

In [0]:
# Write the three splits as CSV files into the relative folder "data_folder".
# The folder is created first so this cell also works on a fresh runtime.
# The DataFrame index is written as the first CSV column (to_csv default), which
# shifts the position of every other column by one.
import os

os.makedirs("data_folder", exist_ok=True)
validate.to_csv("data_folder/dev.csv")
train.to_csv("data_folder/train.csv")
test.to_csv("data_folder/test.csv")

In [0]:
# Folder passed to the corpus loader.
# NOTE(review): the split CSVs are written to the relative path "data_folder/", so this
# absolute path only matches when the working directory is /content — confirm.
data_folder = '/content/data_folder'

In [0]:
# column format indicating which columns hold the text and label(s)
# Keys are presumably zero-based column positions in the CSV files — TODO confirm against
# the header of train.csv (the files are written with the index as an extra leading column).
column_name_map = {6: "text", 3: "label"}

In [0]:
# load corpus containing training, test and dev data
# skip_header=True because the CSV files start with a header row of column names.
# presumably picks up train.csv / dev.csv / test.csv from data_folder — TODO confirm
corpus: Corpus = CSVClassificationCorpus(data_folder,
                                         column_name_map,
                                         skip_header=True
) 

In [0]:
# Ask the corpus to drop empty sentences before the label dictionary is built.
corpus.filter_empty_sentences()

In [0]:
# 2. create the label dictionary from the corpus; it is handed to the TextClassifier later
label_dict = corpus.make_label_dictionary()

In [0]:
# Inspect the item-to-index mapping of the label dictionary.
label_dict.item2idx

In [0]:
# This way we can combine multiple embedding sources for each token in our data.
# Here we take the BytePair embeddings (ex. "cat", "er", "pill", "ar" for the word
# "caterpillar"), which help with embedding unknown tokens, the Flair forward and
# backward contextualized embeddings, and the BERT embeddings.
word_embeddings = [
                   BytePairEmbeddings('en'), 
                    FlairEmbeddings('mix-forward'), 
                    FlairEmbeddings('mix-backward'), 
                    BertEmbeddings('bert-base-uncased')
                   ]

In [0]:
# DocumentRNNEmbeddings is not among the names imported from flair.embeddings
# earlier in this notebook; without this import the cell raises NameError.
from flair.embeddings import DocumentRNNEmbeddings

# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default)
# NOTE: this rebinds `document_embeddings`, which until here held the pooled
# embeddings used for the train/test embedding arrays.
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

In [0]:
# 7. start the training
# The first argument is the output folder on Drive; final-model.pt is loaded
# from this same folder after training.
trainer.train('/content/drive/My Drive/Meme Analysis Challenge/flair',
              learning_rate=0.3,
              mini_batch_size=16,
              anneal_factor=0.7,
              patience=5,
              max_epochs=100)

In [0]:
# Reload the classifier from final-model.pt in the training output folder; this rebinds `classifier`.
classifier = TextClassifier.load('/content/drive/My Drive/Meme Analysis Challenge/flair/final-model.pt')

In [0]:
# NOTE(review): this rebinds `test_sentences` (earlier a list of Flair sentences) to the raw
# test DataFrame, and nothing in the cells visible here reads it afterwards — confirm it is needed.
test_sentences = pd.read_csv("/content/drive/My Drive/Meme Analysis Challenge/test_data.csv")

In [0]:
def load_real_test_set(
    lemmatize=True,
    path="/content/drive/My Drive/Meme Analysis Challenge/test_data.csv",
):
    """Load the test set and add the same derived text columns as for training.

    Args:
        lemmatize: When true, also add the ``lemmatized_text`` column.
        path: CSV file with the test texts; must contain a ``Corrected_text`` column.

    Returns:
        The test DataFrame with missing values replaced by ``"No text"`` and the
        columns ``lowercased_text``, ``cleaned_text`` and (if `lemmatize` is
        true) ``lemmatized_text`` added.
    """
    test_data = pd.read_csv(path).fillna("No text")
    # Reuse the training-time preprocessing so train and test cannot drift apart.
    return add_relevant_columns(test_data, lemmatize=lemmatize)
    

In [0]:
# Test set with lowercased, cleaned and lemmatized text columns (lemmatize defaults to True).
real_test_set = load_real_test_set()

In [0]:
from flair.data import Sentence

In [0]:
# Wrap the lemmatized test texts as Flair sentences.
# This rebinds `sentences`, which earlier held the training sentences.
sentences = [Sentence(s) for s in real_test_set.lemmatized_text]

In [0]:
# Predict one sentence at a time and keep whatever predict() returns for each.
# NOTE(review): the label extraction that follows indexes each result with [0], so this
# assumes predict() returns a list of sentences — confirm for the installed Flair version.
test_predictions = [classifier.predict(sentence) for sentence in sentences]

In [0]:
# From each prediction take the first element, convert it to a dict and read the value of its first label.
labels_test = [tp[0].to_dict()['labels'][0]['value'] for tp in test_predictions]

In [0]:
# Sanity check: number of predicted labels.
len(labels_test)

In [0]:
# Attach the predicted labels as the Humour column (modifies real_test_set in place).
real_test_set["Humour"] = labels_test

In [0]:
def make_predictions_df(test_data_with_pred):
    """Return only the ``id`` and ``Humour`` columns of the given DataFrame.

    Raises KeyError if either column is missing.
    """
    submission_columns = ["id", "Humour"]
    return test_data_with_pred[submission_columns]

In [0]:
# Keep only the id and Humour columns for the submission file.
submission_df = make_predictions_df(real_test_set)

In [0]:
# Write the submission to the working directory without the index column.
submission_df.to_csv("submission_14.csv", index=False)