# Installing Packages 

In [1]:
#In this step, we install the necessary packages. The SentencePiece trainer can accept any iterable object
#as its source of training sentences. wget is used to retrieve content from web servers. nltk's punkt and other
#tokenization tools are used to divide a string into substrings by splitting on a specified string.

In [2]:
!pip install sentencepiece
!pip install tensorflow-hub
!pip install tensorflow-addons
!pip install wget 

Collecting sentencepiece
  Downloading sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.9 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
Collecting tensorflow-hub
  Downloading tensorflow_hub-0.10.0-py2.py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 6.7 MB/s eta 0:00:01
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.10.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.11.2-cp36-cp36m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 6.7 MB/s eta 0:00:01
[?25hCollect

In [3]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
import wget
from nltk.tokenize import word_tokenize
from sklearn.model_selection import StratifiedKFold

# Tokenizer models required by nltk.tokenize.word_tokenize.
nltk.download('punkt')

# Fetch the tokenization helper only once: wget.download never overwrites, so an
# unconditional call on a re-run saves a stray "tokenization (1).py" next to the
# copy that `import tokenization` actually uses.
if not os.path.exists("tokenization.py"):
    wget.download("https://raw.githubusercontent.com/google-research/ALBERT/master/tokenization.py")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'tokenization (1).py'

# Data Preprocessing 

In [4]:
#In this step, we conduct the data pre-processing.
#We incorporated multiple data-cleaning functions into one general preprocessing function to make
#the entire process more streamlined. In this process, we removed links, non-ASCII characters, URLs, emojis,
#punctuation, etc. We also expanded abbreviations into their full forms.
#Reference: https://www.kaggle.com/rftexas/text-only-kfold-bert

In [5]:
twitter_train = pd.read_json(path_or_buf='./train.jsonl',lines = True )
twitter_test=pd.read_json(path_or_buf='./test.jsonl',lines = True)

In [6]:
from collections import Counter
Counter(twitter_train['label'])

Counter({'SARCASM': 2500, 'NOT_SARCASM': 2500})

In [7]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
train_label=le.fit_transform(twitter_train['label'])

In [8]:
twitter_train.head()

Unnamed: 0,label,response,context
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ’ all gone “ both sides ” the apoca...


In [9]:
twitter_test.head()

Unnamed: 0,id,response,context
0,twitter_1,"@USER @USER @USER My 3 year old , that just fi...","[Well now that ’ s problematic AF <URL>, @USER..."
1,twitter_2,@USER @USER How many verifiable lies has he to...,[Last week the Fake News said that a section o...
2,twitter_3,@USER @USER @USER Maybe Docs just a scrub of a...,[@USER Let ’ s Aplaud Brett When he deserves i...
3,twitter_4,@USER @USER is just a cover up for the real ha...,[Women generally hate this president . What's ...
4,twitter_5,@USER @USER @USER The irony being that he even...,"[Dear media Remoaners , you excitedly sharing ..."


In [10]:
def clean_tweets(tweet):
    """Strip non-printable/non-ASCII characters and ``http...`` links from a tweet.

    Characters are kept only if they occur in ``string.printable``; afterwards
    every run of non-whitespace starting with ``http`` is deleted.
    """
    # Keep only the characters found in string.printable.
    printable_only = ''.join(filter(string.printable.__contains__, tweet))
    # Remove URLs.
    return re.sub(r"http\S+", "", printable_only)

In [11]:
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists explicit symbol ranges only. A single span from
    U+24C2 to U+1F251 would also cover CJK ideographs, kana, Hangul and many
    other scripts, deleting ordinary non-Latin text together with the emoji,
    so the symbol blocks inside that span are enumerated individually instead.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U00002600-\U000026FF"  # miscellaneous symbols
                           u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           u"\U000024C2"             # circled latin capital letter M
                           u"\U0000FE0F"             # variation selector-16 (emoji presentation)
                           u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [12]:
def remove_punctuations(text):
    """Surround punctuation marks with spaces so they become separate tokens.

    Every character in the punctuation set is replaced by itself padded with
    one space on each side. A run of two or more dots is treated as an
    ellipsis and replaced by a single `` ... `` token.

    The ellipsis is matched in the same pass as the single characters: padding
    each ``.`` first would break every ``...`` into `` .  .  . `` and leave any
    later ellipsis replacement with nothing to match.
    """
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    # The dot-run alternative comes first so it wins over the single-character class.
    pattern = r'\.{2,}|[' + re.escape(punctuations) + ']'

    def _pad(match):
        token = match.group(0)
        return ' ... ' if len(token) > 1 else f' {token} '

    return re.sub(pattern, _pad, text)

In [13]:
# Lookup table mapping abbreviations / chat slang to their expansions.
# Keys must be lower-case: convert_abbrev lowercases each token before looking
# it up, so a key containing upper-case letters could never match.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [14]:
def convert_abbrev(word):
    """Return the expansion stored in ``abbreviations`` for ``word``.

    The lookup uses the lower-cased token; when no entry exists the token is
    returned unchanged (original casing preserved).
    """
    return abbreviations.get(word.lower(), word)

In [15]:
def convert_abbrev_in_text(text):
    """Tokenize ``text``, expand each token via ``convert_abbrev``, and re-join with single spaces."""
    return ' '.join(convert_abbrev(token) for token in word_tokenize(text))

In [16]:
def _normalise_text(series):
    """Run the shared cleaning pipeline over one text column and return the cleaned Series."""
    # '@USER' is a literal placeholder; it must go before lower-casing and
    # before punctuation stripping, both of which would stop it matching.
    series = series.str.replace('@USER', '', regex=False)
    # regex=True is explicit: pandas >= 2.0 defaults to literal matching, which
    # would silently turn these two pattern replacements into no-ops.
    series = series.str.replace(r'\d+', '', regex=True)
    series = series.str.lower()
    series = series.str.replace(r'[^\w\s]', '', regex=True)
    for step in (clean_tweets, remove_emoji, remove_punctuations, convert_abbrev_in_text):
        series = series.apply(step)
    return series


def preprocessing(df):
    """Clean the ``response`` and ``context`` columns of ``df``.

    ``context`` is first flattened from a list of turns into one string; both
    columns then go through the same steps: drop ``@USER``, drop digits,
    lower-case, drop non-word/non-space characters, then ``clean_tweets``,
    ``remove_emoji``, ``remove_punctuations`` and ``convert_abbrev_in_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``response`` column and a ``context`` column whose
        cells are lists of turns (or already-flattened strings).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its two columns are overwritten
        in place.
    """
    # Join turns with a space rather than a comma: the comma is removed by the
    # punctuation stripping, which would fuse the last word of one turn with
    # the first word of the next. Strings pass through untouched so a second
    # call on an already-processed frame does not split text into characters.
    df.context = df.context.apply(
        lambda turns: turns if isinstance(turns, str) else ' '.join(map(str, turns)))
    df.response = _normalise_text(df.response)
    df.context = _normalise_text(df.context)
    return df

In [17]:
twitter_train = preprocessing(twitter_train)
twitter_test = preprocessing(twitter_test)

In [18]:
twitter_train.head()

Unnamed: 0,label,response,context
0,SARCASM,i dont get this obviously you do care or you w...,a minor child deserves privacy and should be k...
1,SARCASM,trying to protest about talking about him and ...,why is he a loser hes just a press secretary h...
2,SARCASM,he makes an insane about of money from the mov...,donald j trump is guilty as charged the eviden...
3,SARCASM,meanwhile trump wont even release his sat scor...,jamie raskin tanked doug collins collins looks...
4,SARCASM,pretty sure the antilincoln crowd claimed that...,man y all gone both sides the apocalypse one d...


In [19]:
twitter_test.head()

Unnamed: 0,id,response,context
0,twitter_1,my year old that just finished reading nietzsc...,well now that s problematic af url my year old...
1,twitter_2,how many verifiable lies has he told now docum...,last week the fake news said that a section of...
2,twitter_3,maybe docs just a scrub of a coach i mean to g...,let s aplaud brett when he deserves it he coac...
3,twitter_4,is just a cover up for the real hate inside th...,women generally hate this president whats up w...
4,twitter_5,the irony being that he even has to ask why,dear media remoaners you excitedly sharing cli...


# BERT

In [20]:
#In this step, we perform the BERT encoding, training, and model fitting.
#First, we download bert_layer, which is a pre-trained neural network with the Transformer architecture. We
#chose the variant with L=24 hidden layers. Then, we encode our texts into ids to generate encoded tokens, masks, and segments using
#the pre-trained BERT layer. Note that we encode the response and the context separately and combine them afterwards.
#Finally, we fit the BERT-encoded matrices into the model with epochs = 3 and batch size = 6, using 90% of the data for training and 10%
#for validation. As a result, we achieved an F1 score of 0.74.

In [21]:
# Load the pre-trained uncased English BERT encoder from TF Hub (the L-24 / H-1024 / A-16
# variant named in the module URL). trainable=True is passed so the encoder weights are
# updated together with the classifier head during model.fit.
import tensorflow_hub as hub
bert_module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(bert_module_url, trainable=True)

In [22]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into fixed-length BERT input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, mapped to ids and right-padded with zeros.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
    max_len : int, default 512
        Length of every output row.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, padding_masks, segment_ids)``. The mask is 1 over real
        tokens and 0 over padding; segment ids are all zeros.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:content_budget]
        wrapped = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(wrapped)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(wrapped)
        ids += [0] * padding

        id_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [23]:
# Build the tokenizer from the vocabulary file and lower-casing flag that ship with the
# loaded hub module, so text is split against the same vocabulary the encoder uses.
# `tokenization` is the helper module downloaded as tokenization.py in the imports cell.
import tokenization
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [24]:
max_len=256

In [25]:
# Each bert_encode call returns a (token ids, padding masks, segment ids) tuple of arrays.
train_response = bert_encode(twitter_train.response, tokenizer, max_len=max_len)
train_context = bert_encode(twitter_train.context, tokenizer, max_len=max_len)
# NOTE(review): train_labels holds the raw `label` column values; the model.fit call is
# passed `train_label` (the LabelEncoder output) instead, so this name appears unused.
train_labels=twitter_train.label.values

In [26]:
test_response = bert_encode(twitter_test.response, tokenizer, max_len=max_len)
test_context = bert_encode(twitter_test.context, tokenizer, max_len=max_len)

In [27]:
# Model input order: context (ids, masks, segments) followed by response (ids, masks, segments).
test_generate = [*test_context, *test_response]

In [28]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,concatenate,Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [29]:
def _bert_cls_branch(prefix):
    """Create the three int32 inputs named ``<prefix>_...`` and run them through ``bert_layer``.

    Returns the three Input tensors, the sequence output, and the vector at
    sequence position 0 (the ``[CLS]`` slot).
    """
    word_ids = Input(shape=(max_len,), dtype=tf.int32, name=f"{prefix}_input_word_ids")
    mask = Input(shape=(max_len,), dtype=tf.int32, name=f"{prefix}_input_mask")
    segments = Input(shape=(max_len,), dtype=tf.int32, name=f"{prefix}_segment_ids")
    _, sequence_output = bert_layer([word_ids, mask, segments])
    return word_ids, mask, segments, sequence_output, sequence_output[:, 0, :]


# Context tower.
(contex_input_word_ids, contex_input_mask, contex_segment_ids,
 context_sequence_output, context_clf_output) = _bert_cls_branch("contex")

# Response tower; it calls the same bert_layer object, so the encoder weights are shared.
(reponse_input_word_ids, reponse_input_mask, reponse_segment_ids,
 reponset_sequence_output, reponset_clf_output) = _bert_cls_branch("reponse")

# Concatenate both [CLS] vectors and classify with a single sigmoid unit.
concat = concatenate([context_clf_output, reponset_clf_output])
out = Dense(1, activation='sigmoid')(concat)

model = Model(
    inputs=[
        contex_input_word_ids, contex_input_mask, contex_segment_ids,
        reponse_input_word_ids, reponse_input_mask, reponse_segment_ids,
    ],
    outputs=out,
)
model.compile(Adam(1e-6), loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
# Model input order: context (ids, masks, segments) followed by response (ids, masks, segments).
train_generate = [*train_context, *train_response]

In [31]:
# Fine-tune on the six encoded input arrays against the integer-encoded labels.
# NOTE(review): Keras carves validation_split off the *last* rows of the inputs, before
# any shuffling. If train.jsonl is ordered by label, the 10% validation slice would
# contain a single class — confirm the row order or shuffle beforehand.
train_hist = model.fit(
    train_generate, train_label,
    epochs=3,
    batch_size=6,
    validation_split=0.1)

Train on 4500 samples, validate on 500 samples


In [32]:
test_fit=model.predict(test_generate, batch_size=6, verbose=1).ravel()



In [33]:
# Round the sigmoid outputs to 0/1 and map the codes back to the original label strings.
predicted_codes = test_fit.round().ravel().astype('int16')
predicted_labels = pd.DataFrame(le.inverse_transform(predicted_codes), columns=['label'])

# Pair each test id with its predicted label and write the submission file without a header row.
result = pd.concat([twitter_test['id'], predicted_labels], axis=1)
result.to_csv('./answer.txt', sep=',', index=False, header=None)