<a href="https://colab.research.google.com/github/piotrjaniszewski1/Offensive-Language-Identification-and-Categorization/blob/ernie/Ernie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the third-party packages used by this notebook (run once per fresh runtime).
# Every install line below is active; comment out the ones that are not needed.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed with — consider pinning.

!pip install unidecode
!pip install contractions
!pip install wordsegment
!pip install -U symspellpy
!pip install emoji --upgrade
!pip install -U imbalanced-learn
!pip install bert-for-tf2
!pip install transformers
!pip install paddlepaddle-gpu

Requirement already up-to-date: symspellpy in /usr/local/lib/python3.6/dist-packages (6.5.2)
Requirement already up-to-date: emoji in /usr/local/lib/python3.6/dist-packages (0.5.4)
Requirement already up-to-date: imbalanced-learn in /usr/local/lib/python3.6/dist-packages (0.6.1)


In [0]:
# All imports - DO NOT CHANGE THE ORDER OF INSTRUCTIONS
#!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo

import re
import os
import sys
import json

#if not 'bert_repo' in sys.path:
    #sys.path.insert(0, 'bert_repo')

import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split
import spacy
from bs4 import BeautifulSoup
import unidecode
import contractions
import gensim.downloader as api
import re
import wordsegment
import pkg_resources
from symspellpy.symspellpy import SymSpell, Verbosity
import emoji
from imblearn.over_sampling import SMOTE
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
#from modeling import BertModel, BertConfig
#from tokenization import FullTokenizer, convert_to_unicode
#from extract_features import InputExample, convert_examples_to_features
from tqdm import tqdm
#import tensorflow_addons as tfa
# import nltk
from google.colab import auth, drive
# nltk.download('punkt')

# Load wordsegment's data up front, before any wordsegment.segment() call.
wordsegment.load()

# Load SymSpell -> package for correcting misspellings
# NOTE(review): the two positional arguments are presumably the maximum
# dictionary edit distance (2) and the prefix length (7) — confirm against the
# installed symspellpy version, since they are passed positionally.
sym_spell = SymSpell(2, 7)

# Both frequency dictionaries are resolved as resource files of the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# Unigram file: term in column 0, count in column 1.
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
# Bigram file: the term is read starting at column 0, the count from column 2.
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# get TF logger 
log = logging.getLogger('tensorflow')
# Drop every handler currently attached to the 'tensorflow' logger.
log.handlers = []

In [0]:
# Load the OLID training file (tab-separated) from GitHub and preview the first rows
training_examples_url = 'https://raw.githubusercontent.com/piotrjaniszewski1/SemEval-2020-Task12/master/data2019/olid-training-v1.0.tsv'
training_dataset = pd.read_csv(training_examples_url, sep='\t')
print(training_dataset.head())

      id                                              tweet  ... subtask_b subtask_c
0  86426  @USER She should ask a few native Americans wh...  ...       UNT       NaN
1  90194  @USER @USER Go home you’re drunk!!! @USER #MAG...  ...       TIN       IND
2  16820  Amazon is investigating Chinese employees who ...  ...       NaN       NaN
3  62688  @USER Someone should'veTaken" this piece of sh...  ...       UNT       NaN
4  43605  @USER @USER Obama wanted liberals &amp; illega...  ...       NaN       NaN

[5 rows x 5 columns]


# **Training and validation sets**

In [0]:
# Hold out 10% of the rows for validation.
# The seed is fixed so that re-running the notebook reproduces the same split:
# the evaluation cells further down compare `validation_y` with predictions
# read back from a file, which is only meaningful when both refer to the same
# validation rows in the same order.
SPLIT_SEED = 42
training_examples, validation_examples = train_test_split(
    training_dataset, test_size=0.1, random_state=SPLIT_SEED)

# prepare training examples
# (a row belongs to a subtask only when that subtask's label column is not null)
training_examples_A = training_examples['tweet'][training_examples['subtask_a'].notnull()]
training_examples_B = training_examples['tweet'][training_examples['subtask_b'].notnull()]
training_examples_C = training_examples['tweet'][training_examples['subtask_c'].notnull()]

# prepare validation examples
validation_examples_A = validation_examples['tweet'][validation_examples['subtask_a'].notnull()]
validation_examples_B = validation_examples['tweet'][validation_examples['subtask_b'].notnull()]
validation_examples_C = validation_examples['tweet'][validation_examples['subtask_c'].notnull()]

# prepare training labels
# subtask A: 1 for 'OFF', 0 otherwise; subtask B: 1 for 'TIN', 0 otherwise;
# subtask C: integer class id taken from c_mapping.
training_labels_A = np.array((training_examples['subtask_a'][training_examples['subtask_a'].notnull()] == 'OFF').astype(int))
training_labels_B = np.array((training_examples['subtask_b'][training_examples['subtask_b'].notnull()] == 'TIN').astype(int))
c_mapping = {'IND': 0, 'GRP': 1, 'OTH': 2}
training_labels_C = np.array(training_examples['subtask_c'][training_examples['subtask_c'].notnull()].replace(c_mapping))

# prepare validation labels (kept as pandas Series, unlike the training labels)
validation_labels_A = (validation_examples['subtask_a'][validation_examples['subtask_a'].notnull()] == 'OFF').astype(int)
validation_labels_B = (validation_examples['subtask_b'][validation_examples['subtask_b'].notnull()] == 'TIN').astype(int)
validation_labels_C = (validation_examples['subtask_c'][validation_examples['subtask_c'].notnull()]).replace(c_mapping)


# The rest of the notebook works on subtask A only.
training_x = np.array(training_examples_A)
validation_x = np.array(validation_examples_A)
training_y = np.array(training_labels_A)
validation_y = np.array(validation_labels_A)

# **Preprocessing**

### Common preprocessing functions

In [0]:
# remove html tags if exist
def strip_html_tags(text):
    """Return the visible text of `text`, parsed as HTML, with text nodes joined by a space."""
    return BeautifulSoup(text, 'html.parser').get_text(separator=' ')


# remove unnecessary whitespaces
def remove_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    # str.split() without arguments already discards leading/trailing whitespace.
    words = text.split()
    return ' '.join(words)


# remove accented chars (e.g. caffè -> caffe)
def remove_accented_chars(text):
    """Return `text` transliterated by unidecode (e.g. 'caffè' -> 'caffe')."""
    return unidecode.unidecode(text)


# remove hashes and split words (e.g. '#fortTrump' -> 'fort trump')
def split_hashtags(text):
    """Replace each whitespace-separated '#hashtag' chunk by its segmented words.

    Chunks that do not start with '#' are kept unchanged. The result is the
    chunks/words joined by single spaces.
    """
    words = []
    for piece in text.split():
        if piece.startswith('#'):
            # Drop the leading '#' and let wordsegment split the remainder.
            words += wordsegment.segment(piece[1:])
        else:
            words.append(piece)
    return ' '.join(words)


def substitute_emojis(text):
    """Demojize `text`, then replace every run of '_' and ':' characters by a space.

    The replacement is applied to the whole demojized string, not only to the
    emoji names.
    """
    return re.sub('[_:]+', ' ', emoji.demojize(text))


def preprocess_common(text):
    """Run the common cleaning steps on `text` in a fixed order and lower-case the result."""
    # Order matters: each step receives the output of the previous one.
    steps = (
        strip_html_tags,
        contractions.fix,
        split_hashtags,
        substitute_emojis,
        remove_whitespace,
        remove_accented_chars,
    )
    for step in steps:
        text = step(text)
    return text.lower()

In [0]:
# Remove redundant @user tokens
def remove_redundant_users(example, max_users=3, user_token='@user'):
    """Cap every run of consecutive user-mention tokens at `max_users`.

    Fixes the original loop, whose ``else`` was attached to the ``for`` instead
    of the ``if``: the counter was never reset, so once more than three
    mentions had been seen (scanning from the end) every remaining token to the
    left — ordinary words included — was removed.

    Args:
        example: sequence of string tokens. It is not modified.
        max_users: maximum number of consecutive mention tokens to keep per run.
        user_token: the mention token. Matching is exact and case-sensitive, so
            e.g. '@USER' is not counted unless tokens were lower-cased first.

    Returns:
        A new list with the surplus mention tokens removed. Within an overlong
        run the last `max_users` mentions are the ones kept.
    """
    kept_reversed = []
    run_length = 0
    # Scan right-to-left so that the mentions kept are the ones closest to the
    # text that follows the run.
    for token in reversed(example):
        if token == user_token:
            run_length += 1
            if run_length > max_users:
                continue
        else:
            # Any other token ends the current run of mentions.
            run_length = 0
        kept_reversed.append(token)

    kept_reversed.reverse()
    return kept_reversed

### Spacy preprocessing

In [0]:
# Try leaving '?' and '!' as far as punctuation is concerned

nlp = spacy.load('en_core_web_sm')

# exclude negation words from spacy stopwords list
# ('neither' was previously misspelled as 'neighter', so it was never un-flagged)
deselect_stop_words = ['no', 'not', 'noone', 'none', 'lacks', 'lack', 'nor', 'never', 'neither', 'hardly', 'nobody', 'nothing', 'lacking', 'nowhere']
for w in deselect_stop_words:
    # Clear the stop-word flag on the vocabulary entry for this word.
    nlp.vocab[w].is_stop = False

def preprocess_spacy(text):
    """Tokenise `text` with the module-level spaCy pipeline `nlp` and filter the tokens.

    A token is dropped when its POS tag is PUNCT (unless its text is exactly
    '@user'), SYM or NUM, when its text is numeric, or when it is a stop word
    whose POS tag is not NUM. Every kept token is emitted as its lemma, except
    when the lemma is '-PRON-', in which case the SymSpell suggestion (or the
    original text if there is no suggestion) is emitted.

    Args:
        text: string to process.

    Returns:
        list of str: the kept tokens, in document order.
    """
    doc = nlp(text)

    clean_text = []
    
    for token in doc:
        # `flag` stays True while the token is still a candidate for the output;
        # `edit` is the string that will be emitted for it.
        flag = True
        edit = token.text

        # remove punctuations
        if token.pos_ == 'PUNCT' and flag == True and token.text != '@user': 
            flag = False
       
        # remove special characters
        if token.pos_ == 'SYM' and flag == True: 
            flag = False
        
        # remove numbers
        if (token.pos_ == 'NUM' or token.text.isnumeric()) and flag == True:
            flag = False

        # correct misspellings
        # (third positional argument 2 is presumably the max edit distance — confirm)
        if flag == True:
            suggestions = sym_spell.lookup(edit, Verbosity.TOP, 2)
            if len(suggestions) > 0:
                edit = suggestions[0].term

        # remove stop words
        if token.is_stop and token.pos_ != 'NUM': 
            flag = False

        # convert tokens to base form
        # NOTE(review): for every kept token whose lemma is not '-PRON-' this
        # overwrites the spelling correction made above with spaCy's lemma of
        # the *original* token, so the SymSpell result is discarded — confirm
        # whether that is intended.
        elif token.lemma_ != '-PRON-' and flag == True:
            edit = token.lemma_

        # append tokens edited and not removed to list 
        if edit != '' and flag == True:
            clean_text.append(edit)        
    
    return clean_text

### Preprocessing execution

In [0]:
# TODO: normalization -> look at papers / more elaborate solutions; replace abbreviations with full names (e.g. MAGA)
# TODO: try removing fewer stop words (some of them may carry meaning)

# Preview the spaCy preprocessing on the first 30 training tweets.
# Note: preprocess_spacy is applied to the raw tweets here; preprocess_common
# is not part of this preview.
cleaned_x = list(map(preprocess_spacy, training_x[0:30]))
reduced_users_x = list(map(remove_redundant_users, cleaned_x))
print(reduced_users_x[0:30])
print(training_x[0:30])

[['Verge', 'collapse)the', 'breaking', 'wave', 'not', 'explain', 'sea.', '🤔', 'fast', 'break', 'thing', 'tell', 'break', '…', 'url'], ['@USER', 'Loll'], ['@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', '@USER', 'follow', 'patriot', 'follow', 'thank', 'maga', 'KAG', 'WalkAway', 'Witchhunt', '#', 'redwaverise', 'VoteRed', '#', 'VoteDemsOut', 'voteredtosaveamerica'], ['@USER', 'ANTIFA', 'WANNABE', '☝', '☝', '☝', '☝'], ['@USER', '@USER', 'different', 'opinion', 'well', 'gun', 'control', 'law', 'look', 'like', '😎', ' ', 'case', 'illegal', 'alien', 'right', 'involve', ' ', 'case', 'Americans', '2a', 'right', ' ', 'u', 'tell', 'difference', 'american', 'right', '&', 'amp', 'illegal', 'alien'], ['@USER', 'happy', 'birthday', 'sweetie', 'hope', 'day', 'awesome'], ['@user', 'devil', 'single', 'priest', 'rape', 'child', 'pope', 'refuse', 'condemn', 'remove', 'leaders

# **Save data to file**

In [0]:
# Write both splits as tab-separated files with a `text_a` and a `label` column
# (header row included, no index column).
train = pd.DataFrame({'text_a': training_x, 'label': training_y})
val = pd.DataFrame({'text_a': validation_x, 'label': validation_y})
train.to_csv('train.tsv', sep='\t', index=False)
val.to_csv('val.tsv', sep='\t', index=False)

# Download model

In [0]:
# Clone the PaddlePaddle ERNIE repository into ./ERNIE (default branch, no commit pinned).
!git clone https://github.com/PaddlePaddle/ERNIE.git

Cloning into 'ERNIE'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 1253 (delta 13), reused 20 (delta 6), pack-reused 1218[K
Receiving objects: 100% (1253/1253), 15.70 MiB | 13.71 MiB/s, done.
Resolving deltas: 100% (702/702), done.


In [0]:
# Download the ERNIE 2.0 English base archive, decompress it, then unpack it
# into the current directory (per the recorded output: ernie_config.json, params/, ...).
!wget https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz
!gunzip ERNIE_Base_en_stable-2.0.0.tar.gz
!tar -xvf ERNIE_Base_en_stable-2.0.0.tar

--2019-12-29 21:15:53--  https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz
Resolving ernie.bj.bcebos.com (ernie.bj.bcebos.com)... 103.235.46.61
Connecting to ernie.bj.bcebos.com (ernie.bj.bcebos.com)|103.235.46.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405413980 (387M) [application/x-gzip]
Saving to: ‘ERNIE_Base_en_stable-2.0.0.tar.gz’


2019-12-29 21:18:38 (2.40 MB/s) - ‘ERNIE_Base_en_stable-2.0.0.tar.gz’ saved [405413980/405413980]

ernie_config.json
params/
params/encoder_layer_4_post_att_layer_norm_scale
params/encoder_layer_2_multi_head_att_query_fc.b_0
params/encoder_layer_2_ffn_fc_1.w_0
params/encoder_layer_6_multi_head_att_key_fc.b_0
params/encoder_layer_7_multi_head_att_value_fc.w_0
params/encoder_layer_11_multi_head_att_value_fc.b_0
params/encoder_layer_5_multi_head_att_query_fc.w_0
params/encoder_layer_1_multi_head_att_value_fc.w_0
params/encoder_layer_11_post_ffn_layer_norm_scale
params/encoder_layer_7_multi_head_att_query_fc.w

In [0]:
# Build the directory layout: the TSV splits go to dataset/sem-eval/ and the
# extracted params/ directory goes under parameters/.
# NOTE(review): parameters/params already exists when the last `mv` runs, so
# params/ is moved *into* it (target parameters/params/params — see the path in
# the recorded error message). Confirm this nesting is what the training
# script expects. The recorded "Directory not empty" error also shows this
# cell is not safe to run twice.
!mkdir -p parameters/params
!mkdir -p 'dataset/sem-eval'
!mv train.tsv dataset/sem-eval/
!mv val.tsv dataset/sem-eval/
!mv params/ parameters/params

mv: cannot move 'params/' to 'parameters/params/params': Directory not empty


In [0]:
# Move the prepared dataset/ and parameters/ directories inside the cloned ERNIE/ checkout.
!mv dataset/ ERNIE/
!mv parameters/ ERNIE/

In [0]:
# Switch the working directory to the ERNIE checkout. The path is relative, so
# running this cell a second time in the same runtime looks for ERNIE/ inside ERNIE/.
os.chdir('ERNIE/')
# Paths (relative to the new working directory) exported as environment
# variables; presumably read by the training script — confirm against the script.
os.environ['TASK_DATA_PATH']='dataset'
os.environ['MODEL_PATH']='parameters/params'

# Run classifier

In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the training script is run from there.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Run the fine-tuning script stored on Google Drive.
# NOTE(review): the recorded output shows "[[: not found" at line 8 of the
# script, i.e. the script uses a construct that `sh` on this runtime does not
# support. Running it with bash would presumably fix that, but may also change
# the name of the output file (later cells read a file whose name contains a
# literal "{1..5}") — change both together and verify.
!sh '/content/gdrive/My Drive/Pracownia/Ernie/sem-eval-task.sh'

/content/gdrive/My Drive/Pracownia/Ernie/sem-eval-task.sh: 8: /content/gdrive/My Drive/Pracownia/Ernie/sem-eval-task.sh: [[: not found
2019-12-29 21:26:05,220-INFO: -----------  Configuration Arguments -----------
[INFO] 2019-12-29 21:26:05,220 [     args.py:   68]:	-----------  Configuration Arguments -----------
2019-12-29 21:26:05,220-INFO: batch_size: 32
[INFO] 2019-12-29 21:26:05,220 [     args.py:   70]:	batch_size: 32
2019-12-29 21:26:05,220-INFO: checkpoints: ./checkpoints
[INFO] 2019-12-29 21:26:05,220 [     args.py:   70]:	checkpoints: ./checkpoints
2019-12-29 21:26:05,220-INFO: chunk_scheme: IOB
[INFO] 2019-12-29 21:26:05,220 [     args.py:   70]:	chunk_scheme: IOB
2019-12-29 21:26:05,220-INFO: decr_every_n_nan_or_inf: 2
[INFO] 2019-12-29 21:26:05,220 [     args.py:   70]:	decr_every_n_nan_or_inf: 2
2019-12-29 21:26:05,220-INFO: decr_ratio: 0.8
[INFO] 2019-12-29 21:26:05,220 [     args.py:   70]:	decr_ratio: 0.8
2019-12-29 21:26:05,220-INFO: dev_set: None
[INFO] 2019-12-29 2

In [0]:
from google.colab import files
# Offer the prediction file for download through the browser (Colab only).
# The "{1..5}" in the file name is literal text, not a pattern.
files.download("output/test_out.{1..5}.2e-5.32.4.tsv.3.1493") 

In [0]:
# Read the tab-separated prediction file; it has no header row, so columns are numbered 0, 1, ...
test_out = pd.read_csv("output/test_out.{1..5}.2e-5.32.4.tsv.3.1493", sep='\t', header=None)

In [0]:
import re
# Matches a run of two or more commas (used to collapse them into a single one).
pattern = re.compile(r'(,){2,}')

def parse_element(el):
    """Return the second number of a bracketed list given as text.

    `el` looks like '[0.12891877 0.87108123]': two (or more) numbers between
    square brackets, separated by whitespace and/or commas.

    Compared with the previous implementation this also copes with whitespace
    around the brackets (e.g. a trailing newline) and with whitespace right
    after '[' or before ']'. Previously, leading whitespace inside the brackets
    made the function return the *first* number instead of the second.

    Args:
        el: the textual list.

    Returns:
        float: the element at index 1.

    Raises:
        IndexError: if fewer than two values are present.
        ValueError: if the second value is not a valid float.
    """
    # Trim surrounding whitespace first so that [1:-1] really removes the brackets.
    inner = el.strip()[1:-1]
    # Any whitespace run becomes one comma; leading/trailing whitespace vanishes.
    comma_separated = ','.join(inner.split())
    # Collapse comma runs (e.g. from '0.1, 0.9') and drop commas at the edges.
    one_comma = re.sub(pattern, ',', comma_separated).strip(',')
    return float(one_comma.split(',')[1])

In [0]:
# Parse column 1 of the prediction file into one float per row, then preview the first 20
preds = list(map(parse_element, test_out[1]))
preds[:20]

[0.87108123,
 0.952258,
 0.96701026,
 0.97473025,
 0.95807177,
 0.00673935,
 0.7690937,
 0.01468256,
 0.01104505,
 0.97675025,
 0.02298346,
 0.47771004,
 0.5081026,
 0.01658216,
 0.01078551,
 0.96044254,
 0.80696654,
 0.7182258,
 0.01436398,
 0.08695079]

In [0]:
# Validation labels as plain Python floats, for comparison with `preds`
float_validation_y = list(map(float, validation_y))
float_validation_y[:20]

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0]

In [0]:
from keras.metrics import binary_accuracy
# Build the accuracy tensor, then evaluate it inside a session to get the number
acc = binary_accuracy(float_validation_y, preds)
with tf.Session() as sess:
  print(sess.run(acc))

0.8066465


In [0]:
from keras import backend as K

def f1(y_true, y_pred):
    """F1 score of `y_pred` against `y_true`, built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so the products
    and predictions are treated as 0/1 decisions. Precision and recall are
    computed over everything passed in (a single batch-wise figure), with
    K.epsilon() added to each denominator to avoid division by zero.

    Returns:
        A backend tensor holding 2 * precision * recall / (precision + recall).
    """
    eps = K.epsilon()

    def _count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _count(y_true * y_pred)             # true positives
    precision = hits / (_count(y_pred) + eps)  # hits / predicted positives
    recall = hits / (_count(y_true) + eps)     # hits / actual positives
    return 2*((precision*recall)/(precision+recall+eps))

In [0]:
# Build the F1 tensor for the validation labels, then evaluate it inside a session
f1_val = f1(tf.cast(validation_y, tf.float32), preds)
with tf.Session() as sess:
  print(sess.run(f1_val))

0.7104072
