# BERT implementation for predicting outcome of ECHR cases

In [None]:
#all necessary imports
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import glob,re, os, sys, random
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from nltk.corpus import stopwords
from random import shuffle
import os
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Mount Google Drive at /gdrive so the dataset path used by the data-loading
# cell can be read. This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
from transformers import AutoTokenizer,TFAutoModel, TFAutoModelForSequenceClassification

# Single source of truth for the checkpoint so the tokenizer and the encoder
# can never drift apart.
CHECKPOINT = "nlpaueb/bert-base-uncased-echr"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# Bare encoder (no classification head); the head is added in create_model.
model = TFAutoModel.from_pretrained(CHECKPOINT)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at nlpaueb/bert-base-uncased-echr were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at nlpaueb/bert-base-uncased-echr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Disabled experiment, kept as a string literal so it does not execute:
# loading the 'allenai/longformer-base-4096' checkpoint in place of BERT.
# The author's note inside records that this did not work.
"""
from transformers import TFLongformerModel, LongformerTokenizer
# I wish this would work :(
model = TFLongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
"""

In [None]:
 # Sanity check that the runtime has a GPU: the cell displays the device name
 # TensorFlow reports (the recorded run shows '/device:GPU:0').
 import tensorflow as tf
 tf.test.gpu_device_name() 

'/device:GPU:0'

## Load Data 

Note: the data was preprocessed in a separate notebook.

In [None]:
import pandas as pd
import numpy as np

# Read the preprocessed case table from the mounted drive and discard every
# row that has a missing value in any column.
file = '/gdrive/Shareddrives/Now_Forecasting_Final_Project/data/complete_df.csv'
df = pd.read_csv(file).dropna()

In [None]:
from sklearn.model_selection import train_test_split

target = 'outcome'
# Binary label: 1 when the outcome is 'violation', 0 for every other value.
y = (df[target] == 'violation').astype('int64')
# Everything except the label column is kept as features.
X = df.drop(columns=target)
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Resample the training split only (the test split is left untouched) with
# imblearn's RandomUnderSampler; the fixed seed makes the draw repeatable.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

In [None]:
import tensorflow as tf
def batch_encode(X, tokenizer, max_length=512):
    """Tokenize a list of texts into fixed-length TensorFlow tensors.

    Every sequence is truncated and padded to exactly ``max_length`` tokens,
    so separately encoded sets (e.g. train and test) have the same width and
    match a Keras model built with a fixed input length.

    Args:
        X: List of raw text strings to encode.
        tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        max_length: Token length of every returned sequence. Defaults to 512.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for these settings;
        callers in this notebook read its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        X,
        max_length=max_length,  # fixed width shared by every call
        add_special_tokens=True,  # add [CLS] and [SEP] tokens
        return_attention_mask=True,
        return_token_type_ids=False,  # not needed for this type of ML task
        padding='max_length',  # pad to max_length, not to the longest text in this call
        truncation=True,  # cut texts longer than max_length
        #truncation_side = 'left',
        return_tensors='tf',
    )

In [None]:
# Tokenize the raw 'text' column of each split. Both names are rebound from the
# feature frame to the encoded batch, so this cell cannot be run twice in a row.
X_res = batch_encode(X_res['text'].tolist(), tokenizer)
X_test = batch_encode(X_test['text'].tolist(), tokenizer)

In [None]:
def create_model(max_sequence, num_labels):
    """Wrap the pretrained encoder in a Keras classification model.

    Uses the notebook-level ``model`` (the pretrained encoder) and adds a
    single dense layer on top of the encoder's second output (index 1).

    Args:
        max_sequence: Fixed token length of each input sequence.
        num_labels: Number of units in the output layer. ``1`` gives one
            sigmoid unit (binary classification, the setup used in this
            notebook); a larger value gives a softmax over that many classes.

    Returns:
        A ``tf.keras`` model that takes ``[input_ids, attention_mask]`` and
        returns the dense layer's output.
    """
    input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='attention_mask')
    # Index 1 of the encoder output is fed to the classification head.
    output = model(input_ids, attention_mask = attention_mask)[1]
    #output = model(input_ids)[0]
    # Provide number of classes to the final layer (previously hard-coded to 1,
    # which ignored num_labels).
    activation = 'sigmoid' if num_labels == 1 else 'softmax'
    output = tf.keras.layers.Dense(num_labels, activation=activation)(output)

    # Final model:
    final_model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    return final_model

In [None]:
# Build the classifier for 512-token inputs with a single sigmoid output.
test_model = create_model(512, 1)
# NOTE(review): create_model declares its two Input layers first, so indices 0
# and 1 are presumably those inputs, which hold no weights. If so, this loop
# leaves the encoder trainable -- confirm that is the intent.
for i in range(2):
  test_model.layers[i].trainable = False
# Pass the configured optimizer object to compile(). The string 'adam' builds a
# fresh Adam with the Keras default learning rate and silently ignores `opt`,
# which is far too large a step for fine-tuning a pretrained encoder.
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
test_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Turn the labels into a column vector of shape (n_samples, 1) to line up with
# the model's single output unit.
y_res = np.reshape(np.asarray(y_res), (-1, 1))

In [None]:
# Display the model's symbolic output tensor to check its shape and dtype.
test_model.output

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense_1')>

In [None]:
# Train for two epochs on the undersampled training set in mini-batches of 8,
# holding out 20% of it for validation.
test_model.fit(
    [X_res['input_ids'], X_res['attention_mask']],
    y_res,
    batch_size=8,
    epochs=2,
    validation_split=0.2,
)
# Observation: the model learns slowly and poorly -- accuracy is about 0.6,
# below the 0.7-0.8 reported in the paper.
# Open questions: is the tokenization off? was the wrong layer added?

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f622b7c0910>

In [None]:
# Score the held-out test set. The final layer is a sigmoid, so y_hat holds
# values between 0 and 1 rather than hard class labels.
y_hat = test_model.predict(
    x=[X_test['input_ids'], X_test['attention_mask']],
)

It only predicts zeros??