# Use transfer-learning with huggingface-transformers library

## Load data and make it accessible for huggingface environment

In [6]:
%load_ext autoreload
%autoreload 2
from transformers import (AutoTokenizer, 
                          TFAutoModel,
                          TFAutoModelForSequenceClassification
                          )
from datasets import Dataset, DatasetDict, load_from_disk # to use huggingface datasets
from detector.utils import load_data, divide_frame
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit, RandomizedSearchCV

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [8]:
data = load_data()

In [9]:
train, val, test = (data["train"].reset_index(drop=True), 
                    data["valid"].reset_index(drop=True), 
                    data["test"].reset_index(drop=True))

def remove_newline(text: str) -> str:
    return text.replace("\n", " ")


for df in [train, val, test]:
    df["text"] = df["text"].apply(remove_newline)
    df["text_length"] = df['text'].apply(len)

In [10]:
train_small, train_medium, train_long = divide_frame(train)

In [11]:
for df in [train_small, train_medium, train_long]:
    print(df.shape[0])

train_sample = pd.concat([train_small.sample(10_000, random_state=1), 
                          train_medium.sample(20_000, random_state=1),
                          train_long.sample(10_000, random_state=1)]).reset_index(drop=True)

124936
250122
124942


In [12]:
train_sample

Unnamed: 0,text,AI,text_length
0,"""I didn't even know what a baby looked like un...",1,264
1,"In this video, watch as an electric eel can cl...",1,1043
2,"IOTA is divisible, fungible, durable , portabl...",0,529
3,× Report Chanel Preston And Ryan Ryans Just A ...,0,330
4,The best-selling film of 2012 starring Christi...,1,987
...,...,...,...
39995,"BANGUI, Central African Republic (AP) — A day ...",1,4709
39996,So what world view is under assault by populis...,0,5219
39997,This is a list of the most frequently asked qu...,1,5083
39998,Gerardo Mora/GettyImages. Mitt Romney has a p...,0,4951


In [13]:
ds_train = Dataset.from_pandas(train_sample, split="train")
ds_val = Dataset.from_pandas(val, split="valid")
ds_test = Dataset.from_pandas(test, split="test")

In [14]:
# pack datasets into a dictionary to tokenize them in parallel
ds_dict = DatasetDict({"train": ds_train, "valid": ds_val, "test": ds_test})

In [15]:
ds_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'AI', 'text_length'],
        num_rows: 40000
    })
    valid: Dataset({
        features: ['text', 'AI', 'text_length'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'AI', 'text_length'],
        num_rows: 10000
    })
})

## Create Tokenizer suitable for the model

In [31]:
def model_checkpoint(name: str="distilbert", 
                     large: bool=True,
                     uncased: bool=True) -> str:
    model_ckpt = f'{name}-large' if large else f'{name}-base'
    return f'{model_ckpt}-uncased' if uncased else model_ckpt

model_ckpt = model_checkpoint(large=False)
# define the tokenizer the model was trained with
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_ckpt

'distilbert-base-uncased'

In [17]:
tokenizer("this is a test!", return_tensors="tf")

{'input_ids': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[ 101, 2023, 2003, 1037, 3231,  999,  102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 7), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [18]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [19]:
# define a tokenize function that tokenizes the text in batches
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [20]:
ds_encoded = ds_dict.map(tokenize, batched=True, batch_size=10_000)

                                                                                                                              

In [21]:
ds_encoded["train"].column_names

['text', 'AI', 'text_length', 'input_ids', 'attention_mask']

## Option 1: Use pre-trained model as feature extractor

### Extracting last hidden layer of a BERT model

For this approach the model-weights of our RoBERTA model are frozen and provide features for a classifier 

In [22]:
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True) # load the model from the checkpoint

Downloading (…)502fb3e1a6fe1485ea7c: 100%|█████████████████████████████████████████████████| 268M/268M [00:10<00:00, 25.6MB/s]
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, yo

In [23]:
text = "this is a test"
inputs = tokenizer(text, return_tensors="tf")
outputs = model(**inputs)
outputs

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 6, 768), dtype=float32, numpy=
array([[[-0.15651292, -0.18619664,  0.05277662, ..., -0.1188115 ,
          0.06620605,  0.54701567],
        [-0.35751367, -0.64835584, -0.06178998, ..., -0.30401936,
          0.3507685 ,  0.5220684 ],
        [-0.27718452, -0.44594458,  0.1818426 , ..., -0.09477939,
         -0.00757483,  0.9958281 ],
        [-0.28408554, -0.39167705,  0.37525544, ..., -0.21505752,
         -0.11725186,  1.0526482 ],
        [ 0.2660825 , -0.50936383, -0.31801328, ..., -0.42029798,
          0.01444203, -0.21489479],
        [ 0.94406086,  0.01117276, -0.47139454, ...,  0.14394656,
         -0.7287833 , -0.16194996]]], dtype=float32)>, hidden_states=None, attentions=None)

In [24]:
outputs.last_hidden_state.shape #output [batch_size, n_tokens, hidden_dim]

TensorShape([1, 6, 768])

In [25]:
# for classification it is common practice use hidden state associated to start 
# of sequence token
outputs.last_hidden_state[:, 0].shape 

TensorShape([1, 768])

In [26]:
# extract last hidden state for whole dataset
def extract_hidden_states(batch):
    inputs = {k: v for k,v in batch.items() if k in tokenizer.model_input_names}
    last_hidden_state = model(**inputs).last_hidden_state
    return {"hidden_state": last_hidden_state[:, 0].numpy()}

In [27]:
ds_encoded.set_format("tensorflow", columns=["input_ids", "attention_mask", "AI"])

In [28]:
extract_hidden_states(ds_encoded["train"][:2])

{'hidden_state': array([[ 0.17704949,  0.00966937, -0.16979785, ..., -0.05526689,
          0.2692022 ,  0.25584105],
        [-0.20750774, -0.3828966 , -0.23853931, ..., -0.06183687,
          0.5136264 ,  0.5783782 ]], dtype=float32)}

In [29]:
ds_hidden = ds_encoded.map(extract_hidden_states, batched=True, batch_size=50)

                                                                                                                              

KeyboardInterrupt: 

In [25]:
ds_hidden

DatasetDict({
    train: Dataset({
        features: ['text', 'AI', 'text_length', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 40000
    })
    valid: Dataset({
        features: ['text', 'AI', 'text_length', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['text', 'AI', 'text_length', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 10000
    })
})

In [56]:
ds_hidden.save_to_disk("hidden_states")

                                                                                                 

In [58]:
ds_hidden = load_from_disk("hidden_states")

-----

### Preparing data to train model

We are now using these hidden states to train a relatively simple classifier to predict if text is AI written or not.

In [29]:
X_train = np.array(ds_hidden["train"]["hidden_state"])
X_val = np.array(ds_hidden["valid"]["hidden_state"])
X_test = np.array(ds_hidden["test"]["hidden_state"])

y_train = np.array(ds_hidden["train"]["AI"])
y_val = np.array(ds_hidden["valid"]["AI"])
y_test = np.array(ds_hidden["test"]["AI"])

In [31]:
X_train.shape, y_train.shape

((40000, 768), (40000,))

In [37]:
X_search = np.vstack((X_train, X_val))
y_search = np.hstack((y_train, y_val))

In [38]:
X_search.shape, y_search.shape
split = PredefinedSplit([-1]*X_train.shape[0]+[0]*X_val.shape[0])

((50000, 768), (50000,))

### Logistic Regression

In [36]:
lr_clf = LogisticRegression(max_iter=3000)

params = {"C":[2**k for k in range(-2, 5)]}

search = GridSearchCV(lr_clf,
                      param_grid=params,
                      n_jobs=-1,
                      cv = split,
                      scoring="accuracy")

In [39]:
search.fit(X_search, y_search)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [42]:
lr_clf_best = search.best_estimator_

In [44]:
lr_clf_best.fit(X_train, y_train)
lr_clf_best.score(X_test, y_test)

0.871

### SVM Classifier

In [45]:
from sklearn.svm import SVC

In [70]:
svc = SVC(random_state=1)

params_svc = {
    'C': [0.5, 1, 2],
    #'degree': 3,
    'kernel': ['linear']
 }

search_svc = GridSearchCV(
    svc,
    param_grid=params_svc,
    n_jobs=-1,
    cv=split,
    scoring="accuracy"
)

In [71]:
search_svc.fit(X_search, y_search)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [72]:
search_svc.best_score_

0.8785

In [73]:
svc_best = search_svc.best_estimator_

In [74]:
svc_best.fit(X_train, y_train)

In [75]:
svc_best.score(X_test, y_test)

0.8726

### Random Forest Classifier

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
rf_clf = RandomForestClassifier(random_state=1, verbose=1, n_jobs=-1)

In [84]:
params_rf = {
 'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000]
 }

random_search = RandomizedSearchCV(rf_clf,
                                   param_distributions=params_rf,
                                   n_iter=5,
                                   scoring="accuracy",
                                   n_jobs=-1,
                                   cv=split)

In [85]:
random_search.fit(X_search, y_search)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  warn(
  warn(
  warn(
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  2.0min
[Paralle

In [86]:
random_search.best_score_

0.7969

### Gradient Boosting

In [87]:
from xgboost import XGBClassifier

In [99]:
xgb_clf = XGBClassifier(booster = 'gbtree', n_estimators=1000, seed=1, random_state=1)
xgb_clf.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 1000,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': 1,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'seed': 1}

In [101]:
params_xgb = {
    'colsample_bytree': [0.7, 0.8, 0.9, 1],
    'max_depth': [5,6,7],
    'subsample': [0.9, 1],
    'reg_lambda': [0.8, 0.9, 1]
    }

search_xgb = RandomizedSearchCV(
    xgb_clf,
    param_distributions=params_xgb,
    n_jobs=-1,
    cv=split,
    scoring="accuracy",
    n_iter=3
)