In [None]:
!pip install tensorflow_text
!pip install tf-models-official



In [None]:
import tensorflow as tf 
import tensorflow_hub as hub
import tensorflow_text

import pandas as pd
import numpy as np


from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error

from official.nlp import optimization

import matplotlib.pyplot as plt

In [None]:
path =  "/content/"
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [None]:
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [None]:
train_x, test_x, train_y, test_y = train_test_split(train.excerpt.to_numpy(), train.target.to_numpy(), test_size=0.2, shuffle=True, random_state=42)

In [None]:
def create_folds(data, target="target", num_splits = 5): 
    data["kfold"] = -1 
    data = data.sample(frac=1).reset_index(drop=True)
    
    # Applying Sturg's rule to calculate the no. of bins for target
    num_bins = int(1 + np.log2(len(data))) 

    data.loc[:, "bins"] = pd.cut(data[target], bins=num_bins, labels=False) 
    
    kf = StratifiedKFold(n_splits=num_splits)
    
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)): 
        data.loc[v_, 'kfold'] = f
        
    data = data.drop(["bins"], axis = 1)         
    return data 

In [None]:
cv_data_df = create_folds(train, target = 'target', num_splits = 5)
cv_data_df.kfold.value_counts()

3    567
1    567
2    567
0    567
4    566
Name: kfold, dtype: int64

In [None]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "http://tfhub.dev/tensorflow/albert_en_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/albert_en_large/3",
    trainable=False)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 1024].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 1024].

In [None]:
embedding_model = tf.keras.Model(text_input, outputs)
sentences = tf.constant(["(your text here)", "thor is just myth", "how are you?"])
rslt = embedding_model(sentences)
pool = rslt["pooled_output"]
seq = rslt["sequence_output"]
print (type(pool), pool.numpy().shape, seq.shape)
pool = tf.keras.layers.Reshape((-1, 1024))(pool)
cf = tf.keras.layers.Concatenate(axis=1)([pool, seq])
att = tf.keras.layers.Attention()([pool, seq])
cnv = tf.keras.layers.MultiHeadAttention(num_heads=5, key_dim=5)
output_tensor, weights = cnv(pool, seq,
                               return_attention_scores=True)
print (pool.shape)
print (seq.shape)
#print (cf.shape)
print (output_tensor.shape)

<class 'tensorflow.python.framework.ops.EagerTensor'> (3, 1024) (3, 128, 1024)
(3, 1, 1024)
(3, 128, 1024)
(3, 1, 1024)


In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers

In [None]:
def albert_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    embedding_model_result = embedding_model(text_input)
    pool = embedding_model_result["pooled_output"]
    seq = embedding_model_result["sequence_output"]
    pool = tf.keras.layers.Reshape((-1, 1024))(pool)
    #att = tf.keras.layers.Attention()([pool, seq])
    #lstm = tf.keras.layers.LSTM(16, activation='relu')(pool)
    matt = tf.keras.layers.MultiHeadAttention(num_heads=16, key_dim=16,  dropout=0.1)
    net = matt(pool, seq,return_attention_scores=False)
    net = tf.keras.layers.Dense(1, name='regressor')(net)
    model = tf.keras.models.Model(inputs=text_input, outputs=net, name="albert_model")
    return model

In [None]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(
    "http://tfhub.dev/tensorflow/albert_en_preprocess/3")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/albert_en_xxlarge/3",
    trainable=False)

def albert_large_xx():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    encoder_inputs = preprocessor(text_input)
    embedding_model_result = encoder(encoder_inputs)
    pool = embedding_model_result["pooled_output"]
    seq = embedding_model_result["sequence_output"]
    pool = tf.keras.layers.Reshape((-1, 4096))(pool)
    #att = tf.keras.layers.Attention()([pool, seq])
    #lstm = tf.keras.layers.LSTM(16, activation='relu')(pool)
    matt = tf.keras.layers.MultiHeadAttention(num_heads=16, key_dim=16,  dropout=0.1)
    net = matt(pool, seq,return_attention_scores=False)
    net = tf.keras.layers.Flatten()(net)
    net = tf.keras.layers.Dense(2048, activation="relu", input_shape=(None, 4096))(net)
    net = tf.keras.layers.Dense(1028, activation="relu")(net)
    net = tf.keras.layers.Dense(514, activation="relu")(net)
    net = tf.keras.layers.Dense(257, activation="relu")(net)
    net = tf.keras.layers.Dense(1, name='regressor')(net)
    model = tf.keras.models.Model(inputs=text_input, outputs=net, name="albert_model")
    return model

In [None]:
model_base = albert_large_xx()
model_base.summary()

Model: "albert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      {'input_word_ids': ( 0           input_3[0][0]                    
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      {'encoder_outputs':  222595584   keras_layer_2[1][0]              
                                                                 keras_layer_2[1][1]              
                                                                 keras_layer_2[1][2]              
_______________________________________________________________________________________

In [None]:
epochs = 10
batch_size = 8
steps_per_epoch = round(len(train.index)//batch_size)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
callbacks = tf.keras.callbacks.EarlyStopping(
    monitor='val_root_mean_squared_error', min_delta=0.0001, patience=2, verbose=2,
    mode='min')

checkpoint_filepath = '/content/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_root_mean_squared_error',
    mode='min',
    save_best_only=True)

import tensorflow.keras.backend as K
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [None]:
model_base.compile(optimizer=optimizer,
                         loss=root_mean_squared_error,
                         metrics=tf.keras.metrics.RootMeanSquaredError())

In [None]:
history = model_base.fit(x=np.array(train_x),
                y=train_y,
                validation_data=(np.array(test_x), test_y),
                epochs=epochs,
                batch_size= batch_size,
                callbacks = [callbacks, model_checkpoint_callback])

Epoch 1/10




INFO:tensorflow:Assets written to: /content/checkpoint/assets


INFO:tensorflow:Assets written to: /content/checkpoint/assets


Epoch 2/10




INFO:tensorflow:Assets written to: /content/checkpoint/assets


INFO:tensorflow:Assets written to: /content/checkpoint/assets


Epoch 3/10




INFO:tensorflow:Assets written to: /content/checkpoint/assets


INFO:tensorflow:Assets written to: /content/checkpoint/assets


Epoch 4/10
Epoch 5/10




INFO:tensorflow:Assets written to: /content/checkpoint/assets


INFO:tensorflow:Assets written to: /content/checkpoint/assets


Epoch 6/10

In [None]:
!zip -r /content/saved_model.zip /content/checkpoint



INFO:tensorflow:Assets written to: saved_model/base/assets


INFO:tensorflow:Assets written to: saved_model/base/assets


In [None]:
pred = model_base.predict(np.array(train_x))

In [None]:
pred = np.reshape(pred, (pred.shape[0]))
sqrt = abs(abs(pred) - abs(train_y))
df = pd.DataFrame({"true": train_y,
                   "predicted": pred,
                   "loss":sqrt})
df

Unnamed: 0,true,predicted,loss
0,-1.518350,-1.913984,0.395634
1,-0.548807,-0.626918,0.078111
2,-0.193262,-1.096326,0.903064
3,-1.033799,-0.405483,0.628316
4,-1.725606,-1.337326,0.388280
...,...,...,...
2262,-3.309178,-2.146888,1.162290
2263,-0.216738,-0.230652,0.013914
2264,-1.584384,-2.196375,0.611991
2265,-2.034688,-2.149682,0.114994


In [None]:
root_mean_squared_error(df.loc[(df.loss >= 0.001) & (df.loss <= 0.01), "true"].values,df.loc[(df.loss >= 0.001) & (df.loss <= 0.01), "predicted"].values)

<tf.Tensor: shape=(), dtype=float64, numpy=0.09114875463687887>

In [None]:
lose_idx = df.loc[(df.loss >= 0.001) & (df.loss <= 0.01)].index.values
lose_idx

array([  15,   72,  120,  190,  194,  390,  396,  400,  446,  447,  468,
        516,  524,  543,  812,  892,  900,  931,  938,  958,  968, 1007,
       1210, 1303, 1329, 1347, 1375, 1443, 1487, 1532, 1553, 1572, 1626,
       1632, 1808, 1900, 2019, 2024, 2187, 2197])

In [None]:
def detect_best_loss(model, predictor, values, data_past):
  pred = model.predict(predictor)
  pred = np.reshape(pred, (pred.shape[0]))
  sqrt = abs(abs(pred) - abs(values))
  df = pd.DataFrame({"true": values,
                   "predicted": pred,
                   "loss":sqrt})
  lose_idx = df.loc[(df.loss >= 0.001) & (df.loss <= 0.01)].index.values
  return data_past.

In [None]:
trainc = pd.DataFrame({"x":train_x, "y":train_y})
trainc

Unnamed: 0,x,y
0,The building of rotary presses for printing il...,-1.518350
1,The idea of a trip on Bob's yacht suited every...,-0.548807
2,"Seeing the front door wide open, the enchanter...",-0.193262
3,"The widow she cried over me, and called me a p...",-1.033799
4,"Jacobitism was (and, to a much smaller extent,...",-1.725606
...,...,...
2262,The steam is supplied by two circular return t...,-3.309178
2263,Living things are different from things that a...,-0.216738
2264,"I'd always longed for adventures. You see, my ...",-1.584384
2265,In these times one dread lies heavy on heart a...,-2.034688


In [None]:
tr_1 = trainc.drop(lose_idx)
tr_1[10:20]

Unnamed: 0,x,y
10,There are two types of lithosphere:\nOceanic l...,-2.366802
11,In those terms I was informed of what my perso...,-1.585193
12,"The three brothers left the empty house, and w...",-1.556398
13,At the end of the last century Humphry Davy ob...,-2.953939
14,If any of our boys and girls have found their ...,-1.51912
16,Mayday festivals were not confined to the Brit...,-0.561084
17,"They were seated along the edge of the lake, s...",-0.509804
18,"Sometimes, bacteria swim or float freely in li...",-0.573611
19,"To produce the electric current, all that is n...",-2.17734
20,"The priest, smiling at this speech, answered: ...",-1.972671


In [None]:
model_dif_1 = albert_model()
epochs = 10
batch_size = 32
steps_per_epoch = round(len(tr_1.index)//batch_size)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
model_dif_1.compile(optimizer=optimizer,
                         loss=root_mean_squared_error,
                         metrics=tf.keras.metrics.RootMeanSquaredError())

In [None]:
history_diff_1 = model_dif_1.fit(x=tr_1.x.to_numpy(),
                y=tr_1.y.to_numpy(),
                validation_data=(np.array(test_x), test_y),
                epochs=epochs,
                batch_size= batch_size,
                callbacks = [callbacks])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 00006: early stopping


In [None]:
pred_test = model_base.predict(test_x)
pred_test_d1 = model_dif_1.predict(test_x)
root_mean_squared_error((pred_test + pred_test_d1)/2, test_y).numpy()

1.201863799235557

In [None]:
def bag_data(data, n):
  list_idx = []
  for x in range(n):
    data = train.sample(frac=0.8,  replace=True, random_state=1)
    list_idx.append(data.index.values)
  return list_idx

In [None]:
idx = bag_data(train, 5)
epochs = 10
batch_size = 32

In [None]:
for d in range(len(idx)):
  print (d)
  x = train.loc[idx[d], "excerpt"].to_numpy()
  y = train.loc[idx[d], "target"].to_numpy()

  val_x = train.drop(idx[d]).excerpt.to_numpy()
  val_y = train.drop(idx[d]).target.to_numpy()

  model_albert =  albert_model()
  steps_per_epoch = round(len(x)//batch_size)
  num_train_steps = steps_per_epoch * epochs
  num_warmup_steps = int(0.1*num_train_steps)

  init_lr = 3e-5
  optimizer = optimization.create_optimizer(init_lr=init_lr,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            optimizer_type='adamw')
  
  model_albert.compile(optimizer=optimizer,
                         loss=root_mean_squared_error,
                         metrics=tf.keras.metrics.RootMeanSquaredError())
  history = model_albert.fit(x=x,
                y=y,
                validation_data=(val_x, val_y),
                epochs=epochs,
                batch_size= batch_size,
                callbacks = [callbacks])
  name = "saved_model_bag/" + str(d)
  model_albert.save(name)

0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: saved_model_bag/0/assets


INFO:tensorflow:Assets written to: saved_model_bag/0/assets


1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 00009: early stopping




INFO:tensorflow:Assets written to: saved_model_bag/1/assets


INFO:tensorflow:Assets written to: saved_model_bag/1/assets


2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: saved_model_bag/2/assets


INFO:tensorflow:Assets written to: saved_model_bag/2/assets


3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: saved_model_bag/3/assets


INFO:tensorflow:Assets written to: saved_model_bag/3/assets


4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping




INFO:tensorflow:Assets written to: saved_model_bag/4/assets


INFO:tensorflow:Assets written to: saved_model_bag/4/assets


In [None]:
name = "saved_model_bag/" + "1"
name

'saved_model/1'

In [None]:
!zip -r /content/bag5.zip /content/saved_model_bag

  adding: content/saved_model_bag/ (stored 0%)
  adding: content/saved_model_bag/4/ (stored 0%)
  adding: content/saved_model_bag/4/variables/ (stored 0%)
  adding: content/saved_model_bag/4/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: content/saved_model_bag/4/variables/variables.index (deflated 68%)
  adding: content/saved_model_bag/4/assets/ (stored 0%)
  adding: content/saved_model_bag/4/assets/30k-clean.model (deflated 49%)
  adding: content/saved_model_bag/4/keras_metadata.pb (deflated 90%)
  adding: content/saved_model_bag/4/saved_model.pb (deflated 91%)
  adding: content/saved_model_bag/1/ (stored 0%)
  adding: content/saved_model_bag/1/variables/ (stored 0%)
  adding: content/saved_model_bag/1/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: content/saved_model_bag/1/variables/variables.index (deflated 68%)
  adding: content/saved_model_bag/1/assets/ (stored 0%)
  adding: content/saved_model_bag/1/assets/30k-clean.model (deflated 49%)
  adding

In [None]:
def netflix(es, ps, e0, l=.0001):
    """Combine predictions with the optimal weights to minimize RMSE.
    Ref: Töscher, A., Jahrer, M., & Bell, R. M. (2009). The bigchaos solution to the netflix grand prize.
    Args:
        es (list of float): RMSEs of predictions
        ps (list of np.array): predictions
        e0 (float): RMSE of all zero prediction
        l (float): lambda as in the ridge regression
    Returns:
        (tuple):
            - (np.array): ensemble predictions
            - (np.array): weights for input predictions
    """
    m = len(es)
    n = len(ps[0])

    X = np.stack(ps).T
    pTy = .5 * (n * e0**2 + (X**2).sum(axis=0) - n * np.array(es)**2)

    w = np.linalg.pinv(X.T.dot(X) + l * n * np.eye(m)).dot(pTy)

    return X.dot(w), w

In [None]:
model_load_1 = tf.keras.models.load_model("/content/saved_model_bag/0",  compile=False)
model_load_2 = tf.keras.models.load_model("/content/saved_model_bag/1",  compile=False)
model_load_3 = tf.keras.models.load_model("/content/saved_model_bag/2",  compile=False)
model_load_4 = tf.keras.models.load_model("/content/saved_model_bag/3",  compile=False)
model_load_5 = tf.keras.models.load_model("/content/saved_model_bag/4",  compile=False)

In [None]:
pred_1_s = model_load_1.predict(np.array(test_x))
pred_2_s = model_load_2.predict(np.array(test_x))
pred_3_s = model_load_3.predict(np.array(test_x))
pred_4_s = model_load_4.predict(np.array(test_x))
pred_5_s = model_load_5.predict(np.array(test_x))
pred_1_s = np.reshape(pred_1_s, (pred_1_s.shape[0]))
pred_2_s = np.reshape(pred_2_s, (pred_2_s.shape[0]))
pred_3_s = np.reshape(pred_3_s, (pred_3_s.shape[0]))
pred_4_s = np.reshape(pred_4_s, (pred_4_s.shape[0]))
pred_5_s = np.reshape(pred_5_s, (pred_5_s.shape[0]))

In [None]:
a = root_mean_squared_error(pred_1_s, test_y).numpy()
b = root_mean_squared_error(pred_2_s, test_y).numpy()
c = root_mean_squared_error(pred_3_s, test_y).numpy()
d = root_mean_squared_error(pred_4_s, test_y).numpy()
e = root_mean_squared_error(pred_5_s, test_y).numpy()
f = [a, b, c, d, e]
f

[0.5429434463848567,
 0.5446279706250821,
 0.5379367576621794,
 0.5418394598622831,
 0.5425445277894885]

In [None]:
e0 = root_mean_squared_error(test_y, np.zeros_like(test_y)).numpy()

In [None]:
p, w = netflix(f, [pred_1_s, pred_2_s, pred_3_s, pred_4_s, pred_5_s], e0, l=0.0001) 

In [None]:
p[1:10]

array([-1.54742703, -1.75596474, -1.12774306, -0.67281782, -3.03414249,
       -2.04513398, -2.39564863, -0.66814153, -1.23250397])

In [None]:
root_mean_squared_error(test_y, p).numpy()

0.5340304066982201

In [None]:
g = np.dot(kn,[pred_1_s, pred_2_s, pred_3_s, pred_4_s, pred_5_s])

In [None]:
sd = (w[0]*pred_1_s + w[1]*pred_2_s + w[2]*pred_3_s + w[3]*pred_4_s + w[4]*pred_5_s)/5
sd[1:10]

array([-0.3094854 , -0.35119295, -0.22554862, -0.13456357, -0.6068285 ,
       -0.4090268 , -0.47912973, -0.13362831, -0.2465008 ], dtype=float32)

In [None]:
root_mean_squared_error(test_y, g).numpy()

0.5340304066982878

In [None]:
np.zeros_like(test_y).shape

(567,)

In [None]:
w

array([-0.30335415,  0.39361939,  1.26997993, -0.33299488, -0.08171325])

In [None]:
kn = np.array([-0.30335415,  0.39361939,  1.26997993, -0.33299488, -0.08171325])

In [None]:
kn.shape

(5,)