Install bert-as-service and download Google's pre-trained BERT models

In [0]:
!pip install bert-serving-client
!pip install -U bert-serving-server[http]
!wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
!unzip chinese_L-12_H-768_A-12.zip
!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip
!unzip cased_L-24_H-1024_A-16.zip


Install sentence transformers

In [0]:
pip install -U sentence-transformers

Start the BERT server

In [0]:
!nohup bert-serving-start -model_dir=./chinese_L-12_H-768_A-12 > out.file 2>&1 &

Download and import dataset

In [0]:
from os.path import exists
if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

In [0]:
def read_data(path):
  """Read a text file and return its lines.

  The file is decoded as UTF-8 explicitly so that the Chinese data files are
  read the same way regardless of the platform's default encoding.

  Args:
    path: Path of the text file to read.

  Returns:
    A list with one string per line; trailing newline characters are kept.
  """
  with open(path, encoding="utf-8") as dataset:
    return dataset.readlines()

# Training split: English source sentences, Chinese machine translations, and scores.
raw_english_train, raw_chinese_train, zh_train_scores = map(
    read_data,
    ("./train.enzh.src", "./train.enzh.mt", "./train.enzh.scores"),
)

# Validation (dev) split: the same three files.
raw_english_val, raw_chinese_val, zh_val_scores = map(
    read_data,
    ("./dev.enzh.src", "./dev.enzh.mt", "./dev.enzh.scores"),
)

# Empty placeholder list; nothing is appended to it in this cell.
sentence_embeddings_train = list()

Use **Sentence Transformers** to compute sentence embeddings for the English source text.

In [0]:
from sentence_transformers import SentenceTransformer
import numpy as np

# One pre-trained sentence encoder is shared by both splits (and reused later for the test set).
embedding = SentenceTransformer('bert-large-nli-mean-tokens')

# Encode the English source sentences of the train and validation splits, in that order.
zh_train_src, zh_val_src = (
    np.array(embedding.encode(sentences))
    for sentences in (raw_english_train, raw_english_val)
)

Use **BERT-Chinese** (served by bert-as-service) to compute sentence embeddings for the Chinese translations.

In [0]:
from bert_serving.client import BertClient

# Encode the Chinese machine translations through the running BERT server.
# The client is closed in `finally` so the connection is released even if
# one of the encode calls fails.
bc = BertClient()
try:
    zh_train_mt = bc.encode(raw_chinese_train)
    zh_val_mt = bc.encode(raw_chinese_val)
finally:
    bc.close()

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


In [0]:
# Shapes of the source / MT embedding matrices for the train and validation splits.
print(
    *(matrix.shape for matrix in (zh_train_src, zh_train_mt, zh_val_src, zh_val_mt)),
    sep="\n",
)

(7000, 1024)
(7000, 768)
(1000, 1024)
(1000, 768)


Concatenate the English and Chinese embeddings of each sentence pair to form the model input.

In [0]:
from sklearn import preprocessing

# Features: English source embedding followed by Chinese MT embedding, one row per pair.
X_train_zh = np.concatenate([zh_train_src, zh_train_mt], axis=1)
X_val_zh = np.concatenate([zh_val_src, zh_val_mt], axis=1)

# Targets: the score lines parsed as floats. The scores are used unscaled
# (a MinMaxScaler variant was tried earlier and left disabled).
y_train_zh = train_scores = np.array(zh_train_scores).astype(float)
y_val_zh = val_scores = np.array(zh_val_scores).astype(float)

In [0]:
# Shapes of the assembled feature matrices and target vectors.
print(
    *(array.shape for array in (X_train_zh, X_val_zh, y_train_zh, y_val_zh)),
    sep="\n",
)

(7000, 1792)
(1000, 1792)
(7000,)
(1000,)


SVM model

In [0]:
import numpy as np

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted to float arrays first, so plain Python
    lists or tuples are accepted as well as NumPy arrays.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of reference values, broadcastable against
            ``predictions``.

    Returns:
        The square root of the mean squared difference.
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    return np.sqrt(np.mean((predictions - targets) ** 2))


In [0]:
from sklearn.svm import SVR
# Import from the public namespace; the `scipy.stats.stats` path is deprecated.
from scipy.stats import pearsonr

# Fit one support-vector regressor per kernel and report RMSE and the Pearson
# correlation of its predictions on the validation set.
for k in ['rbf']:
    clf_t = SVR(kernel=k)
    print('Start training:')
    clf_t.fit(X_train_zh, y_train_zh)
    print(k)
    predictions = clf_t.predict(X_val_zh)
    pearson = pearsonr(y_val_zh, predictions)
    print(f'RMSE: {rmse(predictions,y_val_zh)} Pearson {pearson[0]}')
    print()


Start training:
rbf
RMSE: 0.8581345136057762 Pearson 0.4106063417749696



Use **Keras** to build a fully connected neural network.

In [0]:
from keras.models import Sequential
from keras.layers import Dense,Conv2D,Dropout, Activation, Flatten,MaxPooling2D
# Import from the public namespace; the `scipy.stats.stats` path is deprecated.
from scipy.stats import pearsonr
def baseline_model(input_dim=1792):
    """Build and compile the fully connected regression network.

    The network is Dense(128) -> Dropout(0.5) -> Dense(32) -> Dropout(0.5)
    -> Dense(8) -> Dense(1), with ReLU on the hidden layers and a linear
    output, trained with mean squared error and the Adam optimizer.

    Args:
        input_dim: Number of input features. Defaults to 1792, the width of
            the concatenated English + Chinese embeddings used in this notebook.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # `kernel_initializer` is the Keras 2 name of the old `init` argument.
    model.add(Dense(128, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model. Accuracy is not meaningful for a regression target, so
    # mean absolute error is tracked alongside the MSE loss instead.
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    return model


Using TensorFlow backend.


Train the model and evaluate it on the validation set.

In [0]:
# Train the network, then score its validation predictions with Pearson correlation.
model = baseline_model()
# `epochs` replaces the deprecated `nb_epoch` keyword.
model.fit(X_train_zh, y_train_zh, epochs=5, batch_size=64, validation_data=(X_val_zh, y_val_zh))
predictions = model.predict(X_val_zh)
# Flatten the (n, 1) network output to (n,) without hard-coding the sample count.
predictions = predictions.astype(np.float64).reshape(-1)
print(predictions.shape)
pearson = pearsonr(y_val_zh, predictions)
print(f'Pearson {pearson[0]}')

  
  
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  


Train on 7000 samples, validate on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
(1000,)
Pearson 0.4171055394873464


Predict on the test set and download the result

In [0]:
import os

def writeScores(method_name, scores, fn="predictions.txt"):
    """Write one score per line to a text file.

    Args:
        method_name: Label of the method that produced the scores. It is not
            used when writing; kept so existing calls keep working.
        scores: Iterable of values; each one is written with its default
            string formatting followed by a newline.
        fn: Path of the output file. Defaults to "predictions.txt" in the
            current working directory; an existing file is overwritten.
    """
    print("")
    with open(fn, 'w') as output_file:
        output_file.writelines(f"{x}\n" for x in scores)

In [0]:
#EN_ZH
# Load the test sentences and embed them with the same two encoders used for training.
english_test_path='./test.enzh.src'
chinese_test_path='./test.enzh.mt'
raw_english_test=read_data(english_test_path)
raw_chinese_test=read_data(chinese_test_path)
zh_test_src = embedding.encode(raw_english_test)
# Close the client in `finally` so the connection is released even if encoding fails.
bc = BertClient()
try:
    zh_test_mt = bc.encode(raw_chinese_test)
finally:
    bc.close()
X_test_zh= np.concatenate((np.array(zh_test_src),np.array(zh_test_mt)),axis=1)

#Predict
predictions = model.predict(X_test_zh)
# Flatten the (n, 1) network output to (n,) without hard-coding the test-set size.
predictions_zh = predictions.astype(np.float64).reshape(-1)
print(predictions_zh.shape)

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


(1000,)


In [0]:
from google.colab import files
from zipfile import ZipFile

# Dump the test predictions to predictions.txt, one score per line.
writeScores("FCNN", predictions_zh)

# Pack the predictions file into the submission archive.
with ZipFile("en-zh_fcnn.zip", mode="w") as submission_zip:
    submission_zip.write("predictions.txt")

# Hand the archive to the browser for download (Colab only).
files.download('en-zh_fcnn.zip')




Result

The score reported by CodaLab after submitting these predictions is 0.4260.