In [14]:
import pandas as pd
from collections import Counter
from rouge import Rouge
import sys
import numpy as np
import torch
from transformers import BertModel, BertTokenizerFast
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler 
import bleurt.score as bleurt_score

# model
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,KFold
from tqdm import tqdm_notebook as tqdm

In [34]:
get_metric_from_groupSize = lambda n: ('rouge-%d' % n)

def calculate_recall(ref, candidate, group_size=1):
    #print(ref)
    #print(candidate)
    #Instantiate Rouge
    rouge_metric = get_metric_from_groupSize(group_size)
    rouge_stat = "r"
    rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])

    outcome = rouge.get_scores(candidate, ref)
    return outcome[0][rouge_metric][rouge_stat]

def calculate_precision(ref, candidate, group_size=1):

    #Instantiate Rouge
    rouge_metric = get_metric_from_groupSize(group_size)
    rouge_stat = "p"
    rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])

    outcome = rouge.get_scores(candidate, ref)
    return outcome[0][rouge_metric][rouge_stat]

def calculate_f1(ref, candidate, group_size=1):

    #Instantiate Rouge
    rouge_metric = get_metric_from_groupSize(group_size)
    rouge_stat = "f"
    rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])

    outcome = rouge.get_scores(candidate, ref)
    return outcome[0][rouge_metric][rouge_stat]

In [5]:
def BLEU(ref, candidate, group_size):
    
    ref = ref.lower().split()
    candidate = candidate.lower().split()

    if(group_size > 1):
        ref = formGroups(ref, group_size)
        candidate = formGroups(candidate, group_size)

    # compute word frequencies for the references and the candidate
    ref_counts = Counter(ref)
    candidate_counts = Counter(candidate)

    covered = 0
    total = 0
    
    # compute the coverage for each word
    for word, count in candidate_counts.items():
        covered += min(count, ref_counts[word])
        total += count
    
    return covered / len(candidate_counts)

def formGroups(words, group_size):

    if(group_size > len(words)):
        return words

    #Form groups
    to_return = []
    i = 0
    while (i + group_size - 1) < len(words):
        to_return.append(' '.join(words[i:i + group_size]))
        i+=1

    return to_return

In [10]:
#BLEU
def calculate_bleu(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [BLEU(row[reference_col], row[translation_col], group_size= word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

#ROUGE
def calculate_rouge_recall(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_recall(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

def calculate_rouge_precision(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_precision(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

def calculate_rouge_f1(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_f1(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

#AUXILIARY
def getCSV(path):
    return pd.read_csv(path)


def initBLEURT(csv):
    checkpoint = "bleurt\\test_checkpoint"
    scorer = bleurt_score.BleurtScorer(checkpoint)
    references = csv.reference
    candidates = csv.translation
    print("BLEURT calculating...")
    scores_bluert = [scorer.score(references=[row["reference"]],candidates= [row["translation"]])[0] for idx, row in csv.iterrows()]
    return scores_bluert

#MAIN
def main(filePath):
    #Get file path from arguments
    #filePath = sys.argv[1]
    
    #Import CSV
    csv = getCSV(filePath)

    #BLEU calculation
    csv['bleu_w1'] = calculate_bleu(csv)
    csv['bleu_w2'] = calculate_bleu(csv, word_group_size=2)
    print("BLEU calculated...")

    #ROUGE calculation
    csv['rouge_recall_w1'] = calculate_rouge_recall(csv)
    csv['rouge_precision_w1'] = calculate_rouge_precision(csv)
    csv['rouge_f1_w1'] = calculate_rouge_f1(csv)
    print("ROUGE 1-gram calculated...")

    csv['rouge_precision_w2'] = calculate_rouge_precision(csv, word_group_size=2)
    csv['rouge_recall_w2'] = calculate_rouge_recall(csv, word_group_size=2)
    csv['rouge_f1_w2'] = calculate_rouge_f1(csv, word_group_size=2)
    print("ROUGE 2-gram calculated...")   
    csv['BLEURT'] = initBLEURT(csv)

    #PRINT
    #print(csv)
    return csv

In [28]:
path = "C:\\Users\\Admin\\Documents\\GitHub\\text-mining-project\\corpus"
corpus = "\\zh-en"
filePath = path + corpus + "\\scores.csv"

In [37]:
pd.read_csv(filePath)

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,他性格活泼，这对英国赛马来说是好事，但是除此之外，他还是一位不可思议的骑师。,"His character is good for the British horse, b...",He's a lively character which is good for Brit...,0.625559,92.75,4
1,近日刚搬至旧金山的一位28岁厨师本周被发现死于当地一家商场的楼梯间。,"A 28 chef, who has just moved to San Francisco...",A 28-year-old chef who had recently moved to S...,0.550952,92.00,4
2,去年，有官员表示，胡克先生的团队所得出的结论是针对伊斯兰国炼油厂的空袭并未大幅削减恐怖组织的...,"Last year, officials said Mr. Hooker's team ha...","Last year, officials said, Mr. Hooker's team c...",0.540814,89.00,5
3,尤其值得玩味的是政府对于饥饿民众们的回应，比如总统市民赫伯特·胡佛“别人的事我可管不了”的态度。,It is particularly interesting to note the gov...,Especially savory are the accounts of the gove...,-0.793944,49.50,4
4,不过，从20世纪90年代至今，人类共进行了18次火星探测，而月球探测只进行了14次。,"However, from the 1990s to the present, human ...","However, ever since the 1990s, a total of 18 h...",0.046532,77.50,4
...,...,...,...,...,...,...
26414,根据一份联合声明，“Wood Group 以及 Unite 和 RMT 工会可以确认我们已同...,"According to a joint statement , "" Wood Group ...","""Wood Group and the Unite and RMT unions can c...",0.563658,81.00,1
26415,2016年8月12日，在里约奥运会女子50米步枪三种姿势比赛中，获得铜牌。,"On August 12 , 2016 , a bronze medal was obtai...","On August 12, 2016, she won the bronze medal i...",-0.358579,64.00,1
26416,这会给我带来太大的压力。,This will give me too big pressure.,That would have meant too much pressure.,0.554093,76.00,1
26417,这名女性当场死亡。,The woman killed the spot.,She died at the scene.,-1.724330,36.00,1


In [35]:
data = main(filePath)

BLEU calculated...


ValueError: Reference is empty.

In [22]:
data.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,bleu_w1,bleu_w2,rouge_recall_w1,rouge_precision_w1,rouge_f1_w1,rouge_precision_w2,rouge_recall_w2,rouge_f1_w2,BLEURT
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1,0.25,0.2,0.266667,0.25,0.258065,0.2,0.214286,0.206897,-0.369962
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2,0.75,0.636364,0.75,0.75,0.75,0.636364,0.636364,0.636364,0.702672
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1,0.65,0.285714,0.545455,0.545455,0.545455,0.285714,0.285714,0.285714,0.447156
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2,0.461538,0.230769,0.6,0.428571,0.5,0.230769,0.333333,0.272727,0.31179
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2,0.722222,0.333333,0.6,0.631579,0.615385,0.277778,0.263158,0.27027,0.504052


In [23]:
def kfold_CV_pipe(x_train, y_train, model, n_splits = 5):
    all_scores = []
    # set up k-fold
    kf = KFold(n_splits=5)
    oof_prediction = np.zeros((len(scores),))
    test_preds = 0
    for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):
        # set up the splitted data
        train       , val        = scores.loc[trn_idx,:], scores.loc[val_idx,:]
        train_target, val_target = data['z-score'][trn_idx], data['z-score'][val_idx]      
        #print(train,val)
        # encode     
        # model fitting
        model.fit(train, train_target)
        # get predicted values for oof data and whole test set
        temp_oof = model.predict(val)
        # get predicted values for whole data set aggregate from each fold iter
        oof_prediction[val_idx] = temp_oof    
        fold_score = pd.Series(val_target).corr(pd.Series(temp_oof,index = val_idx),method='pearson')
        print(fold_score)
        all_scores.append(fold_score)               
    return  oof_prediction, all_scores

In [24]:
params = {'n_estimators': 120,
          'max_depth': 6,
          'min_samples_split': 5,
          'learning_rate': 0.01}

In [25]:
scaler = StandardScaler()
scores = pd.DataFrame(scaler.fit_transform(data.loc[:,'bleu_w1':]),columns=data.loc[:,'bleu_w1':].columns, index = data.loc[:,'bleu_w1':].index)
model = ensemble.GradientBoostingRegressor(**params)
oof_prediction, all_scores = kfold_CV_pipe(scores,data['z-score'],model)

data['z-score'].corr(pd.Series(oof_prediction,index = data.index),method='pearson')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):


0it [00:00, ?it/s]

0.40333461356646244
0.43235953387601506
0.4014679954701772
0.3699752559454467
0.3873631673991355


0.40111958618070875

## Bleurt

In [9]:
checkpoint = "bleurt\\test_checkpoint"
scorer = bleurt_score.BleurtScorer(checkpoint)

INFO:tensorflow:Reading checkpoint bleurt\test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


In [10]:
csv = getCSV(filePath)
references = csv.reference
candidates = csv.translation
#scores_bluert = scorer.score(references=references, candidates=candidates)
scores_bluert = [scorer.score(references=[row["reference"]],candidates= [row["translation"]])[0] for idx, row in csv.iterrows()]

In [11]:
data['BLEURT'] = scores_bluert

In [12]:
scaler = StandardScaler()
scores = pd.DataFrame(scaler.fit_transform(data.loc[:,'bleu_w1':]),columns=data.loc[:,'bleu_w1':].columns, index = data.loc[:,'bleu_w1':].index)

In [13]:
scores

Unnamed: 0,bleu_w1,bleu_w2,rouge_recall_w1,rouge_precision_w1,rouge_f1_w1,rouge_precision_w2,rouge_recall_w2,rouge_f1_w2,BLEURT
0,1.264540,0.137482,0.634414,1.195416,0.948848,0.158922,-0.079202,0.044152,-0.028511
1,0.465324,0.771770,0.064105,0.398461,0.256429,0.802442,0.549969,0.691346,-0.406516
2,1.141584,0.737533,1.018991,0.591662,0.837755,0.767706,1.044318,0.921315,0.821842
3,-0.067486,-0.186870,0.966340,0.162326,0.550251,-0.170150,0.220403,0.009019,-1.462404
4,-1.205764,-0.803139,-0.980406,-1.358927,-1.208427,-0.820398,-0.646875,-0.743953,-0.455236
...,...,...,...,...,...,...,...,...,...
11580,-0.094127,0.458745,0.064105,-0.132842,-0.016103,0.536640,0.691212,0.631039,0.638268
11581,-0.115924,-0.747114,-0.294979,-0.248343,-0.254568,-0.965907,-0.978018,-0.973118,-0.927007
11582,0.604318,1.269765,0.402066,0.444661,0.460827,0.852966,0.819614,0.860205,1.008398
11583,0.508525,0.712549,0.064105,0.566241,0.332870,0.742359,0.412011,0.585206,-0.565352


### Check for normal score

In [14]:
mse = np.array([])
pcorr = np.array([])
kendallcorr = np.array([])
metrics = scores.columns
for col in scores.columns:
    mse = np.append(mse,mean_squared_error(data['z-score'],scores[col]))
    pcorr = np.append(pcorr,data['z-score'].corr(scores[col],method='pearson'))
    kendallcorr = np.append(kendallcorr,data['z-score'].corr(scores[col],method='kendall'))
    
pd.DataFrame({'mse':mse,'pcorr':pcorr,'kendallcorr':kendallcorr},index = metrics)

Unnamed: 0,mse,pcorr,kendallcorr
bleu_w1,1.053754,0.404921,0.271061
bleu_w2,1.079262,0.390275,0.272864
rouge_recall_w1,1.11344,0.37065,0.24905
rouge_precision_w1,1.019766,0.424436,0.2866
rouge_f1_w1,1.035963,0.415136,0.279233
rouge_precision_w2,1.08908,0.384637,0.267882
rouge_recall_w2,1.135497,0.357986,0.247705
rouge_f1_w2,1.100779,0.37792,0.260021
BLEURT,0.940529,0.469932,0.329338


In [103]:
X= scores
y= data['z-score']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13)

params = {'n_estimators': 120,
          'max_depth': 6,
          'min_samples_split': 5,
          'learning_rate': 0.01}

In [104]:
model = ensemble.GradientBoostingRegressor(**params)
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=6, min_samples_split=5,
                          n_estimators=120)

In [105]:
mse = mean_squared_error(y_train, model.predict(X_train))
print("The mean squared error (MSE) on train set: {:.4f}".format(mse))
mse = mean_squared_error(y_test, model.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on train set: 0.5194
The mean squared error (MSE) on test set: 0.5888


In [29]:
y_predict = pd.Series(reg.predict(X_test),index = y_test.index,name = 'Predict_score')

In [31]:
merged_score =pd.merge(data['z-score'],y_predict,left_index= True,right_index= True)

In [32]:
print('mse',mean_squared_error(merged_score['z-score'],merged_score['Predict_score']))
print('pcorr',merged_score['z-score'].corr(merged_score['Predict_score'],method='pearson'))
print('kendall',merged_score['z-score'].corr(merged_score['Predict_score'],method='kendall'))

mse 0.6510223507551757
pcorr 0.5068492287785946
kendall 0.3629815342755539


In [None]:
def final_results(scores,z_score)
    mse = np.array([])
    pcorr = np.array([])
    kendallcorr = np.array([])
    metrics = scores.columns
    for col in scores.columns:
        mse = np.append(mse,mean_squared_error(z_score,scores[col]))
        pcorr = np.append(pcorr,z_score.corr(scores[col],method='pearson'))
        kendallcorr = np.append(kendallcorr,z_score.corr(scores[col],method='kendall'))
    pd.DataFrame({'mse':mse,'pcorr':pcorr,'kendallcorr':kendallcorr},index = metrics)

In [82]:
def kfold_CV_pipe(x_train, y_train, model, n_splits = 5):
    all_scores = []
    # set up k-fold
    kf = KFold(n_splits=5)
    oof_prediction = np.zeros((len(scores),))
    test_preds = 0
    for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):
        # set up the splitted data
        train       , val        = scores.loc[trn_idx,:], scores.loc[val_idx,:]
        train_target, val_target = data['z-score'][trn_idx], data['z-score'][val_idx]      
        #print(train,val)
        # encode     
        # model fitting
        model.fit(train, train_target)
        # get predicted values for oof data and whole test set
        temp_oof = model.predict(val)
        # get predicted values for whole data set aggregate from each fold iter
        oof_prediction[val_idx] = temp_oof    
        fold_score = pd.Series(val_target).corr(pd.Series(temp_oof,index = val_idx),method='pearson')
        print(fold_score)
        all_scores.append(fold_score)               
    return  oof_prediction, all_scores

In [106]:
model = ensemble.GradientBoostingRegressor(**params)
oof_prediction, all_scores = kfold_CV_pipe(scores,data['z-score'],model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):


0it [00:00, ?it/s]

0.5151114418970998
0.45190028203146526
0.5390717229564809
0.5361070573926131
0.5445100717320295


In [108]:
data['z-score'].corr(pd.Series(oof_prediction,index = data.index),method='pearson')

0.5173917437819486