In [14]:
import pandas as pd
from collections import Counter
from rouge import Rouge
import sys
import numpy as np
import torch
from transformers import BertModel, BertTokenizerFast
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler 
import bleurt.score as bleurt_score

# model
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,KFold
from tqdm import tqdm_notebook as tqdm

In [47]:
get_metric_from_groupSize = lambda n: ('rouge-%d' % n)

def calculate_recall(ref, candidate, group_size=1):
    #print(ref)
    try: 
        #print(candidate)
        #Instantiate Rouge
        rouge_metric = get_metric_from_groupSize(group_size)
        rouge_stat = "r"
        rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])
        outcome = rouge.get_scores(candidate, ref)
        return outcome[0][rouge_metric][rouge_stat]
    except:
        return None
          
def calculate_precision(ref, candidate, group_size=1):
    try:
        #Instantiate Rouge
        rouge_metric = get_metric_from_groupSize(group_size)
        rouge_stat = "p"
        rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])

        outcome = rouge.get_scores(candidate, ref)
        return outcome[0][rouge_metric][rouge_stat]
    except:
        return None    
def calculate_f1(ref, candidate, group_size=1):
    try:
        #Instantiate Rouge
        rouge_metric = get_metric_from_groupSize(group_size)
        rouge_stat = "f"
        rouge = Rouge(metrics=[rouge_metric], stats=[rouge_stat])

        outcome = rouge.get_scores(candidate, ref)
        return outcome[0][rouge_metric][rouge_stat]
    except:
        return None    

In [48]:
def BLEU(ref, candidate, group_size):
    
    ref = ref.lower().split()
    candidate = candidate.lower().split()

    if(group_size > 1):
        ref = formGroups(ref, group_size)
        candidate = formGroups(candidate, group_size)

    # compute word frequencies for the references and the candidate
    ref_counts = Counter(ref)
    candidate_counts = Counter(candidate)

    covered = 0
    total = 0
    
    # compute the coverage for each word
    for word, count in candidate_counts.items():
        covered += min(count, ref_counts[word])
        total += count
    
    return covered / len(candidate_counts)

def formGroups(words, group_size):

    if(group_size > len(words)):
        return words

    #Form groups
    to_return = []
    i = 0
    while (i + group_size - 1) < len(words):
        to_return.append(' '.join(words[i:i + group_size]))
        i+=1

    return to_return

In [49]:
#BLEU
def calculate_bleu(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [BLEU(row[reference_col], row[translation_col], group_size= word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

#ROUGE
def calculate_rouge_recall(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_recall(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

def calculate_rouge_precision(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_precision(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

def calculate_rouge_f1(dataframe, reference_col='reference', translation_col='translation', word_group_size=1):
    series_data = [calculate_f1(row[reference_col], row[translation_col], word_group_size) for idx, row in dataframe.iterrows()]
    return pd.Series(data=series_data, index=dataframe.index) 

#AUXILIARY
def getCSV(path):
    return pd.read_csv(path)


def initBLEURT(csv):
    checkpoint = "bleurt\\test_checkpoint"
    scorer = bleurt_score.BleurtScorer(checkpoint)
    references = csv.reference
    candidates = csv.translation
    print("BLEURT calculating...")
    scores_bluert = [scorer.score(references=[row["reference"]],candidates= [row["translation"]])[0] for idx, row in csv.iterrows()]
    return scores_bluert

#MAIN
def main(filePath):
    #Get file path from arguments
    #filePath = sys.argv[1]
    
    #Import CSV
    csv = getCSV(filePath)

    #BLEU calculation
    csv['bleu_w1'] = calculate_bleu(csv)
    csv['bleu_w2'] = calculate_bleu(csv, word_group_size=2)
    print("BLEU calculated...")

    #ROUGE calculation
    csv['rouge_recall_w1'] = calculate_rouge_recall(csv)
    csv['rouge_precision_w1'] = calculate_rouge_precision(csv)
    csv['rouge_f1_w1'] = calculate_rouge_f1(csv)
    print("ROUGE 1-gram calculated...")

    csv['rouge_precision_w2'] = calculate_rouge_precision(csv, word_group_size=2)
    csv['rouge_recall_w2'] = calculate_rouge_recall(csv, word_group_size=2)
    csv['rouge_f1_w2'] = calculate_rouge_f1(csv, word_group_size=2)
    print("ROUGE 2-gram calculated...")   
    csv['BLEURT'] = initBLEURT(csv)

    #PRINT
    #print(csv)
    return csv

In [103]:
path = "C:\\Users\\Admin\\Documents\\GitHub\\text-mining-project\\corpus"
corpus = "\\en-zh"
filePath = path + corpus + "\\scores.csv"

In [104]:
data = main(filePath)

BLEU calculated...
ROUGE 1-gram calculated...
ROUGE 2-gram calculated...
INFO:tensorflow:Reading checkpoint bleurt\test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.
BLEURT calculating...


In [105]:
data.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,bleu_w1,bleu_w2,rouge_recall_w1,rouge_precision_w1,rouge_f1_w1,rouge_precision_w2,rouge_recall_w2,rouge_f1_w2,BLEURT
0,"""In the GISS model's simulation, Venus' slow s...",GSIS的科学家AnthonyDelGenio在新闻稿中解释说：“在GISS模型的模拟模型中...,戈达德太空研究所科学家安东尼·德尔·杰尼奥在新闻发布会上解释说：“在戈达德太空研究所的模型模...,-1.171867,50.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193127
1,Ai Yanhan of China in the Women's 4 x 200m Fre...,中国在英国女性4x200mFreestreyWTE中的最后被称为：“中国14岁的孩子从球下降...,参加女子4x200米自由泳接力赛决赛的中国小将艾衍含被这样描述：“那名14岁的中国小姑娘犯了...,-2.255403,26.5,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338784
2,"Then came 2012, when nothing much went right f...",然后来到2012年，当她和她的队友们没有什么好处。,2012年，她和她的队友都不被看好。,-2.508996,21.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108004
3,"Since last year, Guodian Group has exported a ...",自去年以来，GoudianGroup从南非通过南非港口出口了163套风力发电项目。,自去年以来，国电集团共计有163套风电项目陆续从连云港港出口南非。,-2.41678,23.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30325
4,"Some alleged that the Kempinski hotel simply ""...","一些人指称，Kempinski旅馆只是""被捕""，以满足阿拉伯客户的要求。",有人认为凯宾斯基酒店简直是为了满足阿拉伯客户的要求而“卑躬屈膝”。,-1.489676,45.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130959


In [106]:
def kfold_CV_pipe(x_train, y_train, model, n_splits = 5):
    all_scores = []
    # set up k-fold
    kf = KFold(n_splits=5)
    oof_prediction = np.zeros((len(x_train),))
    test_preds = 0
    for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(x_train, y_train))):
        # set up the splitted data
        train       , val        = x_train.loc[trn_idx,:], x_train.loc[val_idx,:]
        train_target, val_target = y_train[trn_idx], y_train[val_idx]      
        #print(train,val)
        # encode     
        # model fitting
        model.fit(train, train_target)
        # get predicted values for oof data and whole test set
        temp_oof = model.predict(val)
        # get predicted values for whole data set aggregate from each fold iter
        oof_prediction[val_idx] = temp_oof    
        fold_score = pd.Series(val_target).corr(pd.Series(temp_oof,index = val_idx),method='pearson')
        print(fold_score)
        all_scores.append(fold_score)               
    return  oof_prediction, all_scores

In [107]:
params = {'n_estimators': 120,
          'max_depth': 6,
          'min_samples_split': 5,
          'learning_rate': 0.01}

In [108]:
scores.dropna(inplace=True)

In [109]:
data.dropna(inplace=True)

In [110]:
scores.reset_index(drop=True,inplace=True)
data.reset_index(drop=True,inplace=True)

In [119]:
scaler = StandardScaler()
scores = pd.DataFrame(scaler.fit_transform(data[['BLEURT']]),columns=['BLEURT'], index = data.index)
model = ensemble.GradientBoostingRegressor(**params)
oof_prediction, all_scores = kfold_CV_pipe(scores,data['z-score'],model)

data['z-score'].corr(pd.Series(oof_prediction,index = data.index),method='pearson')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(x_train, y_train))):


0it [00:00, ?it/s]

0.11056370827333754
0.15099479571549362
0.20296832592487027
0.18770508141172668
0.15157997357922964


0.0977554939587571

In [120]:
data['z-score'].corr(data['BLEURT'],method='pearson')

0.15199563182865442

## Bleurt

In [9]:
checkpoint = "bleurt\\test_checkpoint"
scorer = bleurt_score.BleurtScorer(checkpoint)

INFO:tensorflow:Reading checkpoint bleurt\test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


In [10]:
csv = getCSV(filePath)
references = csv.reference
candidates = csv.translation
#scores_bluert = scorer.score(references=references, candidates=candidates)
scores_bluert = [scorer.score(references=[row["reference"]],candidates= [row["translation"]])[0] for idx, row in csv.iterrows()]

In [11]:
data['BLEURT'] = scores_bluert

In [12]:
scaler = StandardScaler()
scores = pd.DataFrame(scaler.fit_transform(data.loc[:,'bleu_w1':]),columns=data.loc[:,'bleu_w1':].columns, index = data.loc[:,'bleu_w1':].index)

In [13]:
scores

Unnamed: 0,bleu_w1,bleu_w2,rouge_recall_w1,rouge_precision_w1,rouge_f1_w1,rouge_precision_w2,rouge_recall_w2,rouge_f1_w2,BLEURT
0,1.264540,0.137482,0.634414,1.195416,0.948848,0.158922,-0.079202,0.044152,-0.028511
1,0.465324,0.771770,0.064105,0.398461,0.256429,0.802442,0.549969,0.691346,-0.406516
2,1.141584,0.737533,1.018991,0.591662,0.837755,0.767706,1.044318,0.921315,0.821842
3,-0.067486,-0.186870,0.966340,0.162326,0.550251,-0.170150,0.220403,0.009019,-1.462404
4,-1.205764,-0.803139,-0.980406,-1.358927,-1.208427,-0.820398,-0.646875,-0.743953,-0.455236
...,...,...,...,...,...,...,...,...,...
11580,-0.094127,0.458745,0.064105,-0.132842,-0.016103,0.536640,0.691212,0.631039,0.638268
11581,-0.115924,-0.747114,-0.294979,-0.248343,-0.254568,-0.965907,-0.978018,-0.973118,-0.927007
11582,0.604318,1.269765,0.402066,0.444661,0.460827,0.852966,0.819614,0.860205,1.008398
11583,0.508525,0.712549,0.064105,0.566241,0.332870,0.742359,0.412011,0.585206,-0.565352


### Check for normal score

In [14]:
mse = np.array([])
pcorr = np.array([])
kendallcorr = np.array([])
metrics = scores.columns
for col in scores.columns:
    mse = np.append(mse,mean_squared_error(data['z-score'],scores[col]))
    pcorr = np.append(pcorr,data['z-score'].corr(scores[col],method='pearson'))
    kendallcorr = np.append(kendallcorr,data['z-score'].corr(scores[col],method='kendall'))
    
pd.DataFrame({'mse':mse,'pcorr':pcorr,'kendallcorr':kendallcorr},index = metrics)

Unnamed: 0,mse,pcorr,kendallcorr
bleu_w1,1.053754,0.404921,0.271061
bleu_w2,1.079262,0.390275,0.272864
rouge_recall_w1,1.11344,0.37065,0.24905
rouge_precision_w1,1.019766,0.424436,0.2866
rouge_f1_w1,1.035963,0.415136,0.279233
rouge_precision_w2,1.08908,0.384637,0.267882
rouge_recall_w2,1.135497,0.357986,0.247705
rouge_f1_w2,1.100779,0.37792,0.260021
BLEURT,0.940529,0.469932,0.329338


In [103]:
X= scores
y= data['z-score']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13)

params = {'n_estimators': 120,
          'max_depth': 6,
          'min_samples_split': 5,
          'learning_rate': 0.01}

In [104]:
model = ensemble.GradientBoostingRegressor(**params)
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=6, min_samples_split=5,
                          n_estimators=120)

In [105]:
mse = mean_squared_error(y_train, model.predict(X_train))
print("The mean squared error (MSE) on train set: {:.4f}".format(mse))
mse = mean_squared_error(y_test, model.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on train set: 0.5194
The mean squared error (MSE) on test set: 0.5888


In [29]:
y_predict = pd.Series(reg.predict(X_test),index = y_test.index,name = 'Predict_score')

In [31]:
merged_score =pd.merge(data['z-score'],y_predict,left_index= True,right_index= True)

In [32]:
print('mse',mean_squared_error(merged_score['z-score'],merged_score['Predict_score']))
print('pcorr',merged_score['z-score'].corr(merged_score['Predict_score'],method='pearson'))
print('kendall',merged_score['z-score'].corr(merged_score['Predict_score'],method='kendall'))

mse 0.6510223507551757
pcorr 0.5068492287785946
kendall 0.3629815342755539


In [None]:
def final_results(scores,z_score)
    mse = np.array([])
    pcorr = np.array([])
    kendallcorr = np.array([])
    metrics = scores.columns
    for col in scores.columns:
        mse = np.append(mse,mean_squared_error(z_score,scores[col]))
        pcorr = np.append(pcorr,z_score.corr(scores[col],method='pearson'))
        kendallcorr = np.append(kendallcorr,z_score.corr(scores[col],method='kendall'))
    pd.DataFrame({'mse':mse,'pcorr':pcorr,'kendallcorr':kendallcorr},index = metrics)

In [82]:
def kfold_CV_pipe(x_train, y_train, model, n_splits = 5):
    all_scores = []
    # set up k-fold
    kf = KFold(n_splits=5)
    oof_prediction = np.zeros((len(scores),))
    test_preds = 0
    for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):
        # set up the splitted data
        train       , val        = scores.loc[trn_idx,:], scores.loc[val_idx,:]
        train_target, val_target = data['z-score'][trn_idx], data['z-score'][val_idx]      
        #print(train,val)
        # encode     
        # model fitting
        model.fit(train, train_target)
        # get predicted values for oof data and whole test set
        temp_oof = model.predict(val)
        # get predicted values for whole data set aggregate from each fold iter
        oof_prediction[val_idx] = temp_oof    
        fold_score = pd.Series(val_target).corr(pd.Series(temp_oof,index = val_idx),method='pearson')
        print(fold_score)
        all_scores.append(fold_score)               
    return  oof_prediction, all_scores

In [106]:
model = ensemble.GradientBoostingRegressor(**params)
oof_prediction, all_scores = kfold_CV_pipe(scores,data['z-score'],model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for  _, (trn_idx, val_idx) in tqdm(enumerate(kf.split(scores, data['z-score']))):


0it [00:00, ?it/s]

0.5151114418970998
0.45190028203146526
0.5390717229564809
0.5361070573926131
0.5445100717320295


In [108]:
data['z-score'].corr(pd.Series(oof_prediction,index = data.index),method='pearson')

0.5173917437819486