In [18]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import multiprocessing
import jieba


In [19]:
# Load the datasets

df_gemini = pd.read_csv("results_data/experimental_design_results_gemini-2.0-flash-thinking-exp_reprocessed.csv")
df_gemini['model'] = 'G2FTE' # gemini-2.0-flash-thinking-exp

df_deepseek = pd.read_csv("results_data/experimental_design_results_deepseek-chat.csv")
df_deepseek['model'] = 'DSV3' # deepseek-chat v3

# Stack data frames
df = pd.concat([df_gemini, df_deepseek], ignore_index=True)

# Print
print(df)


                                              abstract  Repetition  \
0    文章首先阐述了工程教育专业认证与应用化学专业生产实习课程的关联，然后论述了工程教育专业认证背...           1   
1    文章首先阐述了工程教育专业认证与应用化学专业生产实习课程的关联，然后论述了工程教育专业认证背...           2   
2    文章首先阐述了工程教育专业认证与应用化学专业生产实习课程的关联，然后论述了工程教育专业认证背...           3   
3    “天然药物化学”是高等学校药学及相关专业的必修课程。课程章节内容多、理论性强,学生学习面临较...           1   
4    “天然药物化学”是高等学校药学及相关专业的必修课程。课程章节内容多、理论性强,学生学习面临较...           2   
..                                                 ...         ...   
529  <正>化学作为一门自然科学，是人们认识世界和改造世界的重要途径。在历史长河中，化学是经由无数...           2   
530  <正>化学作为一门自然科学，是人们认识世界和改造世界的重要途径。在历史长河中，化学是经由无数...           3   
531  绿色化学分析技术，即最大限度地减少或者避免有害化学品被应用于分析过程当中，从而实现环境保护与...           1   
532  绿色化学分析技术，即最大限度地减少或者避免有害化学品被应用于分析过程当中，从而实现环境保护与...           2   
533  绿色化学分析技术，即最大限度地减少或者避免有害化学品被应用于分析过程当中，从而实现环境保护与...           3   

                                                 ZH_EN  \
0    The article first elaborates on the connection...   
1    The article first elaborates on the re

In [20]:
# Tokenization function for Chinese texts using Jieba
def tokenize_chinese_jieba(text):
    # Use Jieba to cut the Chinese text into words
    return list(jieba.cut(text))

# Function to calculate BLEU (using unigram and bigram)
def calculate_bleu(candidate_tokens, reference_tokens):
    # We use weights (0.5, 0.5) for unigrams and bigrams
    try:
        return sentence_bleu([reference_tokens], candidate_tokens, weights=(0.5, 0.5, 0, 0))
    except Exception as e:
        return 0

# Function to calculate CHRF (character n-gram F-score)
def calculate_chrf(candidate_tokens, reference_tokens):
    candidate_str = "".join(candidate_tokens)
    reference_str = "".join(reference_tokens)
    # Calculates the ratio between the intersection and the union of the characters of the reference
    return len(set(candidate_str) & set(reference_str)) / len(set(reference_str)) if len(set(reference_str)) > 0 else 0

# Function to calculate TER using TF-IDF vectorization and mean squared error
def calculate_ter(candidate_text, reference_text):
    vectorizer = TfidfVectorizer() #token_pattern=r"(?u)\b\w+\b" - Removed token pattern because it is not needed for chinese
    try:
        tfidf_matrix = vectorizer.fit_transform([candidate_text, reference_text])
        return mean_squared_error(tfidf_matrix[0].toarray(), tfidf_matrix[1].toarray())
    except Exception as e:
        return 0

# Function to calculate Semantic Similarity (TF-IDF + cosine)
def calculate_semantic_similarity(original, translated):
    vectorizer = TfidfVectorizer() #token_pattern=r"(?u)\b\w+\b" - Removed token pattern because it is not needed for chinese
    try:
        tfidf_matrix = vectorizer.fit_transform([original, translated])
        return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    except Exception as e:
        return 0

# Apply metrics to each row of the DataFrame
def calculate_metrics_for_row(row):
    original_text = row['abstract']
    back_translation = row['EN_ZH']

    # Tokenize the texts using Jieba
    original_tokens = tokenize_chinese_jieba(original_text)
    translated_tokens = tokenize_chinese_jieba(back_translation)

    # Calculate the metrics:
    bleu_value = calculate_bleu(translated_tokens, original_tokens)
    chrf_value = calculate_chrf(translated_tokens, original_tokens)
    ter_value = calculate_ter("".join(translated_tokens), "".join(original_tokens))
    semantic_similarity = calculate_semantic_similarity(original_text, back_translation)

    return pd.Series([bleu_value, chrf_value, ter_value, semantic_similarity])

def calculate_metrics_for_df(df):
    # tqdm.pandas(desc="Calculating metrics") # Removed progress bar
    return df.apply(calculate_metrics_for_row, axis=1)

def apply_parallel(df, func, n_cores=multiprocessing.cpu_count()):
    df_split = np.array_split(df, n_cores)
    pool = multiprocessing.Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

df[['BLEU', 'CHRF', 'TER', 'Semantic Similarity']] = apply_parallel(df, calculate_metrics_for_df)

  return bound(*args, **kwds)


Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Loading model from cache /tmp/jieba.cache
Building prefix d

In [21]:
# Display the results:
print(df[['model', 'EN_ZH', 'BLEU', 'CHRF', 'TER', 'Semantic Similarity']])


     model                                              EN_ZH      BLEU  \
0    G2FTE  文章首先阐述了工程教育专业认证与应用化学专业生产实习课程之间的联系。接着，探讨了在工程教育专...  0.693889   
1    G2FTE  文章首先阐述了工程教育专业认证与应用化学专业生产实习课程之间的关系。其次，探讨了在工程教育专...  0.694830   
2    G2FTE  文章首先阐述工程教育专业认证与应用化学生产实习课程的关系。然后探讨工程教育专业认证背景下应用...  0.698251   
3    G2FTE  天然产物化学是高等院校药学及相关专业的必修课。课程内容章节丰富且理论性强，学生在学习中面临诸...  0.413904   
4    G2FTE  天然药物化学是高校药学及相关专业的必修课。该课程内容广泛且理论性强，对学生的学习造成挑战。因...  0.427470   
..     ...                                                ...       ...   
529   DSV3  化学作为一门自然科学，是认识和改造世界的重要途径。纵观历史，化学在无数次的实验探索中逐步发展...  0.520751   
530   DSV3  化学作为一门自然科学，是人类认识和改造世界的重要途径。纵观历史，化学在无数次的实验探索中逐步...  0.499128   
531   DSV3  绿色化学分析技术旨在最小化或避免在分析过程中使用有害化学品，从而实现环境保护和可持续发展。文...  0.518412   
532   DSV3  绿色化学分析技术旨在分析过程中尽量减少或避免使用有害化学品，从而实现环境保护和可持续发展。文...  0.528168   
533   DSV3  绿色化学分析技术旨在分析过程中尽量减少或避免使用有害化学品，从而实现环境保护和可持续发展。文...  0.527144   

         CHRF       TER  Semantic Similarity  
0    0.909091  0.141732             0.078745  
1    

In [22]:
# Global Descriptive Statistics
global_stats = df[['BLEU', 'CHRF', 'TER', 'Semantic Similarity']].describe()
print("\nGlobal Descriptive Statistics:")
print(global_stats.to_markdown())  # Output as markdown for journal

# Descriptive Statistics by Model
model_stats = df.groupby('model')[['BLEU', 'CHRF', 'TER', 'Semantic Similarity']].describe().transpose()

# Flatten the multi-level index for better readability in the table
model_stats.columns = [' '.join(col).strip() for col in model_stats.columns.values]

print("\nDescriptive Statistics by Model:")
print(model_stats.to_markdown()) # Output as markdown for journal


Global Descriptive Statistics:
|       |       BLEU |        CHRF |         TER |   Semantic Similarity |
|:------|-----------:|------------:|------------:|----------------------:|
| count | 534        | 534         | 534         |           534         |
| mean  |   0.601498 |   0.851231  |   0.0808073 |             0.121537  |
| std   |   0.108127 |   0.0650691 |   0.101409  |             0.110924  |
| min   |   0        |   0         |   0.013339  |             0         |
| 25%   |   0.532977 |   0.821892  |   0.0521626 |             0.0532538 |
| 50%   |   0.606116 |   0.856105  |   0.0688319 |             0.09194   |
| 75%   |   0.678872 |   0.888889  |   0.0867233 |             0.159001  |
| max   |   0.834775 |   0.96875   |   1         |             0.658562  |

Descriptive Statistics by Model:
|                                  |     D S V 3 |   G 2 F T E |
|:---------------------------------|------------:|------------:|
| ('BLEU', 'count')                | 267         | 267

In [24]:
df.to_csv("results_metrics/results_metrics.csv", index=False)
df.to_excel("results_metrics/results_metrics.xlsx", index=False)
