In [399]:
import pandas as pd
import sklearn.metrics as skmetrics
import krippendorff as kd
from sklearn.feature_extraction.text import CountVectorizer

# train set
# bi grams - on target and on src -> 2 and 3 grams

https://practicaldatascience.co.uk/machine-learning/how-to-use-count-vectorization-for-n-gram-analysis

In [400]:
def get_ngrams(text, ngram_from=2, ngram_to=2, n=None, max_features=20000):
    vec = CountVectorizer(ngram_range=(ngram_from, ngram_to),
                          max_features=max_features,
                          stop_words='english'
).fit(text)
    print(vec)
    bag_of_words = vec.transform(text)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, i]) for word, i in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    return words_freq[:n]

In [401]:
df_train = pd.read_csv(r"Evaluation_Data_And_Annotation\train_data.csv")  # need to change train_data to location
df_train

Unnamed: 0,id,source,target
0,335,Stop acting like you know everything.,"Your knowledge is appreciated, but let's ensur..."
1,239,I'm not interested in hanging out with you in ...,"I appreciate the invite, but I am completely b..."
2,104,I couldn't be happier to decline this invite.,Darn! not able to fit it in.
3,111,I don't even understand what your'e asking,Could you just clarify your question for me?
4,341,Stop being a pushover. Stand up for yourself.,Advocating for oneself is an important skill i...
...,...,...,...
503,71,He works for me.,We work together.
504,106,I deserve a raise.,Given my contributions to the company's succes...
505,273,Leave me alone and let me do my job.,Some space to complete my work that would be v...
506,439,This is urgent.,Please prioritize this as soon as possible.


In [427]:
# get top 20 2-grams for train set - source
bigrams_source = get_ngrams(df_train['source'], ngram_from=2, ngram_to=2, n=20)
bigrams_df_train_source = pd.DataFrame(bigrams_source)
bigrams_df_train_source.columns = ["Bigram", "Frequency"]
bigrams_df_train_source.to_csv(r"Evaluation_Data_And_Annotation/ngrams_results/train_set_source_2bigrams.csv")
bigrams_df_train_source

CountVectorizer(max_features=20000, ngram_range=(2, 2), stop_words='english')


Unnamed: 0,Bigram,Frequency
0,don want,8
1,sounds like,7
2,stop trying,5
3,wasting time,5
4,waste time,5
5,don understand,4
6,time ll,4
7,don know,4
8,don need,4
9,stop acting,3


In [426]:
# get top 20 3-grams for train set - source
three_grams_source = get_ngrams(df_train['source'], ngram_from=3, ngram_to=3, n=20)
three_grams_df_train_source = pd.DataFrame(three_grams_source)
three_grams_df_train_source.columns = ["Bigram", "Frequency"]
three_grams_df_train_source.to_csv(r"Evaluation_Data_And_Annotation/ngrams_results/train_set_source_3bigrams.csv")
three_grams_df_train_source

CountVectorizer(max_features=20000, ngram_range=(3, 3), stop_words='english')


Unnamed: 0,Bigram,Frequency
0,stop acting like,3
1,answer questions asked,2
2,questions asked just,2
3,really don want,2
4,doesn make sense,2
5,acting like know,1
6,interested hanging free,1
7,hanging free time,1
8,couldn happier decline,1
9,happier decline invite,1


In [428]:
# get top 20 2-grams for train set - target
bigrams_target = get_ngrams(df_train['target'], ngram_from=2, ngram_to=2, n=20)
bigrams_df_test_target = pd.DataFrame(bigrams_target)
bigrams_df_test_target.columns = ["Bigram", "Frequency"]
bigrams_df_test_target.to_csv(r"Evaluation_Data_And_Annotation/ngrams_results/train_set_target_2bigrams.csv")
bigrams_df_test_target

CountVectorizer(max_features=20000, ngram_range=(2, 2), stop_words='english')


Unnamed: 0,Bigram,Frequency
0,let know,14
1,let ensure,9
2,work related,8
3,shared spaces,8
4,keeping personal,6
5,personal items,6
6,make sure,6
7,working hours,6
8,helps maintain,5
9,maintaining professional,5


In [429]:
# get top 20 3-grams for train set - target
three_grams_target = get_ngrams(df_train['target'], ngram_from=3, ngram_to=3, n=20)
three_grams_df_test_target = pd.DataFrame(three_grams_target)
three_grams_df_test_target.columns = ["Bigram", "Frequency"]
three_grams_df_test_target.to_csv(r"Evaluation_Data_And_Annotation/ngrams_results/train_set_target_3bigrams.csv")
three_grams_df_test_target

CountVectorizer(max_features=20000, ngram_range=(3, 3), stop_words='english')


Unnamed: 0,Bigram,Frequency
0,focusing work related,3
1,sorry committed hope,2
2,committed hope understand,2
3,order successfully complete,2
4,sharing personal experiences,2
5,maintaining focus tasks,2
6,respect time helps,2
7,pleasant shared space,2
8,help maintain professional,2
9,maintain professional focus,2


# human annotation

In [407]:
df_r1 = pd.read_csv(r"Corpify_comparing_sents_Maayan.csv")  # rater no.1
df_r2 = pd.read_csv(r"Corpify_comparing_sents_Nitzan.csv")  # rater no.2
remove_cols = ["Corpy_detection_target", "Corpy_detection_t5detox", "Corpy_detection_t5formal",
               "Corpy_detection_t5large", "Corpy_detection_flanlarge"]
df_r1 = df_r1.drop(columns=remove_cols)
df_r2 = df_r2.drop(columns=remove_cols)

In [408]:
df_r1.describe(include="all")

Unnamed: 0,src,target,pred_t5detox,pred_t5formal,pred_t5large,pred_flanlarge,Fluency_target,Fluency_t5detox,Fluency_t5formal,Fluency_t5large,...,content_intention_target,content_intention_t5detox,content_intention_t5formal,content_intention_t5large,content_intention_flanlarge,essential_details_target,essential_details_t5detox,essential_details_t5formal,essential_details_t5large,essential_details_flanlarge
count,64,64,63,64,64,64,64.0,64.0,64.0,64.0,...,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
unique,64,64,63,61,64,52,,,,,...,,,,,,,,,,
top,I don't get paid enough to deal with this buls...,This falls outside my responsibilities but I w...,I'm not able to get paid enough to handle this.,I'm not sure I understand your question.,I'm not able to take this on as I'm not able t...,I'm not sure I'm the right person for that.,,,,,...,,,,,,,,,,
freq,1,1,1,2,1,9,,,,,...,,,,,,,,,,
mean,,,,,,,2.0,1.8125,1.859375,1.78125,...,1.96875,1.125,1.171875,1.21875,0.890625,1.921875,1.046875,1.25,1.1875,0.9375
std,,,,,,,0.0,0.530798,0.499752,0.57649,...,0.175368,0.934353,0.882901,0.933822,0.961434,0.323899,0.91599,0.872872,0.940702,0.940702
min,,,,,,,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
50%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,1.5,1.0,2.0,0.0,2.0,1.0,2.0,2.0,1.0
75%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [409]:
df_r2.describe(include='all')

Unnamed: 0,src,target,pred_t5detox,pred_t5formal,pred_t5large,pred_flanlarge,Fluency_target,Fluency_t5detox,Fluency_t5formal,Fluency_t5large,...,content_intention_target,content_intention_t5detox,content_intention_t5formal,content_intention_t5large,content_intention_flanlarge,essential_details_target,essential_details_t5detox,essential_details_t5formal,essential_details_t5large,essential_details_flanlarge
count,64,64,63,64,64,64,64.0,64.0,64.0,64.0,...,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
unique,64,64,63,61,64,52,,,,,...,,,,,,,,,,
top,I don't get paid enough to deal with this buls...,This falls outside my responsibilities but I w...,I'm not able to get paid enough to handle this.,I'm not sure I understand your question.,I'm not able to take this on as I'm not able t...,I'm not sure I'm the right person for that.,,,,,...,,,,,,,,,,
freq,1,1,1,2,1,9,,,,,...,,,,,,,,,,
mean,,,,,,,2.0,1.78125,1.875,1.75,...,1.984375,1.125,1.375,1.390625,1.03125,1.71875,1.03125,1.140625,1.4375,0.875
std,,,,,,,0.0,0.57649,0.454257,0.563436,...,0.125,0.863731,0.82616,0.847352,0.925284,0.548266,0.815889,0.833185,0.7533,0.863731
min,,,,,,,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0
50%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0
75%,,,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


# human evaluation -
### agreement scores

In [410]:
# make sure both annotators have exact same columns and same line lengths
print(df_r1.columns.tolist() == df_r2.columns.tolist())
for col in df_r1.columns.tolist():
    if len(df_r1[col]) != len(df_r2[col]):
        print(f"column {col} is not same length in both dfs")

True


In [411]:
# get only columns for measuring agreement
data_no_eval_cols = ['src', 'target', "pred_t5detox", "pred_t5formal", "pred_t5large", "pred_flanlarge"]
all_col_names = df_r1.columns.tolist()
only_calc_col_names = [x for x in all_col_names if x not in data_no_eval_cols]
print(only_calc_col_names)

['Fluency_target', 'Fluency_t5detox', 'Fluency_t5formal', 'Fluency_t5large', 'Fluency_flanlarge', 'Corpy_level_target', 'Corpy_level_t5detox', 'Corpy_level_t5formal', 'Corpy_level_t5large', 'Corpy_level_flanlarge', 'content_intention_target', 'content_intention_t5detox', 'content_intention_t5formal', 'content_intention_t5large', 'content_intention_flanlarge', 'essential_details_target', 'essential_details_t5detox', 'essential_details_t5formal', 'essential_details_t5large', 'essential_details_flanlarge']


In [413]:
def get_percentage_agreement_score(df_calc: pd.DataFrame, rater1_col: pd.DataFrame, rater2_col: pd.DataFrame) -> float:
    df_calc['percentage_agreement_score'] = rater1_col == rater2_col
    # number of agreements / total number of evaluations
    return df_calc['percentage_agreement_score'].sum() / len(df_calc['percentage_agreement_score'])


def get_cohen_kappar_score(rater1_col: pd.DataFrame, rater2_col: pd.DataFrame, ) -> float:
    r1_list = rater1_col.to_list()
    r2_list = rater2_col.to_list()
    return 1 if r1_list == r2_list else skmetrics.cohen_kappa_score(r1_list, r2_list)


def get_krippendorff_score(rater1_col: pd.DataFrame, rater2_col: pd.DataFrame, level_of_measurement) -> float:
    if rater1_col.to_list() == rater2_col.to_list():
        return 1
    return kd.alpha([rater1_col, rater2_col], level_of_measurement=level_of_measurement)


def get_dist_score(df_calc: pd.DataFrame, new_col: str, rater1_col: pd.DataFrame, rater2_col: pd.DataFrame) -> float:
    df_calc[new_col] = abs(rater1_col - rater2_col)
    return df_calc[new_col].mean()


def calc_min_max_normalize(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    # min - max normalize distance
    return (df[col_name] - df[col_name].min()) / (
            df[col_name].max() - df[col_name].min())
    # return df[normalized_col_name]


def get_raters_mean(rater1_col: pd.DataFrame, rater2_col: pd.DataFrame) -> float:
    df_calc['mean_score'] = (rater1_col + rater2_col) / 2
    return df_calc['mean_score'].mean()


def get_raters_mean_std(rater1_col: pd.DataFrame, rater2_col: pd.DataFrame) -> float:
    df_calc['mean_score'] = (rater1_col + rater2_col) / 2
    return df_calc['mean_score'].std()

In [417]:
# create new df with results of annotator agreement
res_agreement = {"test": ["percentage_of_agreement",
                          "cohen_kappa_score",
                          "krippendorff_score_ordinal",
                          "dist",
                          "dist_normalized",
                          "dist_normalized_var",
                          "dist_normalized_std",
                          "mean",
                          "mean_std"]}

df_calc = pd.DataFrame()  # helper df for calculations

for col_name in only_calc_col_names:
    percentage_agreement_score = get_percentage_agreement_score(df_calc, df_r1[col_name], df_r2[col_name])

    # calculate Cohen's Kappa (handle edge case when arrays are equal CK score should be 1)
    ck_score = get_cohen_kappar_score(df_r1[col_name], df_r2[col_name])

    # get krippendorff scores
    krippendorff_score_ordinal = get_krippendorff_score(df_r1[col_name], df_r2[col_name], 'ordinal')

    # distance between raters
    dist = get_dist_score(df_calc, "dist", df_r1[col_name], df_r2[col_name])

    # get normalized distance score
    # 1. min-max normalize each rater's scores :
    df_r1[f"{col_name}_normalized"] = calc_min_max_normalize(df_r1, col_name)
    df_r2[f"{col_name}_normalized"] = calc_min_max_normalize(df_r2, col_name)
    # 2. get normalized dist
    norm_dist = get_dist_score(df_calc, "norm_dist", df_r1[f"{col_name}_normalized"], df_r2[f"{col_name}_normalized"])
    # variance of normalized distance
    dist_var = df_calc["norm_dist"].var()
    # std of normalized distance
    dist_std = df_calc["norm_dist"].std()

    # get mean of both raters and then get
    mean_score = get_raters_mean(df_r1[col_name], df_r2[col_name])
    mean_std_score = get_raters_mean_std(df_r1[col_name], df_r2[col_name])

    # add all to res dict
    res_agreement[col_name] = [percentage_agreement_score,
                               ck_score,
                               krippendorff_score_ordinal,
                               dist,
                               norm_dist,
                               dist_var,
                               dist_std,
                               mean_score,
                               mean_std_score]

res_df = pd.DataFrame(data=res_agreement)
res_df

Unnamed: 0,test,Fluency_target,Fluency_t5detox,Fluency_t5formal,Fluency_t5large,Fluency_flanlarge,Corpy_level_target,Corpy_level_t5detox,Corpy_level_t5formal,Corpy_level_t5large,...,content_intention_target,content_intention_t5detox,content_intention_t5formal,content_intention_t5large,content_intention_flanlarge,essential_details_target,essential_details_t5detox,essential_details_t5formal,essential_details_t5large,essential_details_flanlarge
0,percentage_of_agreement,1.0,0.921875,0.9375,0.890625,0.921875,0.9375,0.546875,0.578125,0.515625,...,0.953125,0.53125,0.640625,0.75,0.765625,0.78125,0.59375,0.640625,0.671875,0.71875
1,cohen_kappa_score,1.0,0.673469,0.574043,0.621622,0.46932,0.308108,0.346709,0.429138,0.371555,...,-0.021277,0.259259,0.40693,0.549296,0.609756,0.197851,0.392257,0.439452,0.44918,0.558282
2,krippendorff_score_ordinal,1.0,0.806287,0.599294,0.73602,0.568891,0.300517,0.676956,0.769589,0.678615,...,-0.016,0.365425,0.638775,0.682651,0.763135,0.196987,0.631924,0.618098,0.625769,0.780767
3,dist,0.0,0.09375,0.078125,0.125,0.109375,0.09375,0.703125,0.625,0.734375,...,0.046875,0.65625,0.421875,0.328125,0.296875,0.265625,0.453125,0.421875,0.40625,0.3125
4,dist_normalized,,0.046875,0.039062,0.0625,0.054688,0.052083,0.175781,0.15625,0.183594,...,0.046875,0.328125,0.210938,0.164062,0.148438,0.132812,0.226562,0.210938,0.203125,0.15625
5,dist_normalized_var,,0.029514,0.026228,0.035714,0.040613,0.039572,0.056904,0.052579,0.060004,...,0.045387,0.15253,0.093688,0.095672,0.084759,0.073351,0.086744,0.093688,0.100942,0.070437
6,dist_normalized_std,,0.171796,0.16195,0.188982,0.201526,0.198928,0.238546,0.229302,0.244958,...,0.213042,0.390551,0.306085,0.309309,0.291135,0.270833,0.294523,0.306085,0.317714,0.265399
7,mean,2.0,1.796875,1.867188,1.765625,1.882812,1.9375,0.757812,0.671875,0.414062,...,1.976562,1.125,1.273438,1.304688,0.960938,1.820312,1.039062,1.195312,1.3125,0.90625
8,mean_std,0.0,0.524924,0.447588,0.53429,0.375248,0.243975,1.394411,1.348776,1.424153,...,0.106521,0.740013,0.776284,0.824319,0.887768,0.34923,0.783281,0.769545,0.774084,0.849253


In [418]:
res_df = res_df.T
res_df.columns = res_df.iloc[0]
res_df = res_df.drop(res_df.index[0])
res_df

test,percentage_of_agreement,cohen_kappa_score,krippendorff_score_ordinal,dist,dist_normalized,dist_normalized_var,dist_normalized_std,mean,mean_std
Fluency_target,1.0,1.0,1.0,0.0,,,,2.0,0.0
Fluency_t5detox,0.921875,0.673469,0.806287,0.09375,0.046875,0.029514,0.171796,1.796875,0.524924
Fluency_t5formal,0.9375,0.574043,0.599294,0.078125,0.039062,0.026228,0.16195,1.867188,0.447588
Fluency_t5large,0.890625,0.621622,0.73602,0.125,0.0625,0.035714,0.188982,1.765625,0.53429
Fluency_flanlarge,0.921875,0.46932,0.568891,0.109375,0.054688,0.040613,0.201526,1.882812,0.375248
Corpy_level_target,0.9375,0.308108,0.300517,0.09375,0.052083,0.039572,0.198928,1.9375,0.243975
Corpy_level_t5detox,0.546875,0.346709,0.676956,0.703125,0.175781,0.056904,0.238546,0.757812,1.394411
Corpy_level_t5formal,0.578125,0.429138,0.769589,0.625,0.15625,0.052579,0.229302,0.671875,1.348776
Corpy_level_t5large,0.515625,0.371555,0.678615,0.734375,0.183594,0.060004,0.244958,0.414062,1.424153
Corpy_level_flanlarge,0.359375,0.105048,0.256045,1.203125,0.300781,0.087658,0.296071,0.851562,1.060397


In [430]:
res_df.to_csv(r"Evaluation_Data_And_Annotation/annotators_agreement_and_mean_scores.csv")