In [4]:
from dotenv import find_dotenv
import os
import sys

# Add the directory containing the located .env file to the import path,
# presumably so the project-local `src.*` packages imported below resolve.
dotenv_path = find_dotenv()
sys.path.append(os.path.dirname(dotenv_path))

In [5]:
import src.models.baseline_model as baseline
import src.data.preprocess_dataset as preprocess
import src.data.metrics as metrics

In [1]:
import pandas as pd

In [7]:
# Load the tab-separated dataset, using its first column as the row index.
# NOTE(review): this path is relative to the working directory and has no `../`
# prefix, unlike the `../data/interim/...` paths used by later cells — confirm
# which directory the notebook is meant to be run from.
raw_path = "data/raw/filtered_paranmt/filtered.tsv"
df = pd.read_csv(raw_path, sep="\t", index_col=0)
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [8]:
import nltk

# Fetch the WordNet corpus into the local nltk_data directory; when the package
# is already present NLTK just reports it as up-to-date.
# NOTE(review): presumably needed by the baseline model and/or the METEOR metric
# used later — confirm which one.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Run the project's preprocessing over the raw frame. The displayed result has
# 5000 rows and two extra text columns, `t1` and `t2`.
# NOTE(review): `df` is overwritten here, so re-running this cell preprocesses
# the already-preprocessed frame — re-run the loading cell first.
df = preprocess.dataframe_preprocess(df, df_max_len=5000, semantic=False)
print(df.shape[0])
df.head()

5000


Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,t1,t2
119572,And you think Grandpa is gonna protect us from...,you think your grandpa will protect us from Eric?,0.743803,0.358974,0.994886,0.00022,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric
561815,"Might I add, very clever assholes!","I can deliver, very clever blunders!",0.605257,0.054054,0.997472,0.000586,might i add very clever assholes,i can deliver very clever blunders
221427,I hate dickheads.,I hate bees.,0.692225,0.277778,0.998042,0.006316,i hate dickheads,i hate bees
1189,"Jason, put down that stupid camera and come he...","Jason, put the camera down and help me!",0.828745,0.245283,0.999627,9.9e-05,jason put down that stupid camera and come hel...,jason put the camera down and help me
451132,what a scumbag!,What a punk!,0.890838,0.1875,0.005802,0.999683,what a scumbag,what a punk


In [10]:
# Run the baseline model over the preprocessed frame; the displayed result
# carries a `baseline_pred` column next to the existing columns.
new_df = baseline.predict(df)
new_df.head()

100%|██████████| 5000/5000 [02:08<00:00, 38.93it/s]


Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,t1,t2,baseline_pred
119572,And you think Grandpa is gonna protect us from...,you think your grandpa will protect us from Eric?,0.743803,0.358974,0.994886,0.00022,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric,And you think Grandpa is gonna protect us from...
561815,"Might I add, very clever assholes!","I can deliver, very clever blunders!",0.605257,0.054054,0.997472,0.000586,might i add very clever assholes,i can deliver very clever blunders,"Might I add, very clever son of a kick!"
221427,I hate dickheads.,I hate bees.,0.692225,0.277778,0.998042,0.006316,i hate dickheads,i hate bees,I hate dickheads.
1189,"Jason, put down that stupid camera and come he...","Jason, put the camera down and help me!",0.828745,0.245283,0.999627,9.9e-05,jason put down that stupid camera and come hel...,jason put the camera down and help me,"Jason, put down that poor fish camera and come..."
451132,what a scumbag!,What a punk!,0.890838,0.1875,0.005802,0.999683,what a scumbag,what a punk,what a scumbag!


In [12]:
# Keep the row labels of the predictions so that every per-metric score table
# built below can be indexed by the same labels.
indexes = new_df.index.tolist()
indexes[:10]  # peek at the first ten labels

[119572, 561815, 221427, 1189, 451132, 170678, 366368, 508872, 477920, 249604]

In [11]:
# Cosine-similarity score between each source sentence and its baseline output.
references = new_df["reference"].tolist()
baseline_preds = new_df["baseline_pred"].tolist()
semantic_res = metrics.cosine_similarity_score(references, baseline_preds)

100%|██████████| 5000/5000 [00:01<00:00, 2989.94it/s]


In [13]:
# Wrap the first element of the metric result (the per-row scores shown below)
# in a frame indexed by the prediction row labels.
semantic_scores = semantic_res[0]
semantic_res_df = pd.DataFrame({"scores": semantic_scores}, index=indexes)
semantic_res_df.head()

Unnamed: 0,scores
119572,0.992307
561815,0.824182
221427,1.0
1189,0.976272
451132,1.0


In [15]:
# Second element of the metric result, a single float.
# NOTE(review): looks like the corpus-level aggregate of the per-row scores —
# confirm against metrics.cosine_similarity_score.
semantic_res[1]

0.8923816736940294

In [16]:
# Persist the per-row semantic scores (row labels are written as the first
# CSV column) so they can be reloaded in the merge section.
semantic_res_df.to_csv("../data/interim/baseline_semantic_rating.csv")

In [18]:
# BLEU score of each baseline output against its source sentence.
# (`blue_score` is the helper's actual name in the metrics module.)
references = new_df["reference"].tolist()
baseline_preds = new_df["baseline_pred"].tolist()
bleu_res = metrics.blue_score(references, baseline_preds)

In [19]:
# Second element of the BLEU result, a single float.
# NOTE(review): looks like the corpus-level aggregate — confirm in metrics.blue_score.
bleu_res[1]

0.711539723196702

In [21]:
# Per-row BLEU scores, indexed by the prediction row labels.
bleu_scores = bleu_res[0]
bleu_res_df = pd.DataFrame({"scores": bleu_scores}, index=indexes)
bleu_res_df.head()

Unnamed: 0,scores
119572,0.815355
561815,0.479878
221427,1.0
1189,0.670342
451132,1.0


In [23]:
# Persist the per-row BLEU scores for the merge section.
bleu_res_df.to_csv("../data/interim/baseline_bleu_rating.csv")

In [25]:
# METEOR score of each baseline output against its source sentence.
references = new_df["reference"].tolist()
baseline_preds = new_df["baseline_pred"].tolist()
meteor_res = metrics.meteor_score(references, baseline_preds)

In [27]:
# Second element of the METEOR result, a single float.
# NOTE(review): looks like the corpus-level aggregate — confirm in metrics.meteor_score.
meteor_res[1]

0.9181080441098082

In [28]:
# Per-row METEOR scores, indexed by the prediction row labels.
meteor_scores = meteor_res[0]
meteor_res_df = pd.DataFrame({"scores": meteor_scores}, index=indexes)

In [29]:
# Preview the first rows of the METEOR score table.
meteor_res_df.head()

Unnamed: 0,scores
119572,0.936389
561815,0.833538
221427,0.992188
1189,0.906359
451132,0.992188


In [30]:
# Persist the per-row METEOR scores for the merge section.
meteor_res_df.to_csv("../data/interim/baseline_meteor_rating.csv")

In [None]:
# Score the toxicity of every baseline prediction.
# BLEU (`metrics.blue_score`) compares predictions against a reference list, so
# it cannot produce a toxicity rating from the predictions alone; a dedicated
# toxicity metric is needed here.
# NOTE(review): assumes src.data.metrics exposes `toxicity_score(texts)` that
# returns (per-row scores, aggregate) like the other metric helpers — confirm
# the helper's actual name and signature before running.
toxicity_res = metrics.toxicity_score(new_df["baseline_pred"].tolist())

In [None]:
# Per-row toxicity scores, indexed by the prediction row labels.
toxicity_scores = toxicity_res[0]
toxicity_res_df = pd.DataFrame({"scores": toxicity_scores}, index=indexes)
toxicity_res_df.head()

In [None]:
# Persist the per-row toxicity scores for the merge section.
toxicity_res_df.to_csv("../data/interim/baseline_toxicity_rating.csv")

# Merge

Merge the results from the separate dataframes into one. It was convenient to calculate and save each score separately, but for further work it is easier to have all of them in a single table.


In [33]:
# Reload the four per-metric score tables written above, using the first CSV
# column (the original row labels) as the index.
# Order: semantic, BLEU, METEOR, toxicity.
rating_paths = (
    "../data/interim/baseline_semantic_rating.csv",
    "../data/interim/baseline_bleu_rating.csv",
    "../data/interim/baseline_meteor_rating.csv",
    "../data/interim/baseline_toxicity_rating.csv",
)
df1, df2, df3, df4 = (pd.read_csv(path, index_col=0) for path in rating_paths)

In [34]:
# Give each table's generic "scores" column a metric-specific name so the
# columns stay distinguishable once the tables are merged.
metric_names = ("semantic", "bleu", "meteor", "toxicity")
df1, df2, df3, df4 = (
    frame.rename(columns={"scores": metric})
    for frame, metric in zip((df1, df2, df3, df4), metric_names)
)

In [35]:
# Check that the semantic table now carries the renamed `semantic` column.
df1.head()

Unnamed: 0,semantic
119572,0.992307
561815,0.824182
221427,1.0
1189,0.976272
451132,1.0


In [36]:
# Join the semantic and BLEU tables on their shared row labels
# (default inner join: only labels present in both tables are kept).
merged = df1.merge(df2, left_index=True, right_index=True)
merged.head()

Unnamed: 0,semantic,bleu
119572,0.992307,0.815355
561815,0.824182,0.479878
221427,1.0,1.0
1189,0.976272,0.670342
451132,1.0,1.0


In [37]:
# Add the METEOR column, again matching rows by index label.
merged = merged.merge(df3, left_index=True, right_index=True)
merged.head()

Unnamed: 0,semantic,bleu,meteor
119572,0.992307,0.815355,0.936389
561815,0.824182,0.479878,0.833538
221427,1.0,1.0,0.992188
1189,0.976272,0.670342,0.906359
451132,1.0,1.0,0.992188


In [38]:
# Add the toxicity column, again matching rows by index label.
merged = merged.merge(df4, left_index=True, right_index=True)
merged.head()

Unnamed: 0,semantic,bleu,meteor,toxicity
119572,0.992307,0.815355,0.936389,0.018808
561815,0.824182,0.479878,0.833538,0.074056
221427,1.0,1.0,0.992188,0.995659
1189,0.976272,0.670342,0.906359,0.004286
451132,1.0,1.0,0.992188,0.971446


Moreover, we may want to filter out redundant rows with a very high semantic score, i.e. rows where the baseline changed (almost) nothing, so any toxicity in the original sentence is still there:


In [39]:
# Keep only rows whose semantic score is more than 0.01 below a perfect 1.0,
# i.e. drop predictions that are (almost) identical to their source sentence.
is_changed = (1.0 - merged["semantic"]) > 0.01
merged = merged[is_changed]
merged

Unnamed: 0,semantic,bleu,meteor,toxicity
561815,0.824182,0.479878,0.833538,0.074056
1189,0.976272,0.670342,0.906359,0.004286
170678,0.944853,0.525382,0.872253,0.001494
508872,0.966343,0.513345,0.854119,0.000847
249604,0.949799,0.339325,0.810179,0.156638
...,...,...,...,...
113832,0.774368,0.293946,0.806667,0.000892
80714,0.865834,0.516973,0.820015,0.327824
151117,0.000000,1.000000,0.997685,0.956547
163850,0.946303,0.216400,0.770548,0.000862


In [40]:
# Persist the filtered, merged score table (one column per metric).
merged.to_csv("../data/interim/baseline_merged_rating.csv")