In [1]:
from dotenv import load_dotenv, find_dotenv
import os
import sys

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables
root = find_dotenv()
load_dotenv()
sys.path.append(os.path.dirname(find_dotenv()))

from src.data.preprocess_dataset import dataframe_preprocess

In [2]:
import pandas as pd

# Test Dataset


In [3]:
df = pd.read_csv("../data/raw/filtered_paranmt/filtered.tsv", sep="\t", index_col=0)
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [4]:
preprocessed_test = dataframe_preprocess(df, df_max_len=3000)
preprocessed_test.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,t1,t2
119572,And you think Grandpa is gonna protect us from...,you think your grandpa will protect us from Eric?,0.743803,0.358974,0.994886,0.00022,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric
561815,"Might I add, very clever assholes!","I can deliver, very clever blunders!",0.605257,0.054054,0.997472,0.000586,might i add very clever assholes,i can deliver very clever blunders
221427,I hate dickheads.,I hate bees.,0.692225,0.277778,0.998042,0.006316,i hate dickheads,i hate bees
1189,"Jason, put down that stupid camera and come he...","Jason, put the camera down and help me!",0.828745,0.245283,0.999627,9.9e-05,jason put down that stupid camera and come hel...,jason put the camera down and help me
451132,what a scumbag!,What a punk!,0.890838,0.1875,0.005802,0.999683,what a scumbag,what a punk


In [5]:
test_df = preprocessed_test[["t1", "t2"]]
test_df = test_df.rename(columns={"t1": "source", "t2": "target"})
test_df = test_df.reset_index(drop=True)
test_df.head()

Unnamed: 0,source,target
0,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric
1,might i add very clever assholes,i can deliver very clever blunders
2,i hate dickheads,i hate bees
3,jason put down that stupid camera and come hel...,jason put the camera down and help me
4,what a scumbag,what a punk


In [6]:
test_df.to_csv("../data/interim/dataset/test.csv")

# Train/Val Dataset


In [7]:
df = pd.read_csv("../data/raw/filtered_paranmt/filtered.tsv", sep="\t", index_col=0)
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [8]:
preprocessed_train = dataframe_preprocess(df, random_state=73, df_max_len=10000)
preprocessed_train.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,t1,t2
183510,"Let's say 6'1"", dark hair, medium build, looks...","let's say 185, dark hair, medium build, looks ...",0.809671,0.269231,0.991634,0.000812,let s say dark hair medium build looks like a ...,let s say dark hair medium build looks like zero
85903,No! Shut up!,he didn't shut up!,0.85077,0.315789,0.999533,0.004802,no shut up,he didn t shut up
539749,"I was supposed to protect them, not drag them ...","I should have protected them, not carry them i...",0.637433,0.056338,0.998318,4.2e-05,i was supposed to protect them not drag them i...,i should have protected them not carry them in...
503273,"You think I didn't see you fucking coming, huh?",do you think I didn't see you coming?!,0.797582,0.1875,0.99687,0.000151,you think i didn t see you fucking coming huh,do you think i didn t see you coming
12488,Nobody can fucking eat those.,he can't eat them.,0.708611,0.366667,0.992971,0.000241,nobody can fucking eat those,he can t eat them


In [9]:
train_df = preprocessed_train[["t1", "t2"]]
train_df = train_df.rename(columns={"t1": "source", "t2": "target"})
train_df = train_df.reset_index(drop=True)
train_df.head()

Unnamed: 0,source,target
0,let s say dark hair medium build looks like a ...,let s say dark hair medium build looks like zero
1,no shut up,he didn t shut up
2,i was supposed to protect them not drag them i...,i should have protected them not carry them in...
3,you think i didn t see you fucking coming huh,do you think i didn t see you coming
4,nobody can fucking eat those,he can t eat them


In [10]:
val_df = train_df.sample(frac=0.1, random_state=73)
train_df = train_df.drop(val_df.index)

In [11]:
train_df.to_csv("../data/interim/dataset/train.csv")
val_df.to_csv("../data/interim/dataset/validation.csv")