### For target style strength we use a FastText classifier trained on the same train-dev-test split
	- https://fasttext.cc/docs/en/supervised-tutorial.html
### For content preservation we measure BLEU score between generated and source sentences
	- https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
### To measure fluency we fine-tune GPT-2 on the target sentences using the same train-dev-test split, then use this model to measure the perplexity of the generated sentences
	- https://huggingface.co/spaces/evaluate-metric/perplexity
### We also compute GLEU score between target reference, generated sentence and source sentence
	- https://github.com/cnap/gec-ranking/tree/master/scripts

In [3]:
import fasttext
import pandas as pd
import os

# Root directory of the Yelp sentiment files.
BASE_PATH = '/Midgard/home/martinig/transformer-drg-style-transfer/data/yelp'

# Lookup of the raw files: data_paths[split][label] is the path
# '<BASE_PATH>/sentiment.<split>.<label>' for every split/label combination.
data_paths = {
	split: {label: f'{BASE_PATH}/sentiment.{split}.{label}' for label in ('0', '1')}
	for split in ('train', 'dev', 'test')
}

In [6]:
def prepare_data_for_fasttext(data_paths):
	"""Load the per-label sentence files and build one labelled frame per split.

	Args:
		data_paths: Mapping ``{split: {label: file_path}}``, for example
			``{'train': {'0': ..., '1': ...}, 'dev': {...}, 'test': {...}}``.
			Every file is read as UTF-8 text with one sentence per line.

	Returns:
		dict mapping each split name (in the order given by ``data_paths``) to a
		DataFrame with columns ``['label', 'text']``. ``label`` is the fastText
		tag ``'__label__<label>'``; rows are grouped by label in sorted label
		order and keep their per-file index (the index is not reset).
	"""
	def read_file(file_path):
		# Iterate over the file instead of read().splitlines(): splitlines() also
		# breaks on form feeds, U+2028 and similar characters, which would split
		# a single sentence into several rows. Only real newlines end a sample.
		with open(file_path, encoding='utf-8') as fp:
			return [line.rstrip('\n') for line in fp]

	def label_and_concat(paths_by_label):
		# Build a fresh frame per label (no in-place mutation of shared frames)
		# and put the label column first, as fastText expects.
		frames = [
			pd.DataFrame(read_file(paths_by_label[label]), columns=['text'])
			.assign(label=f'__label__{label}')[['label', 'text']]
			for label in sorted(paths_by_label)
		]
		return pd.concat(frames)

	return {split: label_and_concat(paths) for split, paths in data_paths.items()}

In [13]:
def save_data_for_fasttext(dfs):
	"""Write each split to ``<BASE_PATH>/fasttext/<phase>.txt`` for fastText.

	Every row becomes one line with its column values (converted with ``str``)
	joined by a tab, without header or index.

	The lines are written directly instead of through ``DataFrame.to_csv``:
	with its default minimal quoting, ``to_csv`` wraps any sentence containing
	a double quote or a tab in quotes and doubles the inner quotes, which
	would alter the text the classifier is trained and evaluated on.

	Args:
		dfs: Mapping of phase name to DataFrame, e.g. the result of
			``prepare_data_for_fasttext`` (columns ``['label', 'text']``).
	"""
	os.makedirs(os.path.join(BASE_PATH, 'fasttext'), exist_ok=True)
	for phase, df in dfs.items():
		with open(f'{BASE_PATH}/fasttext/{phase}.txt', 'w', encoding='utf-8') as fp:
			# name=None yields plain tuples; index=False leaves the index out.
			for row in df.itertuples(index=False, name=None):
				fp.write('\t'.join(map(str, row)) + '\n')

In [14]:
# Build the labelled frames for every split, then write them out as fastText input files.
dfs = prepare_data_for_fasttext(data_paths)
save_data_for_fasttext(dfs)

In [15]:
# Fit a supervised fastText classifier on the train split ...
model = fasttext.train_supervised(BASE_PATH + '/fasttext/train.txt')
# ... and score it on the held-out test split; only the precision is reported.
n_samples, precision, recall = model.test(BASE_PATH + '/fasttext/test.txt')
print('Precision: {}'.format(precision))

Read 4M words
Number of words:  9601
Number of labels: 2
Progress:  88.6% words/sec/thread: 1093642 lr:  0.011437 avg.loss:  0.125075 ETA:   0h 0m 0s

Precision: 0.964


Progress: 100.0% words/sec/thread: 1070172 lr:  0.000000 avg.loss:  0.123633 ETA:   0h 0m 0s


In [3]:
# Evaluate model output
import torch
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OpenAIGPTConfig

# https://huggingface.co/docs/transformers/v4.26.1/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.from_pretrained
# https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained

# The tokenizer, config.json and model weights are all read from this directory.
model_path = 'models/delete_and_generate'
# Special tokens that appear in the prompts fed to the model.
special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>']

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
	device = torch.device("cuda")
else:
	device = torch.device("cpu")

tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path)
config = OpenAIGPTConfig.from_json_file(model_path + "/config.json")
model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
# Last expression of the cell: moves the model and displays its architecture.
model.to(device)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


['<CON_START>', '<END>', '<NEG>', '<POS>', '<START>']


OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40483, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): BertLayerNorm()
      )
      (1): Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): BertLayerNorm()
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout

In [21]:
# Decoding demo: feed one prompt to the model and keep appending the most likely
# next token (beam_width is 1, so topk picks the single best token) until <END>
# is produced or the step cap is reached. The result is left in `torch_tensor`.
model.eval()
beam_width=1
input_text = "<POS> <CON_START> it is not terrible , but it is very good . <START>"

sm = torch.nn.Softmax(dim=-1) # Softmax over the last dimension of the model output
tokens = tokenizer.tokenize(input_text) # Tokenize the input text
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens) # Convert tokens to ids
index_tokens = [indexed_tokens for i in range(beam_width)] # Replicate the input ids once per beam
torch_tensor = torch.tensor(index_tokens).to(device)

count=0
while count < model.config.n_positions:
	with torch.no_grad():
		# Turn the model output into a probability distribution over the vocabulary.
		preds = sm(model(torch_tensor)) # presumably [beam_width, seq_len, vocab_size] — TODO confirm
	# Pick the top `beam_width` candidates at the last position and append them to the running sequence.
	top_v, top_i = preds[:,-1,:].topk(beam_width) # top_v: top values, top_i: their vocabulary indices
	# NOTE(review): looks like this only works as intended for beam_width == 1 —
	# with a larger width this appends beam_width columns per step and the
	# .item() call below would receive more than one element. Confirm before raising beam_width.
	torch_tensor = torch.cat((torch_tensor, top_i), dim=1)
	# Stop on <END> or on the step cap. The cap is tested after the append and
	# before the increment, so up to 22 tokens are added to the prompt.
	if top_i[0].item() == tokenizer.special_tokens["<END>"] or count > 20:
		break
	count += 1

In [22]:
# Turn the token ids of the first (only) sequence back into text.
tokenizer.decode(torch_tensor[0].tolist())

"<POS> <CON_START> it is not terrible , but it is very good . <START> it is not terrible , but it is n ' t very good . <START> it is not terrible , but it"