# Paraphrase Generation

In [1]:
import numpy as np
import nltk
import pandas as pd
import json
import tqdm
from simple_colors import *
from nltk.corpus import wordnet as wn

import stanza
# Fetch the English stanza resources, then build the shared pipeline object
# that the parsing cells below call as `stanza_pipe(...)`.
stanza.download('en')
stanza_pipe = stanza.Pipeline(
    lang='en',
    processors='tokenize,lemma,pos,depparse,constituency',
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-02-17 14:33:19 INFO: Downloading default packages for language: en (English)...
2022-02-17 14:33:20 INFO: File exists: /Users/nehasrikanth/stanza_resources/en/default.zip.
2022-02-17 14:33:23 INFO: Finished downloading models and saved to /Users/nehasrikanth/stanza_resources.
2022-02-17 14:33:23 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| lemma        | combined |
| depparse     | combined |
| constituency | wsj      |

2022-02-17 14:33:23 INFO: Use device: cpu
2022-02-17 14:33:23 INFO: Loading: tokenize
2022-02-17 14:33:23 INFO: Loading: pos
2022-02-17 14:33:23 INFO: Loading: lemma
2022-02-17 14:33:23 INFO: Loading: depparse
2022-02-17 14:33:24 INFO: Loading: constituency
2022-02-17 14:33:24 INFO: Done loading processors!


### HellaSWAG

In [2]:
# Load the HellaSWAG validation split and narrow it down in one chain:
#   1. keep rows whose `source_id` contains 'activitynet';
#   2. keep rows whose `ctx_a` splits into exactly two pieces on '.'
#      (e.g. a single sentence terminated by a period).
hellaswag_val = (
    pd.read_json('../raw_data/hellaswag/data/hellaswag_val.jsonl', lines=True)
    .loc[lambda frame: frame['source_id'].str.contains('activitynet')]
    .loc[lambda frame: frame.ctx_a.str.split('.').map(len) == 2]
)

In [None]:
# Work on an explicit copy of the first 15 rows: assigning a new column to a
# plain slice of `hellaswag_val` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether the parent frame is modified.
x = hellaswag_val[:15].copy()

# For each row, pick the entry of `endings` selected by that row's `label`.
x['gold_ending'] = x.apply(lambda r: r.endings[r.label], axis=1)

# Write header-less, index-less CSVs: (ctx_a, gold_ending) pairs for the sample,
# and the lower-cased ctx_a of the first 15 rows.
x[['ctx_a', 'gold_ending']].to_csv('hellaswag_val.sample.csv', header=False, index=False)
hellaswag_val.ctx_a[:15].map(str.lower).to_csv('hellaswag_val.ctx_a.csv', header=False, index=False)

### Rule-Based Paraphrase Generation

In [None]:
# Download the 'omw-1.4' NLTK resource.
# NOTE(review): presumably required by the WordNet (`wn`) lookups below — confirm.
nltk.download('omw-1.4')

In [20]:
# Display the filtered validation frame (the notebook shows a truncated view).
hellaswag_val

Unnamed: 0,ind,activity_label,ctx_a,ctx_b,ctx,split,split_type,label,endings,source_id
0,24,Roof shingle removal,A man is sitting on a roof.,he,A man is sitting on a roof. he,val,indomain,3,"[is using wrap to wrap a pair of skis., is rip...",activitynet~v_-JhWjGDPHMY
2,106,Canoeing,Two women in a child are shown in a canoe whil...,the child and a different man,Two women in a child are shown in a canoe whil...,val,indomain,2,[are then shown paddling down a river in a boa...,activitynet~v_-xQvJmC2jhk
3,114,High jump,A boy is running down a track.,the boy,A boy is running down a track. the boy,val,zeroshot,2,"[runs into a car., gets in a mat., lifts his b...",activitynet~v_-zHX3Gdx6I4
7,170,Sumo,A cartoon animation video is shown with people...,two men,A cartoon animation video is shown with people...,val,indomain,0,[fight robots of evil and ends with a to be co...,activitynet~v_0WVkoTBmhA0
8,180,Sharpening knives,A man is holding a pocket knife while sitting ...,then he,A man is holding a pocket knife while sitting ...,val,zeroshot,1,"[opens a can of oil put oil on the knife, and ...",activitynet~v_0bosp4-pyTM
...,...,...,...,...,...,...,...,...,...,...
3229,50023,High jump,A teen skip on a court and then jumps high ove...,then,A teen skip on a court and then jumps high ove...,val,zeroshot,1,"[, a teen skip carrying a pole to get into a j...",activitynet~v_vth3IYGHu5k
3232,50045,Canoeing,Names are shown on a list and leads into a per...,more people,Names are shown on a list and leads into a per...,val,indomain,3,[are seen riding down a river on tubes while o...,activitynet~v_wd7W8NTi_58
3234,50070,Tai chi,We see two pieces of ancient asian art.,we,We see two pieces of ancient asian art. we,val,indomain,3,"[see people playing a game outdoors., see the ...",activitynet~v_x18x9BKMAlk
3237,50079,Ice fishing,Two men are sitting on chairs in front of snow...,there,Two men are sitting on chairs in front of snow...,val,zeroshot,0,"[is a hole in the ice in front of them., are s...",activitynet~v_x768VAsOQSw


In [None]:
# Collect the lemma names of every WordNet synset of "dance" (duplicates kept
# in the list), then print the distinct names.
synonyms = [
    lemma.name()
    for synset in wn.synsets("dance")
    for lemma in synset.lemmas()
]

print(set(synonyms))

In [None]:
# Run the stanza pipeline on a toy sentence; the next two cells inspect `x`.
x = stanza_pipe('pat loves chris.')

In [None]:
# Text of the first word of the first sentence in the parsed document.
x.sentences[0].words[0].text

In [None]:
# Space-join the text of every word in the first parsed sentence.
' '.join(word.text for word in x.sentences[0].words)

In [None]:
def get_constituency_tree(sentence):
    """Parse `sentence` with `stanza_pipe` and return the constituency tree
    of the first sentence in the resulting document.

    Only the first sentence is used; any further sentences are ignored.
    """
    parsed_doc = stanza_pipe(sentence)
    first_sentence = parsed_doc.sentences[0]
    return first_sentence.constituency

In [None]:
# Constituency parse of a short ungrammatical phrase, shown as the cell output.
get_constituency_tree('they where go')

In [None]:
def get_nltk_tree_from_stanza(parse_tree):
    """Recursively convert a parse-tree node into an `nltk.tree.Tree`.

    `parse_tree` must expose `label`, `children` and `is_leaf()`.
    A leaf is returned as its bare `label`; any other node becomes a
    `Tree` labelled with `label` whose children are the converted child nodes.
    """
    if parse_tree.is_leaf():
        return parse_tree.label

    converted_children = [
        get_nltk_tree_from_stanza(child) for child in parse_tree.children
    ]
    return nltk.tree.Tree(parse_tree.label, converted_children)

# Parse a demo sentence; `tree` is reused by a later cell that tokenizes its string form.
tree = get_constituency_tree('Pat showed a nice demo. ')
# Convert the parse into an nltk Tree.
nltk_tree = get_nltk_tree_from_stanza(tree)

# Last expression: display the converted tree.
nltk_tree

In [None]:
# Parse-and-convert in one step for a sentence containing a possessive ("Pat's").
get_nltk_tree_from_stanza(get_constituency_tree("Pat's demo was nice."))

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the string form of `tree` with NLTK and re-join the tokens with single spaces.
' '.join(word_tokenize(str(tree)))

In [None]:
# Scratch string literal: a bracketed tree skeleton containing labels only (no words).
'( ROOT ( S ( NP ) ( VP ( VP ( PP ( NP ) ) ) ) ) )'

In [None]:
def bfs(graph, s):
    """Breadth-first traversal of `graph` starting from vertex `s`.

    The original cell was the body of a method pasted at top level: it read
    `self.graph` and `s` without either being defined, so it could not run.
    It is wrapped here as a standalone function with the graph and the source
    vertex passed in explicitly.

    Args:
        graph: mapping from a vertex to an iterable of its adjacent vertices.
            A vertex that is not a key is treated as having no neighbours.
        s: source vertex (any hashable value).

    Returns:
        List of vertices in the order they were dequeued. Each vertex is also
        printed, separated by spaces, as in the original snippet.
    """
    from collections import deque

    # Mark the source node as visited and enqueue it. A set replaces the
    # original index-based list so vertices need not be small integers.
    visited = {s}
    queue = deque([s])
    order = []

    while queue:
        # Dequeue a vertex and print it; popleft() is O(1), unlike list.pop(0).
        vertex = queue.popleft()
        print(vertex, end=" ")
        order.append(vertex)

        # Enqueue every adjacent vertex that has not been visited yet,
        # marking it visited at enqueue time so it is never queued twice.
        for neighbour in graph.get(vertex, ()):
            if neighbour not in visited:
                visited.add(neighbour)
                queue.append(neighbour)

    return order

In [None]:
from nltk.corpus import framenet as fn

In [None]:
# Look up a FrameNet frame by numeric id (202) and keep it in `f`.
f = fn.frame(202)

In [None]:
# First WordNet synset returned for 'roof'.
s = wn.synsets('roof')[0]

In [None]:
# Look up a FrameNet frame by name.
# NOTE(review): FrameNet frame names appear to be capitalized (e.g. 'Arrest');
# confirm that this lowercase lookup resolves on your platform.
fn.frame('arrest')

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Sentence to paraphrase.
input_sentence = "They were there to enjoy us and they were there to pray for us."

# `cache_dir` is the `from_pretrained` keyword that selects the download
# location; the previous `cache=` is not a recognised option.
model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase', cache_dir='hf_cache')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')

# The encoded inputs must live on the same device as the model, otherwise
# `generate` fails whenever CUDA is available.
batch = tokenizer(input_sentence, return_tensors='pt').to(device)
generated_ids = model.generate(batch['input_ids'])

# Decode the generated ids back to text, dropping special tokens.
generated_sentence = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_sentence)

In [None]:
import vaex


In [None]:
# Shell out to list `../raw_data/ppdb-2.0-tldr` (the path later passed as
# `model_path` to the PPDB synonym augmenter).
!ls ../raw_data/ppdb-2.0-tldr

In [None]:
# Load the PPDB file into vaex. The intended field delimiter is the literal
# string " ||| ". Separators longer than one character are interpreted by
# pandas' python engine as regular expressions, where an unescaped `|` means
# alternation, so the pipes must be escaped to match them literally.
# NOTE(review): no `header=` is passed, so the first row is used as column
# names — confirm the file actually has a header row.
ppdb = vaex.from_csv(
    '../raw_data/ppdb-small',
    engine='python',
    sep=r' \|\|\| ',
    convert=True,
    progress=True
)

In [None]:
# Install numpy, requests and nlpaug through the shell's `pip`.
# NOTE(review): versions are unpinned, and `!pip` may target a different
# environment than the running kernel — consider `%pip install` with pinned
# versions, or move this to the environment setup.
!pip install numpy requests nlpaug

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import os

from nlpaug.util import Action


# Example input for the augmenter. `text` was read below without being
# defined anywhere in the notebook, which raises NameError on a fresh kernel.
text = 'A man is sitting on a roof.'

# Synonym augmenter backed by PPDB, loaded from `model_path`.
aug = naw.SynonymAug(aug_src='ppdb', model_path='../raw_data/ppdb-2.0-tldr')
augmented_text = aug.augment(text)

# Show the input and its augmented version one after the other.
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

In [21]:
# NOTE(review): unfinished expression — pandas has no attribute `read_`, so this
# cell raises AttributeError when run. Complete the call (e.g. a `pd.read_*`
# loader) or delete the cell.
pd.read_

NameError: name 'ppdb' is not defined