Creates a PPDB dictionary of the following format:

```
{
    "phrase1": {
        "paraphrase11": ("ppdb_score11", "entailment11"),
        "paraphrase12": ("ppdb_score12", "entailment12"),
        ...
    },
    "phrase2": {
        "paraphrase21": ("ppdb_score21", "entailment21"),
        "paraphrase22": ("ppdb_score22", "entailment22"),
        ...
    },
    ...
}
```

and stores it in `data/processed/ppdb/ppdb-2.0-xl-lexical.pkl`.

In [1]:
import os
import pickle

import pandas as pd

In [2]:
# Raw PPDB input file; the path is relative to the notebook's working directory.
ppdb_data_path: str = "../data/external/ppdb/ppdb-2.0-xl-lexical"

### PPDB Data Format

In [3]:
# Peek at the first record to see the raw " ||| "-separated layout.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default; UTF-8 is also what pandas.read_csv assumes by
# default when the same file is loaded further down.
with open(ppdb_data_path, "r", encoding="utf-8") as f:
    print(f.readline())

[NN] ||| transplant ||| transplantation ||| PPDB2.0Score=5.24981 PPDB1.0Score=3.295900 -logp(LHS|e1)=0.18597 -logp(LHS|e2)=0.14031 -logp(e1|LHS)=11.83583 -logp(e1|e2)=1.80507 -logp(e1|e2,LHS)=1.46728 -logp(e2|LHS)=11.47593 -logp(e2|e1)=1.49083 -logp(e2|e1,LHS)=1.10738 AGigaSim=0.63439 Abstract=0 Adjacent=0 CharCountDiff=5 CharLogCR=0.40547 ContainsX=0 Equivalence=0.371472 Exclusion=0.000344 GlueRule=0 GoogleNgramSim=0.03067 Identity=0 Independent=0.078161 Lex(e1|e2)=9.64663 Lex(e2|e1)=59.48919 Lexical=1 LogCount=4.67283 MVLSASim=NA Monotonic=1 OtherRelated=0.372735 PhrasePenalty=1 RarityPenalty=0 ForwardEntailment=0.177287 SourceTerminalsButNoTarget=0 SourceWords=1 TargetComplexity=0.98821 TargetFormality=0.98464 TargetTerminalsButNoSource=0 TargetWords=1 UnalignedSource=0 UnalignedTarget=0 WordCountDiff=0 WordLenDiff=5.00000 WordLogCR=0 ||| 0-0 ||| OtherRelated



### Read as CSV and clean up the features column (keep only the PPDB 2.0 score)

In [5]:
# Load the whole file; fields are separated by the literal " ||| ".
# - The separator is a regular expression (hence engine="python"), written as
#   a raw string so that "\|" is not treated as an invalid string escape.
# - keep_default_na=False and dtype=str keep every field as text; otherwise
#   tokens such as "null", "nan" or "n/a" would be parsed as missing values
#   (NaN) instead of being kept as words in the phrase/paraphrase columns.
ppdb_df = pd.read_csv(ppdb_data_path, sep=r" \|\|\| ",
                      engine="python", header=None,
                      names=["lhs", "phrase", "paraphrase", "features", "alignment", "entailment"],
                      keep_default_na=False, dtype=str)

In [6]:
# Pull the PPDB 2.0 score out of the feature string by name, instead of relying
# on it being the first token with a 13-character "PPDB2.0Score=" prefix.
ppdb_df["ppdb2_score"] = (
    ppdb_df["features"]
    .str.extract(r"(?:^|\s)PPDB2\.0Score=(\S+)", expand=False)
    .astype("float")
)
# Fail loudly if any row lacks a score rather than carrying NaN forward.
assert ppdb_df["ppdb2_score"].notna().all(), "rows without a PPDB2.0Score feature"

In [7]:
# Tally how many paraphrase pairs carry each entailment label.
ppdb_df.loc[:, "entailment"].value_counts()

Independent          1583737
OtherRelated          161676
Equivalence            60862
ForwardEntailment      24554
ReverseEntailment      22777
Exclusion                129
Name: entailment, dtype: int64

### Group all paraphrases belonging to one phrase into a single row, each with its PPDB score and entailment type

| | |
|---|---|
|phrase1 | {paraphrase11: (ppdb_score11, entailment11), paraphrase12: (ppdb_score12, entailment12), ...}|
|phrase2 | {paraphrase21: (ppdb_score21, entailment21), paraphrase22: (ppdb_score22, entailment22), ...}|
.
.
.

In [8]:
# One row per phrase; the value column holds a dict that maps each paraphrase
# to its (ppdb2_score, entailment) tuple. If a paraphrase occurs more than once
# for a phrase, the last occurrence wins.
ppdb_grouped_df = (
    ppdb_df
    .groupby("phrase")
    .apply(
        lambda group: {
            paraphrase: (score, relation)
            for paraphrase, score, relation in zip(
                group["paraphrase"], group["ppdb2_score"], group["entailment"]
            )
        }
    )
    .reset_index()
)

In [14]:
# Eyeball a random subset of the grouped rows; a fixed seed keeps the displayed
# sample the same across "Restart & Run All".
ppdb_grouped_df.sample(50, random_state=0)

Unnamed: 0,phrase,0
319,accessing,"{'access': (4.0842, 'Equivalence')}"
11125,precisely,"{'exactly': (3.73881, 'Equivalence'), 'accurat..."
10552,patentable,"{'patented': (4.06043, 'ForwardEntailment')}"
13650,showed,"{'showcased': (3.85214, 'Independent'), 'demon..."
6653,guatemalan,"{'guatemala': (3.73154, 'ForwardEntailment')}"
9043,messaging,"{'communications': (3.7557, 'Independent')}"
9421,multi-faceted,"{'multidimensional': (3.77672, 'Equivalence')}"
13683,simpler,"{'straightforward': (3.77129, 'OtherRelated')}"
5100,encircled,"{'circled': (3.89045, 'ForwardEntailment')}"
3264,contravenes,"{'violates': (3.8484, 'Equivalence'), 'contrad..."


In [9]:
# Map each phrase to its paraphrase dictionary. Going through the single value
# column as a Series avoids transposing the frame into one row with one column
# per phrase, which is slow and memory-hungry for a large vocabulary.
ppdb_dict = ppdb_grouped_df.set_index("phrase").iloc[:, 0].to_dict()

In [10]:
import pickle

In [18]:
# Spot-check one entry: the outer key is the phrase, the inner key is the
# paraphrase, and the value is the (ppdb2_score, entailment) tuple.
ppdb_dict["positioned"]["posted"]

(3.96436, 'ReverseEntailment')

In [11]:
# Persist the nested dictionary. The target directory is created first so the
# write does not fail on a fresh checkout where it does not exist yet.
ppdb_dict_path = "../data/processed/ppdb/ppdb-2.0-xl-lexical.pkl"
os.makedirs(os.path.dirname(ppdb_dict_path), exist_ok=True)
with open(ppdb_dict_path, "wb") as f:
    pickle.dump(ppdb_dict, f)