# Cleaning and Aggregating Reversal Clues
- We are drawing reversal indicators from [RDeits cryptic crosswords](https://github.com/rdeits/CrypticCrosswords.jl/corpora/indicators)
- We are using data from [Cryptonite](https://github.com/aviaefrat/cryptonite)
- We are also using data from the [Rozner et al. paper](https://github.com/jsrozner/decrypt)

In [40]:
import os
import json
# Directory that outputData joins its "revClues_*" output file names onto.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
cur_dir = "/Users/dom/Desktop/CS224u/newfinal"

In [41]:
def loadEzData(pathToEzData,providedContainer=None):
    """Read a text file and collect its lines with surrounding whitespace removed.

    Args:
        pathToEzData: Path of the text file to read.
        providedContainer: Optional list to append to. When given, it is
            mutated in place and returned; otherwise a new list is created.

    Returns:
        The list holding one stripped string per line of the file, appended
        after whatever the provided container already held.
    """
    container = providedContainer if providedContainer is not None else []
    # The context manager guarantees the handle is closed, even on error.
    with open(pathToEzData, 'r') as curFile:
        for datum in curFile:
            container.append(datum.strip())
    return container

In [42]:
def checkIfReversal2(trimmedEntry):
    """Tell whether the answer, written backwards, appears verbatim in the clue.

    Args:
        trimmedEntry: Mapping with string values under "clue" and "answer".

    Returns:
        True if the reversed answer is a non-empty substring of the clue,
        False otherwise. The comparison is case-sensitive.
    """
    clue = trimmedEntry["clue"]
    revAnswer = trimmedEntry["answer"][::-1]
    # An empty string is a substring of every clue, so an entry without an
    # answer must be rejected explicitly rather than counted as a match.
    return bool(revAnswer) and revAnswer in clue

In [43]:
def outputData(data,outputName):
    """Write each entry of ``data`` as one JSON document per line.

    The target file is ``"revClues_" + outputName`` inside the module-level
    ``cur_dir`` directory; an existing file is overwritten.

    Args:
        data: Iterable of JSON-serialisable entries.
        outputName: Suffix appended to the "revClues_" file name prefix.
    """
    foutName = os.path.join(cur_dir, "revClues_" + outputName)
    # The context manager closes the file even if serialisation fails midway.
    with open(foutName, 'w') as fout:
        for datum in data:
            fout.write(json.dumps(datum))
            # A real newline separator keeps the file readable line by line.
            fout.write('\n')

In [63]:
def processCryptonite(pathToData, outputName, outputting=True):
    """Collect reversal-style entries from a file with one JSON object per line.

    Each non-blank line is parsed as JSON, reduced to its "clue" and "answer"
    fields, and kept when checkIfReversal2 accepts it. The number of kept
    entries is printed.

    Args:
        pathToData: Path to the line-delimited JSON file.
        outputName: Suffix handed to outputData when ``outputting`` is true.
        outputting: If true, write the matches via outputData and return None;
            otherwise return the list of matches.

    Returns:
        None when ``outputting`` is true, else the list of matching
        {"clue", "answer"} dicts in file order.
    """
    data = []
    with open(pathToData) as f:
        # Stream the file instead of loading every line into memory first.
        for line in f:
            # Blank lines (e.g. a trailing newline) are not JSON; skip them.
            if not line.strip():
                continue
            curline = json.loads(line)
            trimmedEntry = {
                "clue":curline["clue"],
                "answer":curline["answer"],
            }
            if checkIfReversal2(trimmedEntry):
                data.append(trimmedEntry)
    print("Processed: ", len(data), " data points")

    if outputting:
        outputData(data, outputName)
        return None
    return data

def processRozner(pathToData, outputName, outputting=True):
    """Collect reversal-style entries from a JSON file keyed by "clue"/"soln".

    The first line of the file is parsed as JSON and its type is printed.
    When it is a list, each element is reduced to {"clue", "answer"} (the
    answer comes from the "soln" key) and kept when checkIfReversal2 accepts
    it. Any other top-level type yields no entries and is reported. The
    number of kept entries is printed.

    Args:
        pathToData: Path to the JSON file.
        outputName: Suffix handed to outputData when ``outputting`` is true.
        outputting: If true, write the matches via outputData and return None;
            otherwise return the list of matches.

    Returns:
        None when ``outputting`` is true, else the list of matching entries.
    """
    data = []
    with open(pathToData) as f:
        # Only the first line is parsed, so the document must sit on one line.
        allData = json.loads(f.readline())
    print(type(allData))
    if isinstance(allData, list):
        for curDatum in allData:
            trimmedEntry = {
                "clue":curDatum["clue"],
                "answer":curDatum["soln"],
            }
            if checkIfReversal2(trimmedEntry):
                data.append(trimmedEntry)
    else:
        # Report the unexpected shape instead of raising: callers in this
        # notebook pass dict-shaped files and rely on an empty result.
        print("Skipped ", pathToData, ": expected a JSON list, got ", type(allData).__name__)
    print("Processed: ", len(data), " data points")

    if outputting:
        outputData(data, outputName)
        return None
    return data

def processRozner2(pathToData, outputName, outputting=True):
    """Collect reversal-style entries from a JSON file keyed by "input"/"target".

    The first line of the file is parsed as JSON and its type is printed.
    When it is a list, each element is reduced to {"clue", "answer"} (taken
    from the "input" and "target" keys) and kept when checkIfReversal2
    accepts it. Any other top-level type yields no entries and is reported.
    The number of kept entries is printed.

    Args:
        pathToData: Path to the JSON file.
        outputName: Suffix handed to outputData when ``outputting`` is true.
        outputting: If true, write the matches via outputData and return None;
            otherwise return the list of matches.

    Returns:
        None when ``outputting`` is true, else the list of matching entries.
    """
    data = []
    with open(pathToData) as f:
        # Only the first line is parsed, so the document must sit on one line.
        allData = json.loads(f.readline())
    print(type(allData))
    if isinstance(allData, list):
        for curDatum in allData:
            trimmedEntry = {
                "clue":curDatum["input"],
                "answer":curDatum["target"],
            }
            if checkIfReversal2(trimmedEntry):
                data.append(trimmedEntry)
    else:
        # Report the unexpected shape instead of raising: callers in this
        # notebook pass dict-shaped files and rely on an empty result.
        print("Skipped ", pathToData, ": expected a JSON list, got ", type(allData).__name__)
    print("Processed: ", len(data), " data points")

    if outputting:
        outputData(data, outputName)
        return None
    return data

def processRozner3(pathToData, outputName, outputting=True):
    """Collect reversal-style entries from a JSON file keyed by "input"/"target".

    The first line of the file is parsed as JSON. When it is a list, each
    element is reduced to {"clue", "answer"} (taken from the "input" and
    "target" keys) and kept when checkIfReversal2 accepts it. Any other
    top-level type yields no entries and is reported. The number of kept
    entries is printed.

    Args:
        pathToData: Path to the JSON file.
        outputName: Suffix handed to outputData when ``outputting`` is true.
        outputting: If true, write the matches via outputData and return None;
            otherwise return the list of matches.

    Returns:
        None when ``outputting`` is true, else the list of matching entries.
    """
    data = []
    with open(pathToData) as f:
        # Only the first line is parsed, so the document must sit on one line.
        allData = json.loads(f.readline())
    if isinstance(allData, list):
        for curDatum in allData:
            trimmedEntry = {
                "clue":curDatum["input"],
                "answer":curDatum["target"],
            }
            if checkIfReversal2(trimmedEntry):
                data.append(trimmedEntry)
    else:
        # Report the unexpected shape instead of raising: callers in this
        # notebook pass dict-shaped files and rely on an empty result.
        print("Skipped ", pathToData, ": expected a JSON list, got ", type(allData).__name__)
    print("Processed: ", len(data), " data points")

    if outputting:
        outputData(data, outputName)
        return None
    return data

In [68]:
# Trial run on naive_random.json with outputting=False, so matches are returned
# rather than written. The recorded output shows the file parses to a dict,
# which is why no data points are collected.
curData = processRozner2("/Users/dom/Desktop/CS224u/cs224u_crossword/decrypt-main/data/naive_random.json","",outputting=False)

<class 'dict'>
Processed:  0  data points


In [69]:
# Input locations. NOTE(review): all paths are absolute and machine-specific.
# pathToIndcs is not referenced by any other cell visible in this notebook.
pathToIndcs = "/Users/dom/Desktop/CS224u/newfinal/indicators/Reversal"

# Cryptonite official split: one JSON object per line (.jsonl).
# NOTE(review): "cryptonie" and "cryptonitet" are misspelled, but other cells
# refer to these exact names, so they are left unchanged here.
cryptonie_test_path = "/Users/dom/Desktop/CS224u/newfinal/cryptonite-official-split/cryptonite-test.jsonl"
cryptonitet_train_path = "/Users/dom/Desktop/CS224u/newfinal/cryptonite-official-split/cryptonite-train.jsonl"
cryptonite_validation_path = "/Users/dom/Desktop/CS224u/newfinal/cryptonite-official-split/cryptonite-val.jsonl"
# Files from the decrypt-main checkout (.json).
rozner_acw_data_train_path="/Users/dom/Desktop/CS224u/cs224u_crossword/decrypt-main/data/clue_json/curricular/ACW_data/train.json"
rozner_wordinitdisjoint_train_path = "/Users/dom/Desktop/CS224u/cs224u_crossword/decrypt-main/data/clue_json/guardian/word_init_disjoint/train.json"
rozner_guardian_path = "/Users/dom/Desktop/CS224u/cs224u_crossword/decrypt-main/data/guardian_2020_10_08.json"
rozner_naiverand_path = "/Users/dom/Desktop/CS224u/cs224u_crossword/decrypt-main/data/naive_random.json"

In [71]:
# Pool the reversal candidates from every corpus into one list, in source order.
aggData = []

# Cryptonite test and validation splits (line-delimited JSON).
for curCryp in (cryptonie_test_path, cryptonite_validation_path):
    curData = processCryptonite(curCryp, "", outputting=False)
    aggData.extend(curData)

# Guardian dump, whose entries use the "clue"/"soln" keys.
for curRoz in (rozner_guardian_path,):
    curData = processRozner(curRoz, "", outputting=False)
    aggData.extend(curData)

# Remaining Rozner files, whose entries use the "input"/"target" keys.
for curRoz2 in (rozner_acw_data_train_path, rozner_wordinitdisjoint_train_path, rozner_naiverand_path):
    curData = processRozner3(curRoz2,"",outputting=False)
    aggData.extend(curData)

Processed:  29  data points
Processed:  23  data points
<class 'list'>
Processed:  159  data points
Processed:  1298  data points
Processed:  81  data points
Processed:  0  data points


In [72]:
# outputData prepends "revClues_" to the given name, so this call writes
# "revClues_revClues_aggregated_2" inside cur_dir.
# NOTE(review): the doubled prefix looks unintended; confirm the desired file name.
outputData(aggData, "revClues_aggregated_2")

# Aggregate all of the data into one megaset

In [None]:
# Previously written per-corpus output files to merge.
# NOTE(review): absolute, machine-specific paths.
subsets = [
    "/Users/dom/Desktop/CS224u/newfinal/revClues_rozner_wordinitdisjoint_train",
    "/Users/dom/Desktop/CS224u/newfinal/revClues_roznerACW_train",
    "/Users/dom/Desktop/CS224u/newfinal/revClues_val_noIndcs",
    "/Users/dom/Desktop/CS224u/newfinal/revClues_rozner_guardian",
]

# loadEzData appends to the list it is given, so aggregatedData grows in place
# and the return value can be ignored. Entries are the stripped text lines of
# each file; they are not parsed as JSON here.
aggregatedData = []
for curFile in subsets:
    loadEzData(curFile, providedContainer=aggregatedData)
print(len(aggregatedData))


1402


In [82]:
# Parse the whole aggregated file as a single JSON document and peek at the
# clue of its first entry.
with open("/Users/dom/Desktop/CS224u/cs224u_crossword/revClues_aggregated_3",'r') as oaky:
    data = json.loads(oaky.read())
    print(data[0]["clue"])

only a short reflection from jeremiah (4)


In [85]:
def evalDataset(filepath):
    """Load a JSON file of clue entries and split it into two parallel lists.

    Args:
        filepath: Path to a JSON file whose top-level value iterates over
            entries that each have 'clue' and 'answer' keys.

    Returns:
        A tuple ``(input_sequences, output_sequences)``: the clues and the
        answers, both in file order.
    """
    with open(filepath, 'r') as source:
        records = json.load(source)

    clues = [record['clue'] for record in records]
    solutions = [record['answer'] for record in records]
    return clues, solutions

In [86]:
# Split the aggregated file into parallel lists of clues (inps) and answers (outps).
inps, outps = evalDataset("/Users/dom/Desktop/CS224u/cs224u_crossword/revClues_aggregated_3")