# Transformer for DeReKo cooccurrence profiles

In [13]:
import json

class coocETL():
    """Extract/transform/load pipeline for semicolon-separated cooccurrence profiles.

    ``extract`` reads the profile and builds a node list plus a link list,
    ``transform`` turns the node list into node records and adds graph
    metadata, and ``load`` writes the result either as one JSON file or as
    two TSV files (links and nodes).
    """

    def __init__(self, datapath, outpath, root=None, minllr=1000, intype="index", outformat="json", indexllr=2, indexwords=7):
        """Store the configuration; no I/O happens here.

        Args:
            datapath: Path of the input file (read as UTF-8).
            outpath: Output path prefix; ``load`` appends ``.json`` or
                ``.links.tsv`` / ``.nodes.tsv``.
            root: Optional node that is linked to the first word of every
                row that carries a value in the LLR column.
            minllr: Reading stops after the first row whose current LLR
                value is below this threshold.
            intype: ``"index"`` makes links refer to nodes by their position
                in the node list; any other value makes them use the words.
            outformat: ``"json"`` for a single JSON file, anything else for
                the two TSV files.
            indexllr: Column index of the LLR value.
            indexwords: Column index of the space-separated word sequence.
        """
        self.datapath = datapath
        self.outpath = outpath
        self.root = root
        self.minllr = minllr
        self.intype = intype
        self.outformat = outformat
        self.indexllr = indexllr
        self.indexwords = indexwords

    def extract(self):
        """Read the input file and fill ``nodes``, ``nodesValues`` and the links.

        Rows without a value in the LLR column reuse the most recent LLR
        value seen. Consecutive words of a row are chained by links.
        """
        self.parseddata = {"nodes": [], "links": []}
        self.nodes = []
        self.nodesValues = {}
        actualllr = ""

        index_words = self.indexwords
        index_llr = self.indexllr
        use_index = self.intype == "index"

        # name -> position in self.nodes; replaces the repeated linear
        # ``in`` / ``list.index`` scans over the node list.
        position = {}

        def register(name, value):
            # Only the first sighting of a node records its value.
            if name not in position:
                position[name] = len(self.nodes)
                self.nodes.append(name)
                self.nodesValues[name] = value

        def endpoint(name):
            return position[name] if use_index else name

        # ``with`` guarantees the file is closed, also on early ``break``.
        with open(self.datapath, encoding="utf-8", mode="r") as datafile:
            for line in datafile:
                if line == "\n":
                    continue

                # Non-breaking spaces would otherwise glue words together.
                line = line.replace(u"\xa0", u" ")
                fields = line.split(";")
                if "Kookkurrenzen" in fields:
                    print("Header gefunden: ", fields)
                    continue

                words = fields[index_words].split(" ")
                register(words[0], fields[index_llr])
                if self.root:
                    register(self.root, fields[index_llr])

                if fields[index_llr] != "":
                    actualllr = fields[index_llr]

                    if self.root:
                        self.parseddata["links"].append({
                            "source": endpoint(self.root),
                            "target": endpoint(words[0]),
                            "value": fields[index_llr],
                        })

                lastword = ""

                for word in words:
                    register(word, actualllr)
                    if lastword != "":
                        self.parseddata["links"].append({
                            "source": endpoint(lastword),
                            "target": endpoint(word),
                            "value": actualllr,
                        })
                    lastword = word

                # The row that falls below the threshold is still included.
                # While no LLR value has been seen yet there is nothing to
                # compare, so the check is skipped instead of failing on
                # int("").
                if actualllr != "" and int(actualllr) < int(self.minllr):
                    break

    def transform(self):
        """Append one record per node to ``parseddata`` and add graph metadata."""
        for position, node in enumerate(self.nodes):
            if self.intype == "index":
                self.parseddata["nodes"].append({"id": position, "name": node, "group": 0, "value": self.nodesValues[node]})
            else:
                self.parseddata["nodes"].append({"id": node, "group": 0, "value": self.nodesValues[node]})

        self.parseddata["directed"] = "false"
        self.parseddata["multigraph"] = "false"
        self.parseddata["graph"] = {}

    def load(self):
        """Write ``parseddata`` to disk in the configured output format.

        Files are written as UTF-8 (matching the input) and are closed
        before the method returns.
        """
        if self.outformat == "json":
            with open(self.outpath + ".json", "w", encoding="utf-8") as outfile:
                outfile.write(json.dumps(self.parseddata))
            return

        with open(self.outpath + ".links.tsv", "w", encoding="utf-8") as outfilelinks:
            outfilelinks.write("source\ttarget\tweight\n")
            for link in self.parseddata["links"]:
                outfilelinks.write(str(link["source"]) + "\t" + str(link["target"]) + "\t" + str(link["value"]) + "\n")

        with open(self.outpath + ".nodes.tsv", "w", encoding="utf-8") as outfilenodes:
            if self.intype == "index":
                outfilenodes.write("id\tname\tgroup\tvalue\n")
                for node in self.parseddata["nodes"]:
                    # In index mode "id" is the node's position in self.nodes.
                    outfilenodes.write(str(node["id"]) + "\t" + str(node["name"]) + "\t" + str(node["group"]) + "\t" + str(node["value"]) + "\n")
            else:
                outfilenodes.write("id\tgroup\tvalue\n")
                for node in self.parseddata["nodes"]:
                    outfilenodes.write(str(node["id"]) + "\t" + str(node["group"]) + "\t" + str(node["value"]) + "\n")

# Run configuration for this notebook cell.
datapath = "Kookkurrenzprofil2.csv"
outpath = "out"
root = None
minllr = 1000
intype = "index"
outformat = "json"
indexllr = 2
indexwords = 7

# Build the pipeline with explicit keyword arguments, then run its three
# stages in order: read the profile, build node records, write the output.
etl = coocETL(
    datapath=datapath,
    outpath=outpath,
    root=root,
    minllr=minllr,
    intype=intype,
    outformat=outformat,
    indexllr=indexllr,
    indexwords=indexwords,
)
etl.extract()
etl.transform()
etl.load()

', 'Der', 'Rita', 'Hauptamtsleiterin', 'Bürgermeisterin', 'stellvertretende', 'Amtsleiterin', 'Wahlleiterin', 'Souffleuse', 'Ohne', 'abnehmen', 'weiße', 'Biedermanns', 'Eine', 'gerissen', 'Mask', 'The', 'of', 'aufsetzt', 'Kopf', 'versteckt', 'Sehschlitzen', 'Box-Weltmeister', 'Schulz', 'Annas', 'Oper', 'Heftis', 'Philip', 'Novelle', 'Sulzer', 'Hefti', 'Auftragsoper', 'Lohengrin', 'verbarg', 'Schutzanzug', 'ihre', 'Goldene', 'Theaterpreis', 'Requisiten', 'Völkerball-Musiker', 'nachgebildeten', 'Outfits', 'mühevoll', 'Vorsprechen']
['Henry', 'eine', 'hinter', 'Gesicht', 'eisernen', 'einer', 'Blau', 'Operette', 'Raymonds', 'Fred', 'Potpourri', '', 'Melodien', 'Raymond', 'Tragen', 'Zorro', 'Banderas', 'Antonio', 'sw', 'Die', 'Mann', 'getragen', 'Hinter', 'Kostüme', 'schwarze', 'unter', 'verbirgt', 'Frosch', 'Requisite', 'Boxer', 'aufzusetzen', 'Bühnenbild', 'Rocchigiani', 'Graciano', 'Nase', 'Mund', 'Todes', 'roten', 'Poes', 'Allan', 'Edgar', 'Roten', 'Poe', 'trage', 'Badener', 'Tournée', 