In [9]:
from googletrans import Translator


"HI! I'm Pablo and my boss won't pay me\n\nThis is my story:"

In [1]:
import spacy
from spacy.language import Language
from spacy.tokens import Token, Doc
import pandas as pd
import wordfreq
import numpy as np
import simplemma
from googletrans import Translator

# Shared googletrans client used by the "translate" pipeline component.
translator = Translator()

# Per-token grammar annotations filled in by process_verb / process_nonverb.
# NOTE(review): the default is a mutable dict containing a list; both
# processing functions reassign "aux_labels" before appending to it, but
# confirm how spaCy copies mutable defaults before relying on that elsewhere.
Token.set_extension(
    "gram_features_",
    default={"family": "", "subtype": "", "aux_labels": [], "aux_string": ""},
    force=True,
)
# Labels consumed by the exercise generator (e.g. "SVP", "OTHERS", "NEW").
Token.set_extension("question_labels_", default=[], force=True)
# Flag set to True by marknewword. It is only ever used as a truth value, so
# the default is a boolean rather than the (equally falsy) empty list.
Token.set_extension("newword_", default=False, force=True)
# HTML renderings of a single token, written by formatsingletoken.
Token.set_extension("html_formatted_SN", default="", force=True)
Token.set_extension("html_formatted_SV", default="", force=True)
Token.set_extension("html_formatted_new", default="", force=True)
# Document-level results written by the "translate" and "format" components.
Doc.set_extension("translation", default="", force=True)
Doc.set_extension("html_formatted_SN", default="", force=True)
Doc.set_extension("html_formatted_SV", default="", force=True)
Doc.set_extension("html_formatted_new", default="", force=True)


##################################################
### Grammar
##################################################
def process_verb(token):
    """Annotate a verb token and its direct children with grammar features.

    The token's ``gram_features_`` get a fresh ``aux_labels`` list holding the
    values of its ``Mood``, ``Tense`` and ``VerbForm`` morphology, and the
    token becomes family ``VERB`` / subtype ``MAIN`` unless an earlier call
    already assigned it a family.  Each child is then inspected:

    * dependency ``svp``  -> child marked ``VERB``/``SVP``, verb labelled
      ``SEPARABLE_VERB`` and the particle text stored in ``aux_string``;
    * part of speech ``AUX`` -> child marked ``VERB``/``AUX``;
    * reflexive pronoun -> child marked ``VERB``/``REFLEXPRON`` and the verb
      labelled ``REFLEXIVE_VERB``.

    Args:
        token: a spaCy-like token exposing ``_.gram_features_``,
            ``_.question_labels_``, ``morph.to_dict()``, ``children``,
            ``dep_``, ``pos_`` and ``text``.
    """
    features = token._.gram_features_
    features["aux_labels"] = []

    # Only words without a previous category can become a main verb.
    if features["family"] == "":
        features["family"] = "VERB"
        features["subtype"] = "MAIN"

    morphoverb = token.morph.to_dict()
    features["aux_labels"] += [
        value for key, value in morphoverb.items()
        if key in ("Mood", "Tense", "VerbForm")
    ]

    for child in token.children:
        # Separable verb particle.
        if child.dep_ == "svp":
            child._.gram_features_["family"] = "VERB"
            child._.gram_features_["subtype"] = "SVP"
            child._.question_labels_ = ["SVP"]

            features["aux_labels"] += ["SEPARABLE_VERB"]
            features["aux_string"] = child.text
        # Auxiliary verb.
        elif child.pos_ == "AUX":
            child._.gram_features_["family"] = "VERB"
            child._.gram_features_["subtype"] = "AUX"
        # Possible reflexive pronoun.
        else:
            morphochild = child.morph.to_dict()
            if morphochild.get("Reflex", "No") == "Yes":
                # Reflexive pronoun detected by the pipeline itself.
                is_reflexive = True
            else:
                # Fallback: an accusative/dative personal pronoun that agrees
                # with the verb in number and person.  The original compared
                # "PronType" against the case values, which could never match;
                # the case lives under the "Case" key.  The heuristic is
                # limited to 1st/2nd person, where the reflexive and personal
                # pronoun forms coincide; in the 3rd person an agreeing
                # object pronoun ("er sieht ihn") is not reflexive.
                agrees = all(
                    feature in morphoverb
                    and morphochild.get(feature) == morphoverb[feature]
                    for feature in ("Number", "Person")
                )
                is_reflexive = (
                    agrees
                    and morphoverb.get("Person") in ("1", "2")
                    and child.pos_ == "PRON"
                    and morphochild.get("PronType") == "Prs"
                    and morphochild.get("Case") in ("Acc", "Dat")
                )
            if is_reflexive:
                child._.gram_features_["family"] = "VERB"
                child._.gram_features_["subtype"] = "REFLEXPRON"
                features["aux_labels"] += ["REFLEXIVE_VERB"]


def process_nonverb(token):
    """Annotate a token that is not handled as a verb.

    ``aux_labels`` is always reset.  Nominal parts of speech (NOUN, PRON,
    PROPN, ADJ, DET) collect their Gender/Case/Number values and, when still
    unclassified, become family ``NOUN``.  Every other part of speech except
    X, SPACE and PUNCT becomes family ``OTHERS`` when still unclassified and
    receives the question labels ``["OTHERS", <pos>]``.
    """
    features = token._.gram_features_
    features["aux_labels"] = []
    pos = token.pos_
    still_unclassified = features["family"] == ""

    # Noun-like tokens.
    if pos in ("NOUN", "PRON", "PROPN", "ADJ", "DET"):
        if still_unclassified:
            features["family"] = "NOUN"
            features["subtype"] = pos
        features["aux_labels"].extend(
            value
            for key, value in token.morph.to_dict().items()
            if key in ("Gender", "Case", "Number")
        )
        return

    # Tokens that carry no grammar information at all.
    if pos in ("X", "SPACE", "PUNCT"):
        return

    # Everything else.
    if still_unclassified:
        features["family"] = "OTHERS"
        features["subtype"] = pos
        token._.question_labels_ = ["OTHERS", pos]

@Language.component("grammarfeatures")
def process_all_tokens(doc):
    """Pipeline component: run the grammar annotation over every token.

    Tokens tagged VERB or AUX go through ``process_verb``; all others go
    through ``process_nonverb``.  The same ``doc`` is returned.
    """
    for token in doc:
        annotate = process_verb if token.pos_ in ("VERB", "AUX") else process_nonverb
        annotate(token)
    return doc


##################################################
### VOCABULARY
##################################################



def basewordfreq(token):
    """Return the base-form candidates of a token and their best frequency.

    Returns a 4-tuple ``(token, family, candidates, max_frequency)``.
    Numbers, proper nouns, spaces, punctuation, ``X`` tokens and separable
    verb particles are not counted: for those the last three items are NaN.

    Candidates are the spaCy lemma, the surface text and the simplemma
    lemma; for a verb labelled ``SEPARABLE_VERB`` the stored particle is
    prepended to each of them.
    """
    not_counted = (token, np.nan, np.nan, np.nan)

    if token.pos_ in ("NUM", "X", "PROPN", "SPACE", "PUNCT"):
        return not_counted

    features = token._.gram_features_
    if features["family"] == "VERB" and features["subtype"] == "SVP":
        return not_counted

    candidates = [
        token.lemma_,
        token.text,
        simplemma.lemmatize(token.text, lang='de'),
    ]
    if "SEPARABLE_VERB" in features["aux_labels"]:
        # particle "zu" in the middle? Check grammar
        particle = features["aux_string"]
        candidates = [particle + candidate for candidate in candidates]

    best_frequency = max(
        wordfreq.word_frequency(candidate, "de") for candidate in candidates
    )
    return token, features["family"], list(set(candidates)), best_frequency

def marknewword(token):
    """Flag a token (and its verb satellites) as new vocabulary.

    Sets ``newword_`` to True and adds the ``"NEW"`` question label.  When the
    token is a main verb, its children already classified as reflexive
    pronoun or separable particle are flagged in the same way.

    The original appended ``"NEW"`` to the *parent* token when a child already
    had labels, so such children were never labelled and the parent collected
    duplicates; each target now receives the label itself, at most once.
    """
    targets = [token]
    features = token._.gram_features_
    if features["family"] == "VERB" and features["subtype"] == "MAIN":
        targets += [
            child
            for child in token.children
            if child._.gram_features_["family"] == "VERB"
            and child._.gram_features_["subtype"] in ("REFLEXPRON", "SVP")
        ]

    for target in targets:
        target._.newword_ = True
        # Build a fresh list instead of mutating in place, and never add the
        # label twice (the function can be called repeatedly for one token).
        if "NEW" not in target._.question_labels_:
            target._.question_labels_ = list(target._.question_labels_) + ["NEW"]


@Language.component("vocabulary")
def processvocabulary(doc):
    """Pipeline component: pick the words to present as new vocabulary.

    Reads ``userdata/<user>/history.csv`` and ``profile.csv`` (``user`` is a
    module-level name), counts the base forms found in ``doc``, chooses which
    ones to highlight, calls ``marknewword`` on the matching tokens and
    writes the updated history and profile back.  Returns ``doc``.

    Selection rules (thresholds unchanged from the original):

    * new user: corpus frequency > 0.01, or more than 5 occurrences making up
      more than 1% of the counted words;
    * returning user: not highlighted before and either more than 30
      occurrences overall or the same in-text criterion;
    * if fewer than 10 words qualify, the list is topped up with the
      remaining candidates of highest corpus frequency.

    Fixes over the original:

    * only I/O / parse errors count as "no history yet" instead of a bare
      ``except`` that also hid unrelated failures;
    * words missing from the history have ``highlighted`` = NaN after the
      outer merge; they are now treated as "not highlighted yet" instead of
      being excluded from selection;
    * only words present in this text can be selected, so a word is never
      recorded as highlighted without having been shown;
    * the stored corpus frequency uses a NaN-skipping maximum (the builtin
      ``max([nan, x])`` returns NaN);
    * the user directory is created before writing, and an empty ``doc`` is
      returned untouched instead of failing.
    """
    import os  # local import keeps this block self-contained

    if len(doc) == 0:
        return doc

    history_path = "userdata/" + user + "/history.csv"
    profile_path = "userdata/" + user + "/profile.csv"
    try:
        history = pd.read_csv(history_path, sep="|", index_col=0)
        profile = pd.read_csv(profile_path, sep="|", index_col=0)
        new_user = False
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Missing or unreadable files: start a fresh history, as before.
        new_user = True

    # One row per token, then one row per (token, base-form candidate).
    dfvocab = pd.Series(doc).apply(basewordfreq).apply(pd.Series)
    nr_words = len(dfvocab.dropna())
    dfvocab = dfvocab.explode(2)
    dfvocab.columns = ["token", "family", "base", "freq"]

    # Per base form: occurrences in this text and best corpus frequency.
    dfaux = dfvocab.groupby(["family", "base"]).agg({"freq": ["count", "max"]})
    dfaux.columns = ["nr_appearances", "freq_corpus"]
    dfaux = dfaux.reset_index()
    dfaux["freq_text"] = dfaux["nr_appearances"] / nr_words

    if not new_user:
        history.columns = ["family", "base", "nr_appearances_old", "freq_corpus_old", "highlighted"]
        dfaux = pd.merge(dfaux, history, on=["family", "base"], how="outer")
        dfaux["nr_appearances"] = dfaux["nr_appearances"].fillna(0)
        dfaux["nr_appearances_old"] = dfaux["nr_appearances_old"].fillna(0)
        dfaux["nr_app_tot"] = dfaux["nr_appearances"] + dfaux["nr_appearances_old"]
        # NaN (word not in history) becomes False, i.e. "never highlighted".
        dfaux["highlighted"] = dfaux["highlighted"].eq(True)
        candidate = ~dfaux["highlighted"] & (dfaux["nr_appearances"] > 0)
        seen_enough = dfaux["nr_app_tot"] > 30
    else:
        candidate = pd.Series(True, index=dfaux.index)
        seen_enough = dfaux["freq_corpus"] > 0.01

    frequent_in_text = (dfaux["freq_text"] > 0.01) & (dfaux["nr_appearances"] > 5)
    dfaux["highlight"] = candidate & (seen_enough | frequent_in_text)

    # Top up to 10 highlighted words with the most frequent remaining ones.
    missing = 10 - int(dfaux["highlight"].sum())
    if missing > 0:
        pool = dfaux[candidate & ~dfaux["highlight"]]
        extra = (
            pool.sort_values("freq_corpus", ascending=False)
            .iloc[:missing][["family", "base"]]
            .drop_duplicates()
        )
        tohighlight = set(extra["family"] + "_" + extra["base"])
        keys = dfaux["family"] + "_" + dfaux["base"]
        dfaux["highlight"] = dfaux["highlight"] | keys.isin(tohighlight)

    # Flag every token whose base form was selected.
    selected = dfaux.loc[dfaux["highlight"], ["family", "base"]]
    for token in pd.merge(dfvocab, selected, on=["family", "base"])["token"]:
        marknewword(token)

    if not new_user:
        dfaux["nr_appearances"] = dfaux["nr_app_tot"]
        dfaux["freq_corpus"] = dfaux[["freq_corpus_old", "freq_corpus"]].max(axis=1)
        dfaux["highlighted"] = dfaux["highlighted"] | dfaux["highlight"]
    else:
        dfaux["highlighted"] = dfaux["highlight"]
    dfaux = dfaux[["family", "base", "nr_appearances", "freq_corpus", "highlighted"]]
    os.makedirs(os.path.dirname(history_path), exist_ok=True)
    dfaux.to_csv(history_path, sep="|")

    # Profile rows: [number of texts processed, number of counted words].
    if not new_user:
        profile.iloc[0, 0] = profile.iloc[0, 0] + 1
        profile.iloc[1, 0] = profile.iloc[1, 0] + nr_words
        profile.to_csv(profile_path, sep="|")
    else:
        pd.Series([1, nr_words]).to_csv(profile_path, sep="|")

    return doc


##################################################
### FORMAT
##################################################

# NOTE(review): this helper receives a Token and returns None, so it does not
# look usable as a pipeline component; the registration is kept only so the
# name "textformatter" stays registered.  computeformat calls it directly.
@Language.component("textformatter")
def formatsingletoken(token):
    """Store three HTML renderings of ``token`` on its extension attributes.

    * ``html_formatted_SN``: verbs in bold; nouns decorated by case
      (underline style), gender (colour) and number (overline style).
    * ``html_formatted_SV``: verbs in bold and decorated by verb form
      (colour), subjunctive mood (wavy underline) and past tense (italic);
      nouns keep only their Nom/Acc/Dat underline plus the plural overline.
    * ``html_formatted_new``: the text, in bold when ``newword_`` is truthy.

    The token text is HTML-escaped first so that characters such as ``<`` or
    ``&`` in the source text cannot break the generated markup.
    """
    import html  # local import keeps this block self-contained

    attributes = token._.gram_features_
    labels = attributes["aux_labels"]
    family = attributes["family"]
    text = html.escape(token.text)

    formatted = text
    formatted_v = text

    if family == "VERB":
        formatted = "<b>{}</b>".format(formatted)
        formatted_v = "<b>{}</b>".format(formatted_v)
        # Applied in this order; "Ind" and "Pres" intentionally add nothing.
        verb_styles = (
            ("Fin", '<span style="color: blue">{}</span>'),
            ("Part", '<span style="color: pink">{}</span>'),
            ("Inf", '<span style="color: orange">{}</span>'),
            ("Sub", '<span class="wavyUnderline">{}</span>'),
            ("Past", '<i>{}</i>'),
        )
        for label, template in verb_styles:
            if label in labels:
                formatted_v = template.format(formatted_v)
    elif family == "NOUN":  # previously this was a plain else
        # Case: the genitive is shown in the SN rendering only.
        case_styles = (
            ("Nom", '<span class="doubleUnderline">{}</span>', True),
            ("Acc", '<span class="normalUnderline">{}</span>', True),
            ("Dat", '<span class="dashedUnderline">{}</span>', True),
            ("Gen", '<span class="dottedUnderline">{}</span>', False),
        )
        for label, template, shown_in_sv in case_styles:
            if label in labels:
                formatted = template.format(formatted)
                if shown_in_sv:
                    formatted_v = formatted

        # Gender colours, SN rendering only.
        gender_styles = (
            ("Masc", '<span style="color: blue">{}</span>'),
            ("Fem", '<span style="color: pink">{}</span>'),
            ("Neut", '<span style="color: orange">{}</span>'),
        )
        for label, template in gender_styles:
            if label in labels:
                formatted = template.format(formatted)

        # Number.
        if "Plur" in labels:
            formatted = '<span class="doubleOverline">{}</span>'.format(formatted)
            if any(case in labels for case in ("Nom", "Dat", "Acc")):
                formatted_v = '<span class="doubleOverline">{}</span>'.format(formatted_v)
        if "Sing" in labels:
            formatted = '<span class="normalOverline">{}</span>'.format(formatted)

    formatted_new = "<b>{}</b>".format(text) if token._.newword_ else text

    token._.html_formatted_SN = formatted
    token._.html_formatted_SV = formatted_v
    token._.html_formatted_new = formatted_new

@Language.component("translation")
def gettranslatednumbered(text_with_n):
    """Number the lines of a text and build a translated copy.

    The text is split on ``"\n"``.  Every non-empty line is prefixed with its
    zero-padded line index in parentheses and terminated with ``</br>``;
    empty lines become a bare ``</br>``.  Returns the tuple
    ``(numbered_original, numbered_translation)``.
    """
    translator = Translator()
    lines = text_with_n.split("\n")
    width = len(str(len(lines)))

    original_parts = []
    translated_parts = []
    for index, line in enumerate(lines):
        if line == "":
            original_parts.append("</br>")
            translated_parts.append("</br>")
            continue
        prefix = "(" + str(index).zfill(width) + ") "
        original_parts.append(prefix + line + "</br>")
        translated_parts.append(prefix + translator.translate(line).text + "</br>")

    return "".join(original_parts), "".join(translated_parts)

@Language.component("format")
def computeformat(doc):
    """Pipeline component: build the three HTML renderings of the document.

    Each token is formatted with ``formatsingletoken``; the per-token HTML is
    then concatenated, with a single space in front of every token that is
    not punctuation.  The results are stored on ``doc._.html_formatted_SN``,
    ``doc._.html_formatted_SV`` and ``doc._.html_formatted_new``.
    """
    sn_parts = []
    sv_parts = []
    new_parts = []

    for token in doc:
        formatsingletoken(token)
        separator = "" if token.is_punct else " "
        sn_parts.append(separator + token._.html_formatted_SN)
        sv_parts.append(separator + token._.html_formatted_SV)
        new_parts.append(separator + token._.html_formatted_new)

    doc._.html_formatted_SN = "".join(sn_parts)
    doc._.html_formatted_SV = "".join(sv_parts)
    doc._.html_formatted_new = "".join(new_parts)

    return doc

@Language.component("translate")
def translate(doc):
    """Pipeline component: store the translation of the whole text on the doc.

    Uses the module-level ``translator`` and writes the translated text to
    ``doc._.translation``.
    """
    translated = translator.translate(doc.text)
    doc._.translation = translated.text
    return doc

In [2]:
# Name of the current user; processvocabulary reads this module-level name to
# locate the history/profile files under "userdata/<user>/".
user = "pablo"
# Load the German model and append the custom components registered above.
nlp = spacy.load('de_dep_news_trf')
# Order matters: "vocabulary" and "format" read the gram_features_ written by
# "grammarfeatures", and "format" reads the newword_ flag set by "vocabulary".
nlp.add_pipe("grammarfeatures")
nlp.add_pipe("vocabulary")
nlp.add_pipe("format")
nlp.add_pipe("translate")

  from .autonotebook import tqdm as notebook_tqdm


<function __main__.translate(doc)>

In [3]:
text = """
In der Türkei läuft die entscheidende Stichwahl um das Präsidentenamt. Die Wahllokale öffneten am Sonntagmorgen (Ortszeit). Wahlberechtigt sind insgesamt mehr als 64 Millionen Türken. Rund dreieinhalb Millionen im Ausland lebende Staatsbürger konnten bereits zwischen dem 20. und 24. Mai abstimmen. Es ist die erste Stichwahl in der Geschichte des Landes.

Der islamisch-konservative Staatschef Recep Tayyip Erdogan hatte in der ersten Wahlrunde vor zwei Wochen deutlich besser abgeschnitten als von Meinungsforschern erwartet, verfehlte mit 49,5 Prozent der Stimmen aber knapp die für einen Sieg erforderliche absolute Mehrheit. Sein sozialdemokratischer Herausforderer Kemal Kilicdaroglu kam auf 44,9 Prozent.

Vor dem ersten Durchgang am 14. Mai waren Oppositionsführer Kilicdaroglu, der an der Spitze eines Sechs-Parteien-Bündnisses antritt, gute Siegeschancen zugesprochen worden. In der Stichwahl gilt nun aber Erdogan als klarer Favorit, zumal der drittplatzierte Kandidat Sinan Ogan eine Wahlempfehlung für den Amtsinhaber aussprach.

Richtungsweisende Wahl
Erdogan (69) ist bereits seit 20 Jahren an der Macht. Kritiker befürchten, dass die Türkei mit ihren rund 85 Millionen Einwohnern vollends in die Autokratie abgleiten könnte, sollte er erneut gewinnen. Kilicdaroglu (74) versprach, das Land zu demokratisieren.

Zuletzt hatte das Thema Migration den Wahlkampf bestimmt. Vor allem Kilicdaroglu drängte auf die Rückführung von Flüchtlingen nach Syrien. Weiteres Thema war die schlechte wirtschaftliche Lage mit einer massiven Inflation.
"""

In [3]:
text = """Prinzipiell findet Maximilian Pichl von der Universität Kassel das auch gut: Im Sinne einer effektiven humanitären Hilfe sei ein schnelles Handeln der EU wichtig, "zumal sie in der Vergangen­heit bei der Aufnahme von Flüchtlingen oft uneinheitlich und repressiv vorging", schreibt der Rechts- und Politikwissenschaftler im Grundrechte-Report, jährlich veröffentlicht seit 1997 von Bürgerrechts- und Menschenrechtsorganisationen. Dazu gehören Pro Asyl, die Humanistische Union und die Internationale Liga für Menschrechte."""

In [4]:
doc = nlp(text)

In [6]:
print(doc._.translation)

In Turkey, the decisive run-off election for the presidency is underway. The polling stations opened on Sunday morning (local time). A total of more than 64 million Turks are entitled to vote. Around three and a half million citizens living abroad were able to vote between May 20th and 24th. It is the first runoff election in the country's history.

The Islamic-conservative head of state Recep Tayyip Erdogan did significantly better than pollsters had expected in the first round of the election two weeks ago, but with 49.5 percent of the votes just missed the absolute majority required for victory. His Social Democratic challenger Kemal Kilicdaroglu got 44.9 percent.

Before the first round on May 14, opposition leader Kilicdaroglu, who is leading a six-party alliance, had been given good chances of victory. However, Erdogan is now the clear favorite in the run-off election, especially since the third-placed candidate, Sinan Ogan, made a recommendation for the incumbent.

Pioneering ch

In [None]:


    #Token.set_extension('svp', default="",force=True)
    Token.set_extension("html_formatted_SN", default="",force=True)
    Token.set_extension("html_formatted_SV", default="",force=True)
    Token.set_extension("html_formatted_new", default="",force=True)
    Token.set_extension("newword",default=False,force=True)
    nlp = spacy.load("de_core_news_lg")

In [None]:
############################################################
########## MEANING AND GRAMMAR
############################################################

from googletrans import Translator
import pandas as pd
import spacy
import random
from datetime import date
import sys
import codecs
import sqlite3


# new step pipeline https://spacy.io/usage/processing-pipelines
def process_verb(token):
    """Annotate a verb token and its direct children with grammar features.

    The token's ``gram_features_`` get a fresh ``aux_labels`` list holding the
    values of its ``Mood``, ``Tense`` and ``VerbForm`` morphology, and the
    token becomes family ``VERB`` / subtype ``MAIN`` unless an earlier call
    already assigned it a family.  Each child is then inspected:

    * dependency ``svp``  -> child marked ``VERB``/``SVP``, verb labelled
      ``SEPARABLE_VERB`` and the particle text stored in ``aux_string``;
    * part of speech ``AUX`` -> child marked ``VERB``/``AUX``;
    * reflexive pronoun -> child marked ``VERB``/``REFLEXPRON`` and the verb
      labelled ``REFLEXIVE_VERB``.

    Args:
        token: a spaCy-like token exposing ``_.gram_features_``,
            ``_.question_labels_``, ``morph.to_dict()``, ``children``,
            ``dep_``, ``pos_`` and ``text``.
    """
    features = token._.gram_features_
    features["aux_labels"] = []

    # Only words without a previous category can become a main verb.
    if features["family"] == "":
        features["family"] = "VERB"
        features["subtype"] = "MAIN"

    morphoverb = token.morph.to_dict()
    features["aux_labels"] += [
        value for key, value in morphoverb.items()
        if key in ("Mood", "Tense", "VerbForm")
    ]

    for child in token.children:
        # Separable verb particle.
        if child.dep_ == "svp":
            child._.gram_features_["family"] = "VERB"
            child._.gram_features_["subtype"] = "SVP"
            child._.question_labels_ = ["SVP"]

            features["aux_labels"] += ["SEPARABLE_VERB"]
            features["aux_string"] = child.text
        # Auxiliary verb.
        elif child.pos_ == "AUX":
            child._.gram_features_["family"] = "VERB"
            child._.gram_features_["subtype"] = "AUX"
        # Possible reflexive pronoun.
        else:
            morphochild = child.morph.to_dict()
            if morphochild.get("Reflex", "No") == "Yes":
                # Reflexive pronoun detected by the pipeline itself.
                is_reflexive = True
            else:
                # Fallback: an accusative/dative personal pronoun that agrees
                # with the verb in number and person.  The original compared
                # "PronType" against the case values, which could never match;
                # the case lives under the "Case" key.  The heuristic is
                # limited to 1st/2nd person, where the reflexive and personal
                # pronoun forms coincide; in the 3rd person an agreeing
                # object pronoun ("er sieht ihn") is not reflexive.
                agrees = all(
                    feature in morphoverb
                    and morphochild.get(feature) == morphoverb[feature]
                    for feature in ("Number", "Person")
                )
                is_reflexive = (
                    agrees
                    and morphoverb.get("Person") in ("1", "2")
                    and child.pos_ == "PRON"
                    and morphochild.get("PronType") == "Prs"
                    and morphochild.get("Case") in ("Acc", "Dat")
                )
            if is_reflexive:
                child._.gram_features_["family"] = "VERB"
                child._.gram_features_["subtype"] = "REFLEXPRON"
                features["aux_labels"] += ["REFLEXIVE_VERB"]

def process_nonverb(token):
    """Annotate a token that is not handled as a verb.

    ``aux_labels`` is always reset.  Nominal parts of speech (NOUN, PRON,
    PROPN, ADJ, DET) collect their Gender/Case/Number values and, when still
    unclassified, become family ``NOUN``.  Every other part of speech except
    X, SPACE and PUNCT becomes family ``OTHERS`` when still unclassified and
    receives the question labels ``["OTHERS", <pos>]``.
    """
    features = token._.gram_features_
    features["aux_labels"] = []
    pos = token.pos_
    still_unclassified = features["family"] == ""

    # Noun-like tokens.
    if pos in ("NOUN", "PRON", "PROPN", "ADJ", "DET"):
        if still_unclassified:
            features["family"] = "NOUN"
            features["subtype"] = pos
        features["aux_labels"].extend(
            value
            for key, value in token.morph.to_dict().items()
            if key in ("Gender", "Case", "Number")
        )
        return

    # Tokens that carry no grammar information at all.
    if pos in ("X", "SPACE", "PUNCT"):
        return

    # Everything else.
    if still_unclassified:
        features["family"] = "OTHERS"
        features["subtype"] = pos
        token._.question_labels_ = ["OTHERS", pos]

def process_all_tokens(doc):
    """Run the grammar annotation over every token of ``doc`` and return it.

    Tokens tagged VERB or AUX go through ``process_verb``; all others go
    through ``process_nonverb``.
    """
    for token in doc:
        annotate = process_verb if token.pos_ in ("VERB", "AUX") else process_nonverb
        annotate(token)
    return doc



def gettranslatednumbered(text_with_n):
    """Number the lines of a text and build a translated copy.

    The text is split on ``"\n"``.  Every non-empty line is prefixed with its
    zero-padded line index in parentheses and terminated with ``</br>``;
    empty lines become a bare ``</br>``.  Returns the tuple
    ``(numbered_original, numbered_translation)``.
    """
    translator = Translator()
    lines = text_with_n.split("\n")
    width = len(str(len(lines)))

    original_parts = []
    translated_parts = []
    for index, line in enumerate(lines):
        if line == "":
            original_parts.append("</br>")
            translated_parts.append("</br>")
            continue
        prefix = "(" + str(index).zfill(width) + ") "
        original_parts.append(prefix + line + "</br>")
        translated_parts.append(prefix + translator.translate(line).text + "</br>")

    return "".join(original_parts), "".join(translated_parts)

def numberparagraphs(text_with_n):
    """Return the text with every non-empty line numbered, as HTML.

    Lines are obtained by splitting on ``"\n"``.  A non-empty line becomes
    ``"(<index>) <line></br>"`` where the index is the line's position
    (blank lines included), zero-padded to the width of the line count.
    An empty line becomes ``"</br>"``.
    """
    lines = text_with_n.split("\n")
    width = len(str(len(lines)))
    return "".join(
        "(" + str(index).zfill(width) + ") " + line + "</br>" if line != "" else "</br>"
        for index, line in enumerate(lines)
    )




##################################################
### VOCABULARY
##################################################

import wordfreq
import numpy as np
import simplemma
langdata = simplemma.load_data('de')

def basewordfreq(x):
    """Return the base-form candidates of a token and their best frequency.

    Returns a 4-tuple ``(token, family, candidates, max_frequency)``.
    Numbers, proper nouns, spaces, punctuation, ``X`` tokens and separable
    verb particles are not counted: for those the last three items are NaN.

    Candidates are the spaCy lemma, the surface text and the simplemma lemma
    (looked up in the module-level ``langdata``); for a verb labelled
    ``SEPARABLE_VERB`` the stored particle is prepended to each of them.
    """
    not_counted = (x, np.nan, np.nan, np.nan)

    if x.pos_ in ("NUM", "X", "PROPN", "SPACE", "PUNCT"):
        return not_counted

    features = x._.gram_features_
    if features["family"] == "VERB" and features["subtype"] == "SVP":
        return not_counted

    candidates = [x.lemma_, x.text, simplemma.lemmatize(x.text, langdata)]
    if "SEPARABLE_VERB" in features["aux_labels"]:
        particle = features["aux_string"]
        candidates = [particle + candidate for candidate in candidates]

    best_frequency = max(
        wordfreq.word_frequency(candidate, "de") for candidate in candidates
    )
    return x, features["family"], list(set(candidates)), best_frequency

def isanewword(x):
    """Flag a token (and its verb satellites) as new vocabulary.

    Sets ``newword`` to True and adds the ``"NEW"`` question label.  When the
    token is a main verb, its children already classified as reflexive
    pronoun or separable particle are flagged in the same way.

    The original appended ``"NEW"`` to the *parent* token when a child already
    had labels, so such children were never labelled and the parent collected
    duplicates; each target now receives the label itself, at most once.
    """
    targets = [x]
    features = x._.gram_features_
    if features["family"] == "VERB" and features["subtype"] == "MAIN":
        targets += [
            child
            for child in x.children
            if child._.gram_features_["family"] == "VERB"
            and child._.gram_features_["subtype"] in ("REFLEXPRON", "SVP")
        ]

    for target in targets:
        target._.newword = True
        # Build a fresh list instead of mutating in place, and never add the
        # label twice (the function can be called repeatedly for one token).
        if "NEW" not in target._.question_labels_:
            target._.question_labels_ = list(target._.question_labels_) + ["NEW"]
    
def processvocabulary(doc):
    """Pick the words of ``doc`` to present as new vocabulary.

    Reads ``history.csv`` and ``profile.csv`` from the working directory,
    counts the base forms found in ``doc``, chooses which ones to highlight,
    calls ``isanewword`` on the matching tokens and writes the updated
    history and profile back.  Returns None.

    Selection rules (thresholds unchanged from the original):

    * no readable history: corpus frequency > 0.01, or more than 5
      occurrences making up more than 1% of the counted words;
    * with history: not highlighted before and either more than 30
      occurrences overall or the same in-text criterion;
    * if fewer than 10 words qualify, the list is topped up with the
      remaining candidates of highest corpus frequency.

    Fixes over the original:

    * the original wrapped the whole returning-user path in a bare
      ``except`` and re-ran as "new user" on *any* error, silently
      overwriting the history; only failures to read the two files now
      select the new-user path, and the shared logic is written once;
    * words missing from the history have ``highlighted`` = NaN after the
      outer merge; they are now treated as "not highlighted yet";
    * only words present in this text can be selected;
    * the stored corpus frequency uses a NaN-skipping maximum (the builtin
      ``max([nan, x])`` returns NaN);
    * an empty ``doc`` is ignored instead of failing.
    """
    if len(doc) == 0:
        return

    try:
        history = pd.read_csv("history.csv", sep="|", index_col=0)
        profile = pd.read_csv("profile.csv", sep="|", index_col=0)
        new_user = False
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Missing or unreadable files: start a fresh history.
        new_user = True

    # One row per token, then one row per (token, base-form candidate).
    dfvocab = pd.Series(doc).apply(basewordfreq).apply(pd.Series)
    nr_words = len(dfvocab.dropna())
    dfvocab = dfvocab.explode(2)
    dfvocab.columns = ["token", "family", "base", "freq"]

    # Per base form: occurrences in this text and best corpus frequency.
    dfaux = dfvocab.groupby(["family", "base"]).agg({"freq": ["count", "max"]})
    dfaux.columns = ["nr_appearances", "freq_corpus"]
    dfaux = dfaux.reset_index()
    dfaux["freq_text"] = dfaux["nr_appearances"] / nr_words

    if not new_user:
        history.columns = ["family", "base", "nr_appearances_old", "freq_corpus_old", "highlighted"]
        dfaux = pd.merge(dfaux, history, on=["family", "base"], how="outer")
        dfaux["nr_appearances"] = dfaux["nr_appearances"].fillna(0)
        dfaux["nr_appearances_old"] = dfaux["nr_appearances_old"].fillna(0)
        dfaux["nr_app_tot"] = dfaux["nr_appearances"] + dfaux["nr_appearances_old"]
        # NaN (word not in history) becomes False, i.e. "never highlighted".
        dfaux["highlighted"] = dfaux["highlighted"].eq(True)
        candidate = ~dfaux["highlighted"] & (dfaux["nr_appearances"] > 0)
        seen_enough = dfaux["nr_app_tot"] > 30
    else:
        candidate = pd.Series(True, index=dfaux.index)
        seen_enough = dfaux["freq_corpus"] > 0.01

    frequent_in_text = (dfaux["freq_text"] > 0.01) & (dfaux["nr_appearances"] > 5)
    dfaux["highlight"] = candidate & (seen_enough | frequent_in_text)

    # Top up to 10 highlighted words with the most frequent remaining ones.
    missing = 10 - int(dfaux["highlight"].sum())
    if missing > 0:
        pool = dfaux[candidate & ~dfaux["highlight"]]
        extra = (
            pool.sort_values("freq_corpus", ascending=False)
            .iloc[:missing][["family", "base"]]
            .drop_duplicates()
        )
        tohighlight = set(extra["family"] + "_" + extra["base"])
        keys = dfaux["family"] + "_" + dfaux["base"]
        dfaux["highlight"] = dfaux["highlight"] | keys.isin(tohighlight)

    # Flag every token whose base form was selected.
    selected = dfaux.loc[dfaux["highlight"], ["family", "base"]]
    for token in pd.merge(dfvocab, selected, on=["family", "base"])["token"]:
        isanewword(token)

    if not new_user:
        dfaux["nr_appearances"] = dfaux["nr_app_tot"]
        dfaux["freq_corpus"] = dfaux[["freq_corpus_old", "freq_corpus"]].max(axis=1)
        dfaux["highlighted"] = dfaux["highlighted"] | dfaux["highlight"]
    else:
        dfaux["highlighted"] = dfaux["highlight"]
    dfaux = dfaux[["family", "base", "nr_appearances", "freq_corpus", "highlighted"]]
    dfaux.to_csv("history.csv", sep="|")

    # Profile rows: [number of texts processed, number of counted words].
    if not new_user:
        profile.iloc[0, 0] = profile.iloc[0, 0] + 1
        profile.iloc[1, 0] = profile.iloc[1, 0] + nr_words
        profile.to_csv("profile.csv", sep="|")
    else:
        pd.Series([1, nr_words]).to_csv("profile.csv", sep="|")
        
        
##################################
## FORMAT
##################################

def formattokensSNSV(x):
    """Store three HTML renderings of token ``x`` on its extension attributes.

    * ``html_formatted_SN``: verbs in bold; nouns decorated by case
      (underline style), gender (colour) and number (overline style).
    * ``html_formatted_SV``: verbs in bold and decorated by verb form
      (colour), subjunctive mood (wavy underline) and past tense (italic);
      nouns keep only their Nom/Acc/Dat underline plus the plural overline.
    * ``html_formatted_new``: the text, in bold when the token is flagged as
      a new word.

    Fixes over the original:

    * the new-word flag is written by ``isanewword`` as ``newword`` but was
      read here as ``newword_``; both names are honoured now;
    * the token text is HTML-escaped so that characters such as ``<`` or
      ``&`` in the source text cannot break the generated markup.
    """
    import html  # local import keeps this block self-contained

    attributes = x._.gram_features_
    labels = attributes["aux_labels"]
    family = attributes["family"]
    text = html.escape(x.text)

    formatted = text
    formatted_v = text

    if family == "VERB":
        formatted = "<b>{}</b>".format(formatted)
        formatted_v = "<b>{}</b>".format(formatted_v)
        # Applied in this order; "Ind" and "Pres" intentionally add nothing.
        verb_styles = (
            ("Fin", '<span style="color: blue">{}</span>'),
            ("Part", '<span style="color: pink">{}</span>'),
            ("Inf", '<span style="color: orange">{}</span>'),
            ("Sub", '<span class="wavyUnderline">{}</span>'),
            ("Past", '<i>{}</i>'),
        )
        for label, template in verb_styles:
            if label in labels:
                formatted_v = template.format(formatted_v)
    elif family == "NOUN":  # previously this was a plain else
        # Case: the genitive is shown in the SN rendering only.
        case_styles = (
            ("Nom", '<span class="doubleUnderline">{}</span>', True),
            ("Acc", '<span class="normalUnderline">{}</span>', True),
            ("Dat", '<span class="dashedUnderline">{}</span>', True),
            ("Gen", '<span class="dottedUnderline">{}</span>', False),
        )
        for label, template, shown_in_sv in case_styles:
            if label in labels:
                formatted = template.format(formatted)
                if shown_in_sv:
                    formatted_v = formatted

        # Gender colours, SN rendering only.
        gender_styles = (
            ("Masc", '<span style="color: blue">{}</span>'),
            ("Fem", '<span style="color: pink">{}</span>'),
            ("Neut", '<span style="color: orange">{}</span>'),
        )
        for label, template in gender_styles:
            if label in labels:
                formatted = template.format(formatted)

        # Number.
        if "Plur" in labels:
            formatted = '<span class="doubleOverline">{}</span>'.format(formatted)
            if any(case in labels for case in ("Nom", "Dat", "Acc")):
                formatted_v = '<span class="doubleOverline">{}</span>'.format(formatted_v)
        if "Sing" in labels:
            formatted = '<span class="normalOverline">{}</span>'.format(formatted)

    # getattr with a default tolerates whichever of the two flag names is
    # not available on this token.
    is_new = getattr(x._, "newword", False) or getattr(x._, "newword_", False)
    formatted_new = "<b>{}</b>".format(text) if is_new else text

    x._.html_formatted_SN = formatted
    x._.html_formatted_SV = formatted_v
    x._.html_formatted_new = formatted_new

def computeformat(doc):
    """Apply ``formattokensSNSV`` to every token of ``doc`` (returns None)."""
    for token in doc:
        formattokensSNSV(token)
        
def replacequestion(doc, qcat, qcatno=""):  # TODO: add a probability of replacing a word or not
    """Build a fill-in-the-blank exercise from ``doc``.

    Every alphabetic, non-punctuation token whose question labels contain
    ``qcat`` but not ``qcatno`` is replaced by a blank; the removed words
    (lower-cased, shuffled) are listed after the text.  Punctuation is
    appended without a leading space, every other token with one.

    Returns the exercise text followed by the word list.
    """
    pieces = []
    wordlist = []
    for token in doc:
        if token.is_punct == True:
            pieces.append(token.text)
            continue
        is_blank = (
            token.is_alpha
            and (qcat in token._.question_labels_)
            and (qcatno not in token._.question_labels_)
        )
        if is_blank:
            pieces.append(" ________ ")
            wordlist.append(token.text.lower())
        else:
            pieces.append(" " + token.text)
    random.shuffle(wordlist)

    footer = """

    You should use the words in the following list:

    """ + str(wordlist)
    return "".join(pieces) + footer

def printFormattedText(doc):
    """Render ``doc`` as a list of nine complete HTML documents.

    Order of the returned list:

    0-1. the two values returned by ``gettranslatednumbered(doc.text)``,
         inserted into the page template as they are;
    2-4. the concatenated per-token ``html_formatted_SN``,
         ``html_formatted_SV`` and ``html_formatted_new`` strings;
    5-8. the ``replacequestion`` exercises for "SVP", "NEW", "ADP" and
         "OTHERS" (the last one excluding tokens labelled "ADP").

    Entries 2-8 are passed through ``numberparagraphs`` before being placed
    in the template. Every page embeds the same stylesheet.
    """
    css = """
    /* css here */

{
  border: 1px dotted black;
}

p.question {
  font-family: Arial, sans-serif;
  font-size:20px;
  color: #2E2E2E;
  margin-bottom:0px;
}

h2.quizHeader {
  font-family: Arial, sans-serif;
  font-weight:normal;
  font-size:25px;
  line-height: 27px;
  margin: 24px 0 12px 0;
  padding: 0 0 4px 0;
  border-bottom: 1px solid #a2a2a2;
}

h2.quizScore{
  font-family: Arial, sans-serif;
  font-size:25px;
}

div.quizAnswers{
  font-family: Arial, sans-serif;
  font-size:16px;
  color: #424242;
  padding: 4px 0 4px 0;
}

label {
  font-family: Arial, sans-serif;
  font-size:14px;
  color: #424242;
  vertical-align:top;
}

input.answer[type="radio"] {
  margin-bottom: 10px;
}

input.quizSubmit[type="submit"] {
  -webkit-background-clip: border-box;
  -webkit-background-origin: padding-box;
  -webkit-background-size: auto;
  -webkit-transition-delay: 0s, 0s;
  -webkit-transition-duration: 0.2s, 0.2s;
  -webkit-transition-property: color, background-color;
  -webkit-transition-timing-function: ease, ease;
  box-shadow: rgba(0, 0, 0, 0.498039) 0px 0px 5px 0px;
  color: #ffffff;
  background-color: #c30b0a;
  margin: 0;
  border: 0;
  outline: 0;
  text-transform:uppercase;
  height:35px;
  width:85px;
  border: 1px solid #5E5E5E;
  border-radius:5px;

 }

input.quizSubmit[type="submit"]:hover {
  color: #ffffff;
  background: #680f11;
  text-decoration: none;
}

table {
  background-color: #F2F2F2;
  border:1px solid #BDBDBD;
  border-radius:5px;
  padding:10px;
  padding-left:25px;
  box-shadow: rgba(0, 0, 0, 0.498039) 0px 0px 1px 0px;
}

th {

}

tr {

}

td {

}

.submitter {
	  width:85px;
}

.hide {
	  display:none;
}
.normalUnderline {
    text-decoration:underline #000;
}

.normalOverline {
    text-decoration: overline 1px #000;
}

.wavyUnderline {
    text-decoration:underline wavy #000;
}

.wavyOverline {
    text-decoration: overline wavy 1px #000;
}

.doubleOverline {
    text-decoration: overline 1px double #000;
}

.doubleUnderline {
    text-decoration:underline #000;
    border-bottom: 1px solid #000;
}

.dottedUnderline {
    border-bottom: 1px dotted #000;
    text-decoration: none;
}

.dashedUnderline {
    border-bottom: 1px dashed #000;
    text-decoration: none;
}
/*SFS light red = #c30b0a;
SFS dark red = #9f2026; */

    """
    # Page template: first slot takes the stylesheet, second the body text.
    page = '''<!DOCTYPE html><html lang="en"><head><style>{}</style></head><body><p style="line-height: 200%">{}</p></body></html>'''

    # Exercises are generated first, in this fixed order.
    exercise_texts = [
        replacequestion(doc,"SVP"),
        replacequestion(doc,"NEW"),
        replacequestion(doc,"ADP"),
        replacequestion(doc,"OTHERS","ADP"),
    ]

    # Collect the three per-token renderings; punctuation gets no leading space.
    sn_parts = []
    sv_parts = []
    new_parts = []
    for token in doc:
        gap = "" if token.is_punct else " "
        sn_parts.append(gap + token._.html_formatted_SN)
        sv_parts.append(gap + token._.html_formatted_SV)
        new_parts.append(gap + token._.html_formatted_new)

    final_body, final_t_body = gettranslatednumbered(doc.text)

    numbered_sources = ["".join(sn_parts), "".join(sv_parts), "".join(new_parts)]
    numbered_sources.extend(exercise_texts)
    numbered_pages = [
        page.format(css, numberparagraphs(source)) for source in numbered_sources
    ]

    leading_pages = [page.format(css, final_body), page.format(css, final_t_body)]
    return leading_pages + numbered_pages
        
        
##################################
## PIPELINE
##################################


def pipeline(folder, file, questions = False):
    """Read a UTF-8 text file, parse it with spaCy and return its HTML pages.

    Args:
        folder: Directory that contains the input file.
        file: Name of the text file inside ``folder``.
        questions: Not used by this function; kept so existing callers
            keep working.

    Returns:
        Whatever ``printFormattedText`` returns for the parsed document.

    Raises:
        AssertionError: If the parsed document has no ``SENT_START``
            annotation.
        OSError, UnicodeDecodeError: If the file cannot be opened or is not
            valid UTF-8.
    """
    print("1")
    # (Re-)register custom token attributes; force=True overwrites any
    # earlier registration under the same name.
    # NOTE(review): these names differ from the extensions registered at the
    # top of the file (gram_features_, question_labels_, newword_) -- verify
    # whether att_li / questioncategory / newword are still read anywhere.
    Token.set_extension('att_li', default={"family":"","subtype":"","aux_labels":[],"aux_string":""},force=True)
    Token.set_extension("html_formatted_SN", default="",force=True)
    Token.set_extension("html_formatted_SV", default="",force=True)
    Token.set_extension("html_formatted_new", default="",force=True)
    Token.set_extension("questioncategory",default=[],force=True)
    Token.set_extension("newword",default=False,force=True)
    nlp = spacy.load("de_core_news_lg")
    print("2")

    # The context manager closes the handle even when reading or decoding
    # fails. newline="" leaves line endings untranslated, which matches what
    # the previous codecs.open(..., "r", "utf-8") call delivered.
    with open(folder+"/"+file, "r", encoding="utf-8", newline="") as f:
        text = f.read()

    doc = nlp(text)
    assert doc.has_annotation("SENT_START")
    computeattli(doc)
    computeformat(doc)
    return printFormattedText(doc)


def pipeline2(text, folder = "Templates"):
    """Parse ``text`` with spaCy and write the nine generated HTML pages.

    The parsed document is also stored in the module-level global ``doc``.
    The pages returned by ``printFormattedText`` are written, in order, to
    ``original.html``, ``translation.html``, ``SN.html``, ``SV.html``,
    ``newwords.html``, ``qsvp.html``, ``qnew.html``, ``qadp.html`` and
    ``qoth.html`` in the current working directory.

    Args:
        text: The text to analyse.
        folder: Not used by this function; kept so existing callers keep
            working.

    Raises:
        AssertionError: If the parsed document has no ``SENT_START``
            annotation.
    """
    global doc
    print("1")
    # (Re-)register custom token attributes; force=True overwrites any
    # earlier registration under the same name.
    # NOTE(review): these names differ from the extensions registered at the
    # top of the file (gram_features_, question_labels_, newword_) -- verify
    # whether att_li / questioncategory / newword are still read anywhere.
    Token.set_extension('att_li', default={"family":"","subtype":"","aux_labels":[],"aux_string":""},force=True)
    Token.set_extension("html_formatted_SN", default="",force=True)
    Token.set_extension("html_formatted_SV", default="",force=True)
    Token.set_extension("html_formatted_new", default="",force=True)
    Token.set_extension("questioncategory",default="",force=True)
    Token.set_extension("newword",default=False,force=True)
    nlp = spacy.load("de_core_news_lg")
    print("2")

    doc = nlp(text)
    assert doc.has_annotation("SENT_START")
    computeattli(doc)

    processvocabulary(doc)
    computeformat(doc)
    htmls =  printFormattedText(doc)

    # Output file for each entry of htmls, by position.
    filenames = (
        "original.html",
        "translation.html",
        "SN.html",
        "SV.html",
        "newwords.html",
        "qsvp.html",
        "qnew.html",
        "qadp.html",
        "qoth.html",
    )
    # NOTE(review): files are written with the platform default encoding, as
    # before; consider an explicit encoding if non-ASCII output must be
    # portable across systems.
    for index, filename in enumerate(filenames):
        # The context manager closes each file even if the write fails.
        with open(filename, "w") as fw:
            fw.write(htmls[index])