Installation dependencies:
- pip install Flask
- pip install gevent

In [1]:
import os
import sys
import subprocess

from flask import Flask
from flask import request
from flask import jsonify
from gevent.pywsgi import WSGIServer

In [2]:
import sys
import nltk
import pickle
import pandas as pd
from nltk.corpus import stopwords

# Spanish stop-word list from NLTK; clean() passes it to remove_stop_words().
stop_words = list(stopwords.words('spanish'))
# Tab-separated lemma file read by lemmatize_df().
lemmatization_script = r'C:\Document Recognition REST Server\lemmatization-es.txt'
# Pickled object loaded by discretization() and used via its transform() method.
path_tfidf = r'C:\Document Recognition REST Server\tfidf.pickle'


def clean_1(df):
    """Normalise line breaks and strip the legal disclaimer from the raw text.

    Reads ``df['attachment_body']`` and stores the result in ``df['Content1']``:
    carriage returns and newlines become spaces, the disclaimer block is
    removed, double quotes are dropped and slashes become spaces.

    Args:
        df: DataFrame with a string column ``attachment_body``.

    Returns:
        The same DataFrame object, with the ``Content1`` column added.
    """
    # Legal note at the bottom of the document: everything from
    # '"En relación' up to and including the first ')"' that follows.
    disclaimer = r'(?="En relación)(.*?)(\)")'

    content = df['attachment_body'].str.replace("\r", " ", regex=False)
    content = content.str.replace("\n", " ", regex=False)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would silently leave the disclaimer in place.
    content = content.str.replace(disclaimer, "", regex=True)
    content = content.str.replace('"', "", regex=False)
    content = content.str.replace('/', " ", regex=False)

    df['Content1'] = content
    return df


def clean_2(df):
    """Lower-case the text and expand the abbreviations listed below.

    Reads ``df['Content1']`` and stores the result in ``df['Content2']``.
    Every replacement is a literal substring replacement applied in the
    listed order; none of the patterns is meant to be a regular expression
    (the dots in ``n.i.g.`` etc. are real dots).

    Args:
        df: DataFrame with a string column ``Content1``.

    Returns:
        The same DataFrame object, with the ``Content2`` column added.
    """
    replacements = [
        ('tlf', "teléfono"),
        (':', " "),
        (',', " "),
        # NOTE(review): colons were already replaced two steps above, so this
        # pattern can never match. The order is kept unchanged so the output
        # stays identical to what downstream consumers currently receive.
        ('correo electrónico:', "email"),
        ('jdo', "juzgado"),
        ('xdo', "juzgado"),
        ('upad ', "juzgado "),
        ('upad', "juzgado"),
        ('j.primera', "juzgado primera"),
        ('j. primera', "juzgado primera"),
        ('inst.', "instancia "),
        ('n.i.g.', "nig"),
        ('d.a.c.', "dac"),
        ('l.e.c.', "lec"),
        ('ª', "a"),
    ]

    content = df['Content1'].str.lower()
    for old, new in replacements:
        # regex=False pins literal matching regardless of the pandas version's
        # default, so '.' never acts as a wildcard.
        content = content.str.replace(old, new, regex=False)

    df['Content2'] = content
    return df


def clean_3(df):
    """Drop stand-alone ASCII letters and blank out selected punctuation.

    Reads ``df['Content2']`` and stores the result in ``df['Content3']``.

    Args:
        df: DataFrame with a string column ``Content2``.

    Returns:
        The same DataFrame object, with the ``Content3`` column added.
    """
    # Single letters standing on their own (word boundary on both sides).
    single_letter = r"\b[a-zA-Z]\b"
    # Special characters to be replaced by a space; inside a character class
    # each of them is matched literally.
    punctuation = r"[?!.;%&+]"

    # regex=True is explicit because pandas >= 2.0 defaults to literal matching.
    content = df['Content2'].str.replace(single_letter, "", regex=True)
    df['Content3'] = content.str.replace(punctuation, " ", regex=True)
    return df


def clean_4(df):
    """Remove the footer note and strip parentheses and hyphens.

    Reads ``df['Content3']`` and stores the result in ``df['Content4']``.

    Args:
        df: DataFrame with a string column ``Content3``.

    Returns:
        The same DataFrame object, with the ``Content4`` column added.
    """
    # Legal note at the bottom of the document: from "código" up to, but not
    # including, the next "página".
    footer = r"(?=código)(.*?)(?=página)"

    # regex=True is explicit because pandas >= 2.0 defaults to literal matching.
    content = df['Content3'].str.replace(footer, "", regex=True)
    # Parentheses and hyphen are listed in a character class so they are
    # always matched literally.
    df['Content4'] = content.str.replace(r"[()\-]", "", regex=True)
    return df


def remove_stop_words(df, stop_words):
    """Delete every whole-word occurrence of the given stop words.

    Reads ``df['Content4']`` and stores the result in ``df['Content5']``.
    Only the words themselves are removed; surrounding whitespace is left
    untouched.

    Args:
        df: DataFrame with a string column ``Content4``.
        stop_words: Iterable of words to remove. Words are matched literally.

    Returns:
        The same DataFrame object, with the ``Content5`` column added.
    """
    import re  # local import: the surrounding file does not import re

    content = df['Content4']
    escaped = [re.escape(word) for word in stop_words if word]
    if escaped:
        # One alternation handles all words in a single pass instead of one
        # full pass over the column per stop word.
        pattern = r"\b(?:" + "|".join(escaped) + r")\b"
        # regex=True is explicit because pandas >= 2.0 defaults to literal
        # matching, which would never match the \b anchors.
        content = content.str.replace(pattern, '', regex=True)

    df['Content5'] = content
    return df


def remove_digits(df):
    """Remove every run of digits.

    Reads ``df['Content5']`` and stores the result in ``df['Content6']``.

    Args:
        df: DataFrame with a string column ``Content5``.

    Returns:
        The same DataFrame object, with the ``Content6`` column added.
    """
    # Raw string avoids the invalid '\d' escape; regex=True is explicit
    # because pandas >= 2.0 defaults to literal matching.
    df['Content6'] = df['Content5'].str.replace(r'\d+', '', regex=True)
    return df


def lemmatize(word, lemmaDict):
    """Return the entry stored for *word* in *lemmaDict*, or *word* itself if absent."""
    if word in lemmaDict:
        return lemmaDict[word]
    return word


# Parsed lemma dictionaries keyed by file path, so that a lemma file is read
# and parsed once per process instead of on every call.
_LEMMA_DICT_CACHE = {}


def _load_lemma_dict(path):
    """Parse the tab-separated file at *path* into a ``{second field: first field}`` dict."""
    cached = _LEMMA_DICT_CACHE.get(path)
    if cached is not None:
        return cached

    with open(path, 'rb') as f:
        # 'utf-8-sig' drops a leading byte-order mark when present. This
        # replaces the earlier hard-coded overwrite of the first field with
        # '1' (presumably a BOM workaround), which clobbered the first entry
        # of any file that did not start with that value.
        text = f.read().decode('utf-8-sig')

    lemma_dict = {}
    for line in text.replace('\r', '').split('\n'):
        fields = line.split('\t')
        # Lines without a tab (e.g. the trailing empty line) carry no mapping.
        if len(fields) > 1:
            lemma_dict[fields[1]] = fields[0]

    _LEMMA_DICT_CACHE[path] = lemma_dict
    return lemma_dict


def lemmatize_df(df):
    """Tokenise ``Content6`` and map each token through the lemma dictionary.

    Writes two columns: ``Content7`` holds the list of whitespace-separated
    tokens, ``Content8`` holds the tokens after lookup, joined by single
    spaces. Tokens missing from the dictionary are kept unchanged.

    Args:
        df: DataFrame with a string column ``Content6``.

    Returns:
        The same DataFrame object, with ``Content7`` and ``Content8`` added.
    """
    lemma_dict = _load_lemma_dict(lemmatization_script)

    # Collapse whitespace runs first so that splitting on a single space
    # yields clean tokens.
    df['Content7'] = df['Content6'].map(lambda x: " ".join(x.split()))
    df['Content7'] = df['Content7'].str.split(" ").tolist()
    df['Content8'] = df['Content7'].map(
        lambda tokens: " ".join(lemmatize(word, lemma_dict) for word in tokens)
    )
    return df


def clean(df):
    """Run the full cleaning pipeline on a single document.

    Despite its name, ``df`` is the raw value to clean: it is wrapped into a
    one-row DataFrame under the ``attachment_body`` column and then passed
    through clean_1..clean_4, stop-word removal, digit removal and
    lemmatisation, in that order.

    Returns:
        The one-row DataFrame produced by the last pipeline step.
    """
    frame = pd.DataFrame({'attachment_body': [df]})

    for step in (clean_1, clean_2, clean_3, clean_4):
        frame = step(frame)

    frame = remove_stop_words(frame, stop_words)
    frame = remove_digits(frame)
    return lemmatize_df(frame)

def discretization(df):
    """Clean the input and turn its ``Content8`` column into a dense array.

    The object unpickled from ``path_tfidf`` (presumably a fitted TF-IDF
    vectorizer -- confirm against the training code) is loaded on every call;
    its ``transform`` output is converted with ``toarray()``.

    NOTE(review): pickle executes code on load, so the file at ``path_tfidf``
    must come from a trusted source.
    """
    cleaned = clean(df)

    with open(path_tfidf, 'rb') as handle:
        vectorizer = pickle.load(handle)

    features = vectorizer.transform(cleaned['Content8'])
    return features.toarray()

In [3]:
import sys
import pickle

# Pickled classifier loaded by model_prediction().
path_model = r'C:\Document Recognition REST Server\RFC-md4-ne70.pickle'


def model_prediction(parsed_content):
    """Predict the class of the first sample and report its probability.

    The model is unpickled from ``path_model`` on every call.

    NOTE(review): pickle executes code on load, so the file at ``path_model``
    must come from a trusted source.

    Args:
        parsed_content: Feature array accepted by the model's ``predict`` and
            ``predict_proba`` methods; only the first row's result is used.

    Returns:
        The string ``"<predicted label>;<probability of that label>"``.
    """
    with open(path_model, 'rb') as data:
        model = pickle.load(data)

    prediction = model.predict(parsed_content)
    pred_proba = model.predict_proba(parsed_content)

    label = prediction[0]
    classes = getattr(model, 'classes_', None)
    if classes is not None:
        # predict_proba columns are ordered like model.classes_, so the
        # column is found by position of the label, not by the label's value.
        # Using the label as an index is only right when labels are exactly
        # 0..n-1.
        column = list(classes).index(label)
    else:
        # No class list available: fall back to using the label as the index.
        column = label

    output = [label, pred_proba[0][column]]
    return ';'.join(str(v) for v in output)

In [4]:
def ExecutePythonScript(path_to_script, arguments):
    """Run a Python script with the current interpreter and return its stdout.

    Args:
        path_to_script: Path of the script to run.
        arguments: A single string passed to the script as its one argument.

    Returns:
        The script's standard output as bytes.

    Raises:
        RuntimeError: If the script exits with a non-zero status. The message
            contains the command, the exit code and the captured output; the
            original ``CalledProcessError`` is attached as ``__cause__``.
    """
    command = [sys.executable, path_to_script, arguments]
    try:
        return subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(
            "command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)
        ) from e

In [5]:
# Paths of stand-alone scripts that can be run through ExecutePythonScript().
# Only test_script is used by an active route; the other two are referenced
# solely from commented-out code in run_model().
clean_script = r'C:\Document Recognition REST Server\CleanAndLemmatize.py'
mlmodel_script = r'C:\Document Recognition REST Server\MLmodel.py'
test_script = r'C:\Document Recognition REST Server\Test.py'

# Flask application object on which the routes below are registered.
app = Flask(__name__)

@app.route('/')
def index():
    """Root route: reply with a fixed plain-text greeting."""
    greeting = "Welcome on ML Server"
    return greeting

@app.route('/test', methods=['POST'])
def test():
    """Diagnostic route: log the request, run the test script, return a fixed reply.

    The request body is printed raw and as parsed JSON, then passed as the
    single argument to ``test_script``. The JSON response is a hard-coded
    payload; the script's output is not part of it.

    Raises:
        RuntimeError: Propagated from ExecutePythonScript when the script
            exits with a non-zero status.
    """
    # Work with input.
    body = request.data.decode("utf-8")
    print("body: ", body)

    print(request.is_json)
    # silent=True returns None for a non-JSON or malformed body instead of
    # aborting the request with an HTTP error (which recent Flask versions do
    # by default), so plain-text bodies keep working on this route.
    content = request.get_json(silent=True)
    print(content)

    # Work with script. The decoded output is currently unused, but the call
    # is kept so that a failing script still fails the request.
    output = ExecutePythonScript(test_script, body).decode("utf-8")

    # Work with output: fixed placeholder values.
    o_data = {
        'pred': 0.9,
        'confidence': 0.8,
    }
    resp = jsonify(o_data)
    resp.status_code = 200

    print("response: ", resp)
    return resp


@app.route('/m', methods=['POST'])
def run_model():
    """Classify the posted document text and return the model's answer.

    The raw request body is decoded, converted to features by
    discretization() and scored by model_prediction(), whose string result is
    printed and returned as the response body.

    NOTE(review): the "ansi" codec is, per the Python codecs documentation, a
    Windows-only alias -- confirm the deployment target before running this
    elsewhere.
    """
    raw_text = request.data.decode("ansi")

    # An earlier variant (left commented out in the original) ran
    # clean_script and mlmodel_script through ExecutePythonScript instead of
    # calling the in-process functions.
    features = discretization(raw_text)
    result = model_prediction(features)

    print("output: ", result)
    return result

In [None]:
# When run as a script, serve `app` with gevent's WSGIServer on port 5000.
# The empty host string is passed to WSGIServer unchanged.
if __name__ == '__main__':
    http_server = WSGIServer(('', 5000), app)
    # Hands control to the server loop; nothing after this line is expected
    # to run while the server is up.
    http_server.serve_forever()

body:  
False
None
response:  <Response 30 bytes [200 OK]>


::1 - - [2019-08-02 18:52:46] "POST /test HTTP/1.1" 200 138 0.078146
