In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from integrators.pod.client import PodClient
from integrators.data.basic import *
from integrators.importers.email import EmailImporter, DEFAULT_GMAIL_HOST
from integrators.importers.util import *
from integrators.imports import *
from integrators.data.schema import *
from io import BufferedReader, BytesIO
from time import sleep

import os
import signal
import subprocess

In [None]:
def get_importer_run(imap_user, imap_pw, host=None, port=993, max_number=10):
    """Build an `ImporterRun` carrying the settings for an IMAP email import.

    Args:
        imap_user: IMAP account name, stored as the run's `username`.
        imap_pw: IMAP account password, stored as the run's `password`.
        host: IMAP host name; `DEFAULT_GMAIL_HOST` is used when None.
        port: IMAP port, attached as the integer `port` attribute.
        max_number: value of the integer `max_number` attribute
            (presumably the maximum number of emails to import; confirm
            against EmailImporter).

    Returns:
        The `ImporterRun`, created with `progress=0` and one
        `genericAttribute` edge per setting.
    """
    # Resolve the default lazily so the signature does not depend on the
    # constant being importable at definition time.
    if host is None:
        host = DEFAULT_GMAIL_HOST

    importer_run = ImporterRun.from_data(progress=0, username=imap_user, password=imap_pw)
    importer_run.add_edge('genericAttribute', GenericAttribute(name='host', stringValue=host))
    importer_run.add_edge('genericAttribute', GenericAttribute(name='port', intValue=port))
    importer_run.add_edge('genericAttribute', GenericAttribute(name='max_number', intValue=max_number))
    return importer_run

In [None]:
# This helper exists so the importer can be tested locally with credentials
# stored on the developer's machine.
def get_gmail_creds():
    """Read the Gmail credentials from `HOME_DIR/.memri/credentials_gmail.txt`.

    The first line of the file is the username and the second the password.

    Returns:
        A two-element list `[username, password]`.

    Raises:
        ValueError: if the file has fewer than two lines.
    """
    creds_path = HOME_DIR / '.memri' / 'credentials_gmail.txt'
    # splitlines() also handles Windows line endings, which split("\n")
    # would leave as a trailing "\r" on each credential.
    creds = read_file(creds_path).splitlines()[:2]
    if len(creds) < 2:
        raise ValueError(
            f"Expected a username and a password on the first two lines of {creds_path}")
    return creds

In [None]:
# The pod keys can be supplied through the POD_DATABASE_KEY / POD_OWNER_KEY
# environment variables; the literals below are only the fallback values this
# notebook was originally written with.
# NOTE(review): keys stored in a notebook are readable by anyone who can see
# the file; prefer the environment variables for any non-throwaway pod.
pod_client = PodClient(
    database_key=os.environ.get(
        "POD_DATABASE_KEY",
        "4611381926941334880159360851454586449609479987074865708625425195"),
    owner_key=os.environ.get(
        "POD_OWNER_KEY",
        "5778183340629856722841524962349989924776547454184070224457769328"),
)

In [None]:
# Rebinds `pod_client`: the client created with explicit keys in the previous
# cell is replaced by one built from PodClient's default arguments.
pod_client = PodClient()

# Import data

In [None]:
# Build the importer and its run configuration from the locally stored
# credentials.
imap_user, imap_pw = get_gmail_creds()
importer = EmailImporter.from_data()
importer_run = get_importer_run(imap_user, imap_pw)

# Link the run to its importer and register the run in the pod before
# executing it.
importer_run.add_edge('importer', importer)
pod_client.create(importer_run)

importer.run(importer_run=importer_run, pod_client=pod_client)

Using, HOST: imap.gmail.com, PORT: 993
RUN STATUS: running
PROGRESS MESSAGE: downloading emails
PROGRESS: Importing 50.0% of 10 
PROGRESS: Importing 100.0% of 10 
PROGRESS MESSAGE: merging duplicate items
PROGRESS MESSAGE: creating accounts
PROGRESS MESSAGE: creating threads
Finished running EmailImporter (#None)
RUN STATUS: done


In [None]:
# assert importer_run.progress == 1.0
# assert importer_run.runStatus == "done"

# Query data from pod

In [None]:
# Query the pod for the items whose `_type` is 'EmailMessage'.
emails = pod_client.search_by_fields({"_type": 'EmailMessage'})

In [None]:
# Number of email items returned by the search.
len(emails)

10

# Configure annotation UI

## Prepare data

A helper that cleans the HTML of email messages; it is used later when preparing the documents that are uploaded to Doccano.

In [None]:
def clean_html(html):
    """Turn the HTML content of a message into plain text with tidied whitespace.

    The HTML is passed through `strip_html`, `replace_urls` and
    `replace_emails`; afterwards non-breaking spaces become newlines, the
    text is stripped, and three substitutions are applied in order:
    runs of two or more newlines become exactly two, "newline + space"
    becomes a newline, and "space + newline + space" becomes a newline.
    """
    text = replace_emails(replace_urls(strip_html(html)))
    text = re.sub(r'\xa0', "\n", text).strip()

    # Order matters: each substitution works on the output of the previous one.
    whitespace_rules = (
        (r'[\n]{2,}', '\n\n'),
        (r'\n ', r'\n'),
        (r' \n ', r'\n'),
    )
    for pattern, replacement in whitespace_rules:
        text = re.sub(pattern, replacement, text)
    return text

## Prepare Labels

In [None]:
# Label definitions uploaded to the Doccano project further down. Each entry
# carries the label text, a `suffix_key` (presumably the keyboard shortcut in
# the annotation UI; confirm against Doccano's label schema) and the
# background/text colors.
labels = [
    {"text": "Primary", "suffix_key": "i", "background_color": "#F9B9F2", "text_color": "#ffffff"},
    {"text": "Promotions", "suffix_key": "o", "background_color": "#B8B8F3", "text_color": "#ffffff"},
    {"text": "Social & Updates", "suffix_key": "s", "background_color": "#BAF2BB", "text_color": "#ffffff"},
]

## Start Doccano

In [None]:
def start_docanno(startup_wait=8):
    """Launch the `doccano` command and give it time to start.

    Args:
        startup_wait: seconds to sleep after launching, so the server is up
            before its API is used.

    Returns:
        The `subprocess.Popen` handle of the launched shell. The shell runs
        in its own session (`os.setsid`), so the whole process group can be
        signalled through this handle.
    """
    # stdout is discarded instead of piped: nothing ever reads the pipe, and a
    # child that keeps writing to an unread pipe blocks once the OS pipe buffer
    # is full.
    process = subprocess.Popen('doccano', stdout=subprocess.DEVNULL, shell=True, preexec_fn=os.setsid)
    # wait until doccano is started, so we can start using the APIs
    sleep(startup_wait)
    return process

In [None]:
def stop_docanno(process):
    """Send SIGTERM to the process group of `process`.

    Does nothing when the process no longer exists (for example because it
    already exited on its own), so stopping is safe to repeat.

    Args:
        process: the `subprocess.Popen` handle of the process to stop.
    """
    try:
        os.killpg(os.getpgid(process.pid), signal.SIGTERM)
    except ProcessLookupError:
        # Already gone: there is nothing left to terminate.
        pass

In [None]:
# Keep the process handle so the server can be stopped at the end of the notebook.
docanno_process = start_docanno()

## Doccano client

In [None]:
def get_project_json(name, description="empty", project_type="DocumentClassification",
                     guideline="Please write annotation guideline.", resourcetype="TextClassificationProject",
                     randomize_document_order=False, collaborative_annotation=False):
    """Build the JSON payload used to create a Doccano project.

    Args:
        name: project name.
        description: project description.
        project_type: value of the `project_type` field.
        guideline: annotation guideline text.
        resourcetype: value of the `resourcetype` field.
        randomize_document_order: value of the `randomize_document_order` field.
        collaborative_annotation: value of the `collaborative_annotation` field.

    Returns:
        A dict with one key per argument.
    """
    return {
        "name": name,
        # Forward the argument; it used to be ignored in favour of a fixed "empty".
        "description": description,
        "project_type": project_type,
        "guideline": guideline,
        "resourcetype": resourcetype,
        "randomize_document_order": randomize_document_order,
        "collaborative_annotation": collaborative_annotation,
    }

In [None]:

class DoccanoClient():
    """Client for Doccano NLP Annotation UI.

    Logs in on construction and keeps a `requests.Session` whose headers carry
    the auth token; all other methods go through that session.
    """

    def __init__(self, baseurl="http://0.0.0.0:8000", username="admin", password="password", version="v1"):
        """Store the connection settings and log in.

        Args:
            baseurl: scheme, host and port of the Doccano server.
            username: account used to obtain the auth token.
            password: password of that account.
            version: API version segment appended to `baseurl`.
        """
        self.url=f"{baseurl}/{version}"
        self.username=username
        self.password=password
        self.login(self.username, self.password)

    @staticmethod
    def _check_status(response, expected):
        """Raise AssertionError unless `response` has the `expected` status code.

        An explicit raise keeps the exception type of the former `assert`
        statements while staying active under `python -O`; the message
        includes the start of the response body for diagnosis.
        """
        if response.status_code != expected:
            raise AssertionError(
                f"Doccano returned HTTP {response.status_code} (expected {expected}): "
                f"{response.text[:500]}")

    def login(self, username, password):
        """Request an auth token and create the session that sends it."""
        r = requests.post(f'{self.url}/auth-token',
                          json={"username": username, "password": password})
        # Surface a failed login as an HTTP error instead of a KeyError on
        # the missing "token" field.
        r.raise_for_status()
        self.token = r.json()["token"]
        self.session = requests.Session()
        self.session.headers.update({"Authorization": f"Token {self.token}",
                                     "Accept": "application/json"})

    def create_task(self, task_name):
        """Create a project named `task_name` and return its id."""
        project_json = get_project_json(task_name)

        r = self.session.post(f'{self.url}/projects', json=project_json)
        self._check_status(r, 201)
        return r.json()["id"]

    def _upload_json(self, endpoint, payload):
        """POST the string `payload` as a JSON-format file upload to `endpoint`."""
        r = self.session.post(f"{self.url}/{endpoint}",
                              data={"format": "json"},
                              files={'file': BytesIO(payload.encode())})
        self._check_status(r, 201)

    def upload_dataset(self, project_id, items):
        """Upload `items` to the project as JSON lines (one JSON object per line)."""
        jsonl = "\n".join(json.dumps(item) for item in items)
        self._upload_json(f"projects/{project_id}/docs/upload", jsonl)

    def upload_labels(self, project_id, labels):
        """Upload the list of label definitions `labels` to the project."""
        self._upload_json(f"projects/{project_id}/label-upload", json.dumps(labels))

    def download_dataset(self, project_id):
        """Download the project's documents and return them as a list of parsed JSON objects."""
        r = self.session.get(f"{self.url}/projects/{project_id}/docs/download?q=json")
        r.encoding = 'utf-8'
        self._check_status(r, 200)
        # Split on "\n" only: str.splitlines() also breaks on characters such
        # as U+2028 or U+0085, which may occur unescaped inside JSON strings
        # and would cut a record in half. Blank lines are skipped.
        return [json.loads(line) for line in r.text.split("\n") if line.strip()]

    def get_labels(self, project_id):
        """Return the parsed JSON response of the project's labels endpoint."""
        r = self.session.get(f"{self.url}/projects/{project_id}/labels")
        r.raise_for_status()
        return r.json()

## Get authentication token

In [None]:
# Constructing the client logs in with the default base URL and credentials;
# the resulting token is sent by the client's session on every later call.
d_client = DoccanoClient()

## Create a new task

In [None]:
# TODO: check whether a task with this name already exists; if so, fetch that task instead of creating a new one

In [None]:
# Create a Doccano project named "Test2"; its id is needed by all later
# upload and download calls.
project_id = d_client.create_task("Test2")

## Upload data

In [None]:
# One Doccano document per email: the cleaned message text, plus the email's
# uid as metadata so downloaded annotations can be matched back to the email.
items = [
    {"text": clean_html(email.content), "meta": {"uid": email.uid}}
    for email in emails
]

In [None]:
# Upload the prepared documents to the project.
d_client.upload_dataset(project_id, items)

## Upload Labels

In [None]:
# Display the label definitions that are about to be uploaded.
labels

[{'text': 'Primary',
  'suffix_key': 'i',
  'background_color': '#F9B9F2',
  'text_color': '#ffffff'},
 {'text': 'Promotions',
  'suffix_key': 'o',
  'background_color': '#B8B8F3',
  'text_color': '#ffffff'},
 {'text': 'Social & Updates',
  'suffix_key': 's',
  'background_color': '#BAF2BB',
  'text_color': '#ffffff'}]

In [None]:
# Upload the label definitions to the project.
d_client.upload_labels(project_id, labels)

## Open annotation window

In [None]:
from selenium.webdriver import Chrome, ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager
import time

In [None]:
def selenium_navigate_to_annotation_doccano(project_id, baseurl="http://localhost:8000",
                                            username="admin", password="password"):
    """Open Chrome, log in to Doccano and go to the project's annotation page.

    Args:
        project_id: id of the Doccano project to annotate.
        baseurl: scheme, host and port of the Doccano web UI.
        username: account name typed into the login form.
        password: password typed into the login form.

    Returns:
        The Chrome driver, so the caller can close the browser with
        `driver.quit()` when annotation is finished.
    """
    options = ChromeOptions()
    options.add_argument("--kiosk")
    # Hand the options to the driver; they used to be built but never applied.
    driver = Chrome(ChromeDriverManager().install(), options=options)
    driver.maximize_window()

    # Opening the project page first brings up the login form.
    driver.get(f"{baseurl}/projects/{project_id}")
    time.sleep(0.5)  # give the form time to render
    username_field = driver.find_element_by_name("username")
    password_field = driver.find_element_by_name("password")
    login_button = driver.find_element_by_class_name("text-none")
    username_field.send_keys(username)
    password_field.send_keys(password)
    login_button.click()

    time.sleep(0.5)  # give the login time to complete before navigating on
    driver.get(f"{baseurl}/projects/{project_id}/text-classification?page=1")
    return driver

In [None]:
# Opens a Chrome window on the project's annotation page; annotate the
# documents there before running the download cell.
selenium_navigate_to_annotation_doccano(project_id)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [/Users/koen/.wdm/drivers/chromedriver/mac64/88.0.4324.96/chromedriver] found in cache


 


## Download data

In [None]:
# Download the project's documents as a list of parsed JSON objects; the
# write-back below reads each one's "meta" and "annotations" fields.
json_items = d_client.download_dataset(project_id)

## Write back to pod

In [None]:
def get_from_uid(items, id_):
    """Return the first element of `items` whose `uid` equals `id_`, or None if there is none."""
    for candidate in items:
        if candidate.uid == id_:
            return candidate
    return None

In [None]:
# Map each Doccano label id to its text; the write-back loop uses this to
# translate the label id stored in an annotation.
labelid2text = dict(
    (label["id"], label["text"])
    for label in d_client.get_labels(project_id)
)
labelid2text

{4: 'Primary', 5: 'Promotions', 6: 'Social & Updates'}

In [None]:
# Attach the first annotation of every downloaded document to the matching
# email as a `label` edge.
for json_item in json_items:
    annotations = json_item["annotations"]
    annotation = next(iter(annotations), None)
    if not annotation:
        # Document was not annotated; nothing to write back.
        continue

    mail = get_from_uid(emails, json_item["meta"]["uid"])
    if mail is None:
        # The project may contain documents that are not among the queried
        # emails; report and skip them instead of failing on `None.add_edge`.
        print(f"Skipping annotation: no email with uid {json_item['meta']['uid']!r}")
        continue

    label_name = labelid2text[annotation["label"]]
    mail.add_edge("label", Label.from_data(name=label_name))

In [None]:
# Persist the labels to the pod: create the first label item of each labelled
# email, then create the email's edges.
# NOTE(review): `get_all_edges()` is passed in full, so edges that are
# presumably already stored in the pod are sent again. This is the likely cause of
# the "400 Edge already exists" lines in the recorded output; confirm.
for mail in emails:
    if len(mail.label) > 0:
        pod_client.create(mail.label[0])
    pod_client.create_edges(mail.get_all_edges())

400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists
400 Edge already exists


Show labels

In [None]:
# For every labelled email, show the first 50 characters of its cleaned text
# next to the name of its first label.
[(clean_html(e.content)[:50], e.label[0].name) for e in emails if len(e.label) > 0]

[('Microsoft-account\nBevestig je account\nEr is ongewo', 'Primary'),
 ('Welkom bij Gmail. Je kunt inloggen op je account o', 'Promotions'),
 ('This event has been changed. more details »Detecti', 'Social & Updates')]

## Stop Doccano

In [None]:
# Terminate the Doccano server started earlier in this notebook.
stop_docanno(docanno_process)

ProcessLookupError: [Errno 3] No such process

## Remove Database

Optional: this deletes the Doccano database file. Do not execute it if you have not yet written the annotations back to the pod.

In [None]:
# Path of the Doccano database file, relative to the current working directory.
doccano_path = Path("doccano.db")

In [None]:
# Delete the database file when it is present; do nothing otherwise.
if doccano_path.exists():
    doccano_path.unlink()