Developed by Patricia Klinger; modified by Sebastian Gampe, Kerim Gencer, and Chrisowalandis Deligio.

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')

### Define the names of the id column and the design column

In [2]:
# Names of the identifier column and of the English design-description column;
# both are passed to the loading, annotation and model calls below.
id_col, design_col = "id", "design_en"

In [3]:
# Connection URL format: dialect+driver://user:password@IP/Database.
# The URL can be supplied through the CNT_DB_URL environment variable so that
# credentials do not have to be edited into (and committed with) the notebook;
# the original hard-coded URL is kept as the fallback for existing setups.
# NOTE(review): the fallback still embeds a plain-text root password -- rotate
# it and drop the literal once CNT_DB_URL is set everywhere.
db_url = os.environ.get(
    "CNT_DB_URL",
    "mysql+mysqlconnector://root:0Skate1188!@localhost/thrakien_d4n4_2",
)
dc = Database_Connection(db_url)

#pd.read_sql_query("select " + "id" + " from " + "data_designs", db_url)

In [4]:

# Read the id and design columns of the "data_designs" table through the
# database connection.
design_columns = [id_col, design_col]
designs = dc.load_designs_from_db("data_designs", design_columns)

## This step is optional - load additional data to save with the model

In [5]:
#entity_information = [dc.load_designs_from_db("nlp_list_person", ["name", "alternativenames","link"]),
#                      dc.load_designs_from_db("nlp_list_obj", ["name_en", "alternativenames_en","link"]),
#                      dc.load_designs_from_db("nlp_list_animal", ["name_en", "alternativenames_en","link"]),
#                      dc.load_designs_from_db("nlp_list_plant", ["name_en", "alternativenames_en","link"])
#                      ]

In [6]:
#optional_info = pd.DataFrame(columns=["name","link"])
#for df in entity_information:
#    tmp = split_alternativenames(df.fillna(" "))
#    optional_info = optional_info.append(tmp)
#optional_info

In [7]:
#optional_info.loc[optional_info["name"]=="Andromeda"]["link"].item()

### Load and annotate designs

In [8]:
# (label, table, name column, alternative-names column) for every entity list.
# The person table uses un-suffixed column names; the others use the *_en ones.
entity_sources = (
    ("PERSON", "nlp_list_person", "name", "alternativenames"),
    ("OBJECT", "nlp_list_obj", "name_en", "alternativenames_en"),
    ("ANIMAL", "nlp_list_animal", "name_en", "alternativenames_en"),
    ("PLANT", "nlp_list_plant", "name_en", "alternativenames_en"),
)

# Map each entity label to whatever load_entities_from_db returns for its
# table; the alternative-names column is passed together with the "," delimiter.
entities = {
    label: dc.load_entities_from_db(
        table, [name_column, alt_column], [alt_column], ",", True
    )
    for label, table, name_column, alt_column in entity_sources
}

In [9]:
# Annotate every design with the known entities, then keep only the designs
# for which at least one annotation was found.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
has_annotations = annotated_designs.annotations.map(len) > 0
annotated_designs = annotated_designs[has_annotations]

In [10]:
# Show (rows, columns) of the annotated frame, i.e. how many designs kept at
# least one annotation after the filter above.
annotated_designs.shape

(7194, 3)

In [11]:
# Preview the first five annotated designs.
annotated_designs.head(n=5)

Unnamed: 0,design_en,id,annotations
0,Diademed head of deified Alexander the Great w...,1,"[(9, 13, OBJECT), (25, 44, PERSON)]"
1,"Altar, lighted and garlanded.",6,"[(0, 5, OBJECT)]"
2,Prize amphora on ornamental stand; within line...,8,"[(6, 13, OBJECT)]"
3,Amphora with ribbed surface and crooked handle...,9,"[(0, 7, OBJECT), (63, 75, PLANT), (80, 85, PLA..."
4,"Bust of youthful Anchialos, right, wearing tae...",10,"[(0, 4, OBJECT), (17, 26, PERSON), (43, 49, OB..."


## Train NER

In [12]:
from sklearn.model_selection import train_test_split

# Inputs are (id, design text); targets are (id, annotations).
design_inputs = annotated_designs[[id_col, design_col]]
design_targets = annotated_designs[[id_col, "annotations"]]

# Hold out 25% of the annotated designs for evaluation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    design_inputs,
    design_targets,
    test_size=0.25,
    random_state=12,
)

# The test targets carry their annotations in a column named "y".
y_test = y_test.rename(columns={"annotations": "y"})

In [13]:
# Replace the row labels inherited from annotated_designs with 0..n-1 on both
# test frames.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### output directory for the trained model

In [14]:
# Base name and directory passed to DesignEstimator and load_ner_model_v2.
model_name: str = "english_cno"
output_dir: str = "../cnt/trained_model/ner/english/"

In [15]:
# Variant that also stores the optional entity information with the model:
#my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col, save_optional=True, optional_info=optional_info)

# Entity labels the estimator is configured with.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")

# NOTE(review): the meaning of the leading 4 is defined by DesignEstimator and
# is not visible here -- confirm against cnt.model.
my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col)
my_estimator.set_labels(*ner_labels)

# Fit on the training designs and their annotation lists.
my_estimator.fit(X_train, y_train["annotations"], "cnt")

## Load and evaluate model

In [None]:
# Load the NER model identified by output_dir / model_name.
# NOTE(review): presumably this is the model written by the training cell
# above -- confirm that fit() saves to output_dir.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [None]:
# Predict entities for the held-out designs.
# NOTE(review): as_doc=False presumably returns plain annotations rather than
# spaCy Doc objects -- confirm against the model's predict().
x_predict = model.predict(X_test,as_doc=False)

In [None]:
# Display the predictions for the test designs.
x_predict

In [None]:
# Evaluation helper from cnt.evaluate, used by the scoring cells below.
metrics = Metrics()

In [None]:
# Build the score frame for the labels the estimator was configured with,
# comparing the test annotations with the predictions, and display it.
evaluated_labels = my_estimator.get_labels()
scores_frame = metrics.create_score_frame(y_test, x_predict, evaluated_labels)
scores_frame

In [None]:
# Score the predictions against the test annotations; the result is unpacked
# into precision and recall, which the cells below combine and print.
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [None]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# quotient is 0/0, so report 0.0 instead of dividing by zero.
F1 = (2*precision*recall) / (precision + recall) if (precision + recall) else 0.0

In [None]:
# Report each score as a percentage rounded to two decimals.
for score_name, score_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score_value*100,2))

# Visualize prediction

In [None]:
# Predict on all loaded designs (not only the test split) with as_doc=True;
# the "y" column of the result is what displacy renders below.
x_predict_as_doc = model.predict(designs, as_doc=True)

In [None]:
from spacy import displacy

# Highlight colour (CSS colour name) per entity label. 'VERBS' is not among
# the labels rendered here; it is kept so the palette stays unchanged.
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}

# displaCy's entity filter option is named 'ents'; the previous 'ent' key is
# not a recognised option, so the label filter was silently ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}

# Render the predicted documents inline in the notebook with entity highlights.
displacy.render(x_predict_as_doc.y,
                style='ent', jupyter=True, options=options)

## Upload data to MySQL

In [None]:
# Switch for the upload cell below; set to False to skip writing to the database.
upload: bool = True

In [None]:
# Predict entities for every design in the "designs" table and store one row
# per predicted entity in the "cnt_pipeline_ner" table.
if upload:
    # The connection URL can be overridden with the CNT_DB_URL environment
    # variable; the original hard-coded URL remains the fallback.
    # NOTE(review): the fallback embeds a plain-text root password -- rotate it
    # and drop the literal once CNT_DB_URL is set everywhere.
    db_url = os.environ.get(
        "CNT_DB_URL",
        "mysql+mysqlconnector://root:0Skate1188!@localhost/thrakien_d4n4_2",
    )
    dc = Database_Connection(db_url)
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs)

    print(cnt_pred)

    cnt_pred_predictions_only = cnt_pred["y"]

    # Each row of cnt_pred is unpacked as (design id, list of predictions);
    # every prediction is flattened into its own output row.
    # NOTE(review): the column names imply each prediction is an
    # (entity, label) pair -- confirm against predict_clear().
    cnt_ner_output = pd.DataFrame(
        [
            (str(designid), *relation)
            for designid, relation_list in cnt_pred.itertuples(index=False, name=None)
            for relation in relation_list
        ],
        columns=["DesignID", "Entity", "Label_Entity"],
    )

    print(cnt_ner_output)

    # if_exists="replace" drops an existing cnt_pipeline_ner table before
    # writing, so every upload overwrites the previous results.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection,
                           if_exists="replace", index=False)