# Baseline Annotation of Observations Based on a Trained spaCy (2.0.18) Model


<font size="4">
This notebook provides baseline annotation suggestions using a refined named-entity-recognition model that we trained as a deep-learning model with Prodigy.
</font>

In [1]:
import spacy 
from spacy import displacy 


In [2]:
from __future__ import print_function
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Visualisation settings (presumably meant for displacy rendering; the visible
# part of the notebook does not reference them again -- TODO confirm usage).
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
    collapse_phrases=False,
)

In [4]:
# Display settings: show up to 260 characters per cell and up to 500 rows.
# The former `display.max_colwidth = -1` call was removed: it was overridden
# by the very next line, and -1 is deprecated/rejected by current pandas
# (use None if an unlimited width is ever wanted).
pd.set_option('display.max_colwidth', 260)
pd.set_option('display.max_rows', 500)

## Read the Data

In [5]:
# Version tag of the pickled DataFrame; it is substituted into the data file name below.
importVersion = '013'

In [6]:

# Path of the pickled DataFrame. Adjust it to where the data lives on your
# machine; on Windows, prefix the string literal with "r" (raw string) so
# backslashes are not treated as escapes.
path= '../data/01_df_v{0}.pickle'.format(importVersion)
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
dfAstroNova = pd.read_pickle(path)

FileNotFoundError: [Errno 2] No such file or directory: '../data/01_df_v013.pickle'

In [None]:
# Sort the data by book chapter. "appendix b" is temporarily replaced by NaN so
# that the chapter column can be cast to float and ordered numerically.
dfAstroNova['chapter'] = dfAstroNova['chapter'].replace("appendix b", np.nan).astype(float)
# Name the index 'MyIdx' so it can serve as the tie-breaker within a chapter.
dfAstroNova = dfAstroNova.rename_axis('MyIdx').sort_values(by=['chapter', 'MyIdx'], ascending=[True, True])
# Restore the appendix label. The result is assigned back to the column rather
# than calling fillna(inplace=True) on `dfAstroNova.chapter`: that chained
# in-place form operates on an intermediate Series and is not guaranteed to
# update the frame (it does not under pandas Copy-on-Write).
dfAstroNova['chapter'] = dfAstroNova['chapter'].fillna('appendix b')

In [None]:
# Move the 'MyIdx' index back into a column (leaving a fresh default index),
# then discard it together with the 'html' column.
dfAstroNova = dfAstroNova.reset_index().drop(["MyIdx", "html"], axis=1)

In [None]:
type(dfAstroNova)

In [None]:
dfAstroNova.head()

In [None]:
# Inspect the last rows of the loaded frame. (`df` is only created in a later
# cell, so referencing it here raises NameError on a fresh kernel.)
dfAstroNova.tail()

In [None]:
# Split every text on '.' and stack the pieces into one long Series; the
# innermost index level created by stack() is dropped again.
# NOTE(review): `df` is rebuilt from the 'sentences' column in the next cell,
# so this result looks unused -- confirm before relying on it.
df = (
    dfAstroNova
    .reset_index()['text']
    .str.split('.', expand=True)
    .stack()
    .reset_index(level=-1, drop=True)
)
type(df)

In [None]:
 # Create lists to fill with values
l_col1, l_col2 = [], []

# Iterate over the rows and record one (sentence, chapter) entry per sentence
for _, record in dfAstroNova.iterrows():
    chapter = record['chapter']
    for sentence in record['sentences']:
        l_col1.append(sentence)
        l_col2.append(chapter)

# Assemble the sentence-level frame; the sentence column ends up named 'sents'
df = pd.DataFrame({'sentences': l_col1, 'chapter': l_col2}).rename(columns={"sentences": "sents"})

In [None]:
len(df)

In [None]:
df.head()

# Load the customized spaCy model trained on our Prodigy training data

In [None]:
# Directory of the trained spaCy model. Note: this rebinds `path`, which
# previously held the path of the pickled data file.
path= '../data/Model_V17'
# Load the pipeline; the helper functions defined below use `nlp` as a global.
nlp=spacy.load(path)

In [None]:
def entity(x):
    """Return the named entities of ``x`` as a list of ``(text, label)`` tuples.

    ``x`` is processed with the global ``nlp`` pipeline; one tuple is produced
    per entry of the resulting ``ents``, in the order they are yielded.
    """
    return [(span.text, span.label_) for span in nlp(x).ents]

In [None]:
def attribute(x):
    """Return one ``(text, pos_, tag_, dep_, lemma_)`` tuple per token of ``x``.

    ``x`` is processed with the global ``nlp`` pipeline and the tokens are
    reported in iteration order.

    The previous implementation extended the lemma list with the *characters*
    of each lemma (``e += token.lemma_`` without the trailing comma), which
    misaligned the lemma column; each tuple now carries the full lemma of its
    own token.
    """
    return [
        (token.text, token.pos_, token.tag_, token.dep_, token.lemma_)
        for token in nlp(x)
    ]

In [None]:
def satz_analytic2(satz):
    """Collect the root word of a parsed sentence plus its subject and object.

    ``satz`` is an iterable of parsed tokens. The returned dict may contain:

    * ``"act"``     -- text of the head of the token whose ``dep_`` is ``"ROOT"``
    * ``"obj"``     -- concatenated ``text_with_ws`` of the subtree of a
      ``"dobj"`` token whose head text equals ``"act"``
    * ``"subject"`` -- lower-cased text of an ``"nsubj"`` token whose head text
      equals ``"act"``

    When several tokens qualify for the same key, the last one wins.
    """
    features = {}

    # First pass: locate the root of the sentence.
    for token in satz:
        if token.dep_ == "ROOT":
            features["act"] = token.head.text

    # Second pass: direct object and subject attached to that root.
    for token in satz:
        relation = token.dep_
        if relation == "dobj" and features["act"] == token.head.text:
            features["obj"] = ''.join(part.text_with_ws for part in token.subtree)
        if relation == "nsubj" and features["act"] == token.head.text:
            features["subject"] = token.text.lower()

    return features

In [None]:
def NerFeature(x, labels=("DATE", "TIME", "LONG")):
    """Encode which entity labels occur in ``x`` as a list of 0/1 flags.

    Parameters
    ----------
    x : sequence or None
        Entity tuples whose second element is the entity label (the format
        produced by ``entity``). ``None`` is treated like "no entities".
    labels : sequence of str, optional
        Labels to test for, in output order. Defaults to
        ``("DATE", "TIME", "LONG")``, i.e. the original fixed feature layout.

    Returns
    -------
    list of int
        One flag per entry of ``labels``: 1 if at least one entity in ``x``
        carries that label, otherwise 0.
    """
    if x is None:
        return [0] * len(labels)
    # Collect the labels once instead of re-testing membership on every
    # loop iteration as the previous version did.
    found = [item[1] for item in x]
    return [1 if label in found else 0 for label in labels]

def noun_chunk(x):
    """Return the noun chunks of ``x`` as a list.

    ``x`` is processed with the global ``nlp`` pipeline and the resulting
    ``noun_chunks`` are materialised in iteration order.
    """
    return list(nlp(x).noun_chunks)

# Noun chunks of every sentence
df['chunks'] = df['sents'].apply(noun_chunk)

In [None]:
# Root word with its subject and direct object for every sentence
df["aso"] = df["sents"].apply(lambda sentence: satz_analytic2(nlp(sentence)))

In [None]:
# (text, label) tuples of the named entities in every sentence
df['entities'] = df['sents'].apply(entity)

In [None]:
# 0/1 flags for the presence of DATE, TIME and LONG entities
df['fner'] = df['entities'].apply(NerFeature)

In [None]:
df.head(10)

In [None]:
df.tail(10)


In [None]:
def annot_observation_01(x):
    """Rule A01: return 1 if any entity in ``x`` is labelled LONG, DATE or TIME.

    ``x`` is a sequence of entity tuples whose second element is the label.
    Returns 0 when none of the three labels occurs (including empty input).
    """
    observation_labels = ("LONG", "DATE", "TIME")
    labels = [item[1] for item in x]
    return int(any(label in observation_labels for label in labels))

In [None]:
def annot_observation_02(x):
    """Rule A02: return 1 only if LONG, DATE and TIME all occur among the labels.

    ``x`` is a sequence of entity tuples whose second element is the label.
    Returns 0 as soon as one of the three labels is missing.
    """
    labels = [item[1] for item in x]
    required_labels = ("LONG", "DATE", "TIME")
    return 1 if all(required in labels for required in required_labels) else 0
              

In [None]:
def annot_observation_03(x):
    """Rule A03: return 1 only if both DATE and TIME occur among the labels.

    ``x`` is a sequence of entity tuples whose second element is the label.
    Returns 0 when either label is missing.
    """
    labels = [item[1] for item in x]
    required_labels = ("DATE", "TIME")
    return 1 if all(required in labels for required in required_labels) else 0

In [None]:
# Label each sentence with the rule that requires LONG, DATE and TIME together
df['label'] = df['entities'].apply(annot_observation_02)

In [None]:
df.head()

In [None]:
df.tail(10)

In [None]:
df[(df.label==1)]

In [None]:
def show_chapter(x):
    """Return the rows of the global ``df`` whose ``chapter`` equals ``x``.

    The previous version ignored ``x`` and always returned chapter 10, so the
    interactive slider had no effect on the displayed rows.
    """
    return df.loc[df.chapter == x]

In [None]:
len(df)

In [None]:
# Interactive control for x covering 1..70 in steps of 1; the selected value is passed to show_chapter as x.
interact(show_chapter,x=(1,70,1))

In [None]:
df.info()

In [None]:
df.dtypes

## Save the result as a JSON file

We provide three versions of the rule-based annotation of observational sentences, all built on named entity recognition. A human annotator can refine them to obtain more precise labels.
- A01 --> If the sentence contains at least one entity labelled LONG, DATE or TIME, we consider it an observational sentence.


- A02 --> If the sentence contains entities of all three labels LONG, DATE and TIME, we consider it an observational sentence.

- A03 --> If the sentence contains entities of both labels DATE and TIME, we consider it an observational sentence.

In [None]:
# Export the annotated sentences as JSON; the commented-out lines are the file
# names for the other two annotation variants.
# NOTE(review): the 'label' column is computed with annot_observation_02, while
# the active file name carries the A01 suffix -- confirm that the file name
# matches the rule actually used before sharing the output.
df.to_json("dfObsV01A01.json")
#df.to_json("dfObsV01A02.json")
#df.to_json("dfObsV01A03.json")