# Task: NER
Write a rule to identify if words contain programming language

python version: 3.8+

spacy version: 3.0+

## Loading data:
Title: texts corpus

Label:

    = 1: means the texts contain Programming Language
    
    = 0: otherwise

In [1]:
# Load packages
import pandas as pd
import numpy as np
from IPython.display import HTML as html_print
import spacy
from spacy.matcher import Matcher
from sklearn.metrics import confusion_matrix, classification_report # confusion matrix
# ------------- Manually labelled
texts = (pd.read_csv('/Users/mikexie/have_label.txt', delimiter = '\t')
           [['Label', 'Title']]
            .loc[:601])

print(texts)
titles = (_ for _ in texts['Title'])

     Label                                              Title
0        0  SQLStatement.execute() - multiple queries in o...
1        0  Good branching and merging tutorials for Torto...
2        1                                  ASP.NET Site Maps
3        0                 Function for creating color wheels
4        1  Adding scripting functionality to .NET applica...
..     ...                                                ...
597      0        Templates of Technical and Functional Specs
598      1  How can you determine what version(s) of .NET ...
599      0            Patterns for the overlap of two objects
600      0  SQL Server best way to calculate datediff betw...
601      0  How do I find and decouple entities from a cer...

[602 rows x 2 columns]


## Rule-base method


In [2]:
# Load an English and empty model
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

### Writing rules which pattern is identified as programming language

In [3]:

obj_c_pattern = [{'LOWER': 'objective'},
          {'IS_PUNCT': True, 'OP': '?'},
          {'LOWER': 'c'}]
obj_c_pattern2 = [{'LOWER': 'objectivec'}]
ruby_pattern = [{"LOWER": 'ruby'}]
js_pattern = [{'LOWER': {'IN': ['java', 'js', 'javascript']}}]
golang_pattern = [{'LOWER': {"IN": ['go', 'golang']},
                  'POS': {'NOT_IN': ['VERB']}}]
python_pattern = [{'LOWER': 'python'}]
net_pattern = [{'LOWER': {'IN': ['.net', 'net']}}]
perl_pattern = [{'LOWER': 'perl'}]
php_pattern = [{'LOWER': 'php'}]
Csharp_pattern1 = [{'LOWER': 'c'}, {'LOWER': 'sharp'}]
Csharp_pattern2 = [{'LOWER': {'IN': ['c#', 'c #', 'C#', 'C #']}}]
Csharp_pattern3 = [{'LOWER': 'c'}, {'LOWER': '#'}]
Csharp_pattern4 = [{'LOWER': 'c'}, {'LOWER': '-'}, {'LOWER': 'sharp'}]
ASP_NET_pattern = [{'LOWER': 'asp.net'}]
CPP_pattern = [{'LOWER': {'IN': ['cpp', 'c++']}}]
LISP_pattern = [{'LOWER': 'lisp'}]
SQL_pattern = [{'LOWER': 'sql'}]
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

matcher.add("OBJ_C_LANG", [obj_c_pattern])
matcher.add("OBJ_C_LANG", [obj_c_pattern2])
matcher.add("RUBY_LANG", [ruby_pattern])
matcher.add("PYTHON", [python_pattern])
matcher.add("JS_LANG", [js_pattern])
matcher.add("GOLANG_LANG", [golang_pattern])
matcher.add(".NET", [net_pattern])
matcher.add("PERL", [perl_pattern])
matcher.add("PHP", [php_pattern])
matcher.add("CSHARP", 
            [Csharp_pattern1, 
            Csharp_pattern2, 
            Csharp_pattern3,
            Csharp_pattern4])
matcher.add("CPP", [CPP_pattern])
matcher.add("ASP_NET", [ASP_NET_pattern])
matcher.add("LISP", [LISP_pattern])
matcher.add("SQL", [SQL_pattern])

## Extending Jupyter: Visualise

In [4]:
def style(s, bold=False):
    blob = f"<text>{s}</text>"
    if bold:
        blob = f"<b style='background-color: #fff59d'>{blob}</b>"
    return blob

def html_generator(g,n = 20):
    blob=""
    for i in range(n):
        doc = next(g)
        state = [[t, False] for t in doc]
        for idx, start, end in matcher(doc):
            for i in range(start, end):
                state[i][1] = True
        blob += style(' '.join([style(str(t[0]), bold = t[1]) for t in state]) + '<br>')
    return blob
titles = (_ for _ in texts['Title'])
g = (d for d in nlp.pipe(titles) if len(matcher(d)) > 0)



In [5]:
html_print(html_generator(g, n = 10))

## Labelling
If you haven't labelled yet, manually label some. 

In [6]:

label_df = (pd.read_csv('have_label.txt', delimiter = '\t')
           [['Label', 'Title']]
            .loc[:100]
           .assign(Pred=lambda d: [len(matcher(d)) > 0 for d in nlp.pipe(d['Title'])])
           .assign(Pred=lambda d: d['Pred'].astype(np.int8)))
label_df

Unnamed: 0,Label,Title,Pred
0,0,SQLStatement.execute() - multiple queries in o...,0
1,0,Good branching and merging tutorials for Torto...,0
2,1,ASP.NET Site Maps,1
3,0,Function for creating color wheels,0
4,1,Adding scripting functionality to .NET applica...,1
...,...,...,...
96,0,Lisp/Scheme interpreter without Emacs?,1
97,0,How can I reverse the ON bits in a byte?,0
98,0,How to return a page of results from SQL?,1
99,0,Checking for string contents? string Length Vs...,0


## Check incorrect predictions

In [7]:
# pred = 0 ---- label = 1
mistakes1 = (label_df
            .loc[lambda d: d['Pred'] == 0]
            .loc[lambda d: d['Label'] == 1]['Title'])
for i in range(mistakes1.shape[0]):
    print(mistakes1.iloc[i])
print('-------------------')
# pred = 1 ---- label = 0
mistakes2 = (label_df
            .loc[lambda d: d['Pred'] == 1]
            .loc[lambda d: d['Label'] == 0]['Title'])
for i in range(mistakes2.shape[0]):
    print(mistakes2.iloc[i])

How to learn ADO.NET
-------------------
Deploying SQL Server Databases from Test to Live
Is Windows Server 2008 "Server Core" appropriate for a SQL Server instance?
Paging SQL Server 2005 Results
MySQL/Apache Error in PHP MySQL query
How do you get leading wildcard full-text searches to work in SQL Server?
How do I Transform Sql Columns into Rows?
What program can I use to generate diagrams of SQL view/table structure?
SQL Server Management Studio alternatives to browse/edit tables and run queries
Lisp/Scheme interpreter without Emacs?
How to return a page of results from SQL?


In [8]:
# confusion matrix
confusion_matrix(label_df['Label'], label_df['Pred'])

array([[65, 10],
       [ 1, 25]])

In [9]:
# Recall, Precision, F1
print(classification_report(label_df['Label'], label_df['Pred']))

              precision    recall  f1-score   support

           0       0.98      0.87      0.92        75
           1       0.71      0.96      0.82        26

    accuracy                           0.89       101
   macro avg       0.85      0.91      0.87       101
weighted avg       0.92      0.89      0.90       101



# NER Task for the whole 

In [10]:
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.training import docs_to_json

In [11]:

def create_versioned(name):
    return [
        [{'LOWER': name}],
        [{'LOWER': {'REGEX': f'({name}\d+\.?\d*.?\d*)'}}],
        [{'LOWER': name}, {'TEXT': {'REGEX': '(\d+\.?\d*.?\d*)'}}],
    ]

def create_patterns():
    versioned_languages = ['ruby', 'php', 'python', 'lisp',
                          'golang', 
                           'asp.net', 
                           'objectivec',
                           'C + +',
                           'C#',
                           'sql', 'cpp', 'perl', 'java']
    versioned_languages = ['ruby', 'php']
    flatten = lambda l: [item for sublist in l for item in sublist]
    versioned_patterns = flatten([create_versioned(lang) for lang in versioned_languages])
    lang_patterns = [
         [{'LOWER': 'objective'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'c'}],
         [{'LOWER': 'objectivec'}],
        [{"LOWER": 'ruby'}],
         [{'LOWER': {'IN': ['java', 'js', 'javascript']}}],
         [{'LOWER': {"IN": ['go', 'golang']},'POS': {'NOT_IN': ['VERB']}}],
        [{'LOWER': 'python'}],
        [{'LOWER': {'IN': ['.net', 'net']}}],
        [{'LOWER': 'perl'}],
        [{'LOWER': 'php'}],
         [{'LOWER': 'c'}, {'LOWER': 'sharp'}],
         [{'LOWER': {'IN': ['c#', 'c #', 'C#', 'C #']}}],
        [{'LOWER': 'c'}, {'LOWER': '#'}],
        [{'LOWER': 'c'}, {'LOWER': '-'}, {'LOWER': 'sharp'}],
         [{'LOWER': 'asp.net'}],
         [{'LOWER': {'IN': ['cpp', 'c++']}}],
         [{'LOWER': 'lisp'}],
         [{'LOWER': 'sql'}],
    ]
    return versioned_patterns + lang_patterns
    
    

# Full training set
Training with labelled data

In [12]:
def parse_train_data(doc):
    detections = [(doc[start:end].start_char, 
                   doc[start:end].end_char, 
                   "PROGLANG") for idx, start, end in matcher(doc)]
    return (doc.text, {'entities': detections})

print(parse_train_data(nlp('i use java and sql'))) # simple test for the function

# training set size = 500
train_df = (pd.read_csv("have_label.txt", 
                  nrows=500, 
                  sep='\t', 
                  usecols=['Label', 'Title']))

titles = train_df.loc[lambda d: d['Label'] == 1]['Title']
TRAIN_DATA = [parse_train_data(d) for d in nlp.pipe(titles) if len(matcher(d)) == 1]

('i use java and sql', {'entities': [(6, 10, 'PROGLANG'), (15, 18, 'PROGLANG')]})


# Training loop

In [13]:

print(TRAIN_DATA[5:8])
def create_blank_nlp(train_data):
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    return nlp

[('.NET Unit Testing packages?', {'entities': [(0, 4, 'PROGLANG')]}), ('What are effective options for embedding video in an ASP.NET web site?', {'entities': [(53, 60, 'PROGLANG')]}), ('How to set up unit testing for Visual Studio C++', {'entities': [(45, 48, 'PROGLANG')]})]


In [14]:
import random 
import datetime as dt
from spacy.training.example import Example
nlp = create_blank_nlp(TRAIN_DATA)
optimizer = nlp.begin_training()  
for i in range(20):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
#        nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        nlp.update([example], losses=losses, sgd=optimizer, drop=0.3)
    if losses.get("ner") < 0.01:
        break
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)

Losses at iteration 0 - 2021-10-15 02:02:07.800442 {'ner': 173.99925856949926}
Losses at iteration 1 - 2021-10-15 02:02:09.543693 {'ner': 16.41922613286977}
Losses at iteration 2 - 2021-10-15 02:02:11.268491 {'ner': 5.67216943011611}
Losses at iteration 3 - 2021-10-15 02:02:12.994988 {'ner': 3.3659192187337075}
Losses at iteration 4 - 2021-10-15 02:02:14.702703 {'ner': 3.539841123097918}
Losses at iteration 5 - 2021-10-15 02:02:16.446174 {'ner': 1.9739410326166358}
Losses at iteration 6 - 2021-10-15 02:02:18.177170 {'ner': 0.12594040888139726}
Losses at iteration 7 - 2021-10-15 02:02:19.896304 {'ner': 0.27727586121967474}
Losses at iteration 8 - 2021-10-15 02:02:21.624774 {'ner': 0.9413159054850833}
Losses at iteration 9 - 2021-10-15 02:02:23.387569 {'ner': 0.06976780081079186}
Losses at iteration 10 - 2021-10-15 02:02:25.176645 {'ner': 3.3655249025860043}
Losses at iteration 11 - 2021-10-15 02:02:26.915961 {'ner': 1.972134490175858}
Losses at iteration 12 - 2021-10-15 02:02:28.683276 

# Enhance training： dropout + batch

In [15]:
from spacy.util import minibatch, compounding
nlp = create_blank_nlp(TRAIN_DATA)
ruler = EntityRuler(nlp)
ruler.add_patterns([{'label': 'PROGLANG', 'pattern': p} for p in create_patterns()])
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],
                drop=0.1,  # dropout - make it harder to memorise data
                losses=losses,
            )
    if losses.get("ner") < 0.001:
        break
    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")
ruler.to_disk('entity_ruler.jsonl')
nlp.to_disk('rule-model') # call it later

Losses at iteration 0 - 2021-10-15 02:02:32.450801 {'ner': 156.9159550144574}
Losses at iteration 1 - 2021-10-15 02:02:34.166574 {'ner': 3.8402867561575276}
Losses at iteration 2 - 2021-10-15 02:02:35.906373 {'ner': 0.0028766789965708637}


In [16]:
# Simple test for loading rule based model
from spacy import displacy
doc = nlp("i write code in python")
displacy.render(doc, style="ent")


## Prepare data for scoring

In [None]:

nlp = spacy.load('rule-model')
nlp.add_pipe('sentencizer')

valid_df = (pd.read_csv('have_label.txt', delimiter = '\t')
           [['Label', 'Title']]
           .assign(Pred=lambda d: [len(matcher(d)) > 0 for d in nlp.pipe(d['Title'])])
           .assign(Pred=lambda d: d['Pred'].astype(np.int8)))


In [18]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(valid_df['Label'], valid_df['Pred']))
print(classification_report(valid_df['Label'], valid_df['Pred']))


[[546  70]
 [  7 174]]
              precision    recall  f1-score   support

           0       0.99      0.89      0.93       616
           1       0.71      0.96      0.82       181

    accuracy                           0.90       797
   macro avg       0.85      0.92      0.88       797
weighted avg       0.93      0.90      0.91       797



# Do the thing above by a run

In [23]:
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

# Training + testing
def main(model=None, output_dir=None, n_iter=10, TRAIN_DATA=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.add_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model -----------
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update(
                        [example],
                        drop=0.1,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
            print("Losses", losses)

    # test the trained model--------
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
        print('---')
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir() # make directory at the current path
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
            
# ------------ validate the confusion matrix and model performance    
    valid_df = (pd.read_csv('have_label.txt', delimiter = '\t')
           [['Label', 'Title']]
           .assign(Pred=lambda d: [len(matcher(d)) > 0 for d in nlp2.pipe(d['Title'])])
           .assign(Pred=lambda d: d['Pred'].astype(np.int8)))
    # confusion matrix
    from sklearn.metrics import confusion_matrix, classification_report
    print(confusion_matrix(valid_df['Label'], valid_df['Pred']))
    print(classification_report(valid_df['Label'], valid_df['Pred']))
            
print(main(TRAIN_DATA=TRAIN_DATA))

Created blank 'en' model
Losses {'ner': 146.29710759356783}
Losses {'ner': 3.892018180535656}
Losses {'ner': 0.2710877051823593}
Losses {'ner': 0.0003204959549735426}
Losses {'ner': 2.2248619707612694e-08}
Losses {'ner': 0.0031156915315137903}
Losses {'ner': 1.962176431442263e-05}
Losses {'ner': 6.627135898645994e-08}
Losses {'ner': 1.7364762267744493e-07}
Losses {'ner': 6.969027306756018e-09}
Entities [('C#', 'PROGLANG')]
Tokens [('How', '', 2), ('to', '', 2), ('know', '', 2), ('if', '', 2), ('a', '', 2), ('line', '', 2), ('intersects', '', 2), ('a', '', 2), ('plane', '', 2), ('in', '', 2), ('C', 'PROGLANG', 3), ('#', 'PROGLANG', 1), ('?', '', 2), ('-', '', 2), ('Basic', '', 2), ('2D', '', 2), ('geometry', '', 2)]
---
Entities [('NET', 'PROGLANG')]
Tokens [('Does', '', 2), ('CruiseControl', '', 2), ('.', '', 2), ('NET', 'PROGLANG', 3), ('run', '', 2), ('on', '', 2), ('IIS', '', 2), ('7.0', '', 2), ('?', '', 2)]
---
Entities [('Java', 'PROGLANG')]
Tokens [('Is', '', 2), ('there', '', 2

Entities [('C#', 'PROGLANG')]
Tokens [('C', 'PROGLANG', 3), ('#', 'PROGLANG', 1), ('string', '', 2), ('concatenation', '', 2), ('and', '', 2), ('string', '', 2), ('interning', '', 2)]
---
Entities [('python', 'PROGLANG')]
Tokens [('How', '', 2), ('to', '', 2), ('find', '', 2), ('the', '', 2), ('mime', '', 2), ('type', '', 2), ('of', '', 2), ('a', '', 2), ('file', '', 2), ('in', '', 2), ('python', 'PROGLANG', 3), ('?', '', 2)]
---
Entities [('C++', 'PROGLANG')]
Tokens [('How', '', 2), ('do', '', 2), ('you', '', 2), ('properly', '', 2), ('use', '', 2), ('namespaces', '', 2), ('in', '', 2), ('C++', 'PROGLANG', 3), ('?', '', 2)]
---
Entities [('Java', 'PROGLANG')]
Tokens [('Java', 'PROGLANG', 3), ('Logging', '', 2), ('vs', '', 2), ('Log4J', '', 2)]
---
Entities [('ASP.NET', 'PROGLANG')]
Tokens [('The', '', 2), ('Difference', '', 2), ('Between', '', 2), ('a', '', 2), ('DataGrid', '', 2), ('and', '', 2), ('a', '', 2), ('GridView', '', 2), ('in', '', 2), ('ASP.NET', 'PROGLANG', 3), ('?', '', 

In [None]:
!python3 -V