This notebook utilizes the training data from the previous notebook and will conduct training, assessment, as well as generation of the final output

In [1]:
import os
import re

import pandas as pd
import numpy as np
import nltk
from copy import deepcopy
from tqdm import tqdm

import math
import random 
import json

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/barryz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Data Loading

In [2]:
corpus_directory = '/mnt/c/Course Data/Business Insider/2013-2014'
label_directory = '/mnt/c/Course Data/Business Insider/labels/'
ceos = pd.read_csv(label_directory+'ceo.csv', encoding='latin-1', header=None)
companies = pd.read_csv(label_directory+'companies.csv', encoding='latin-1',header=None)
percentage = pd.read_csv(label_directory+'percentage.csv', encoding='latin-1',header=None)

ceos = ceos.fillna(0)
ceos['name'] = ceos.apply(lambda row: str(row[0]) + ' '+ str(row[1]) if row[1] != 0 else row[0], axis=1)
ceos_list = ceos.name.tolist()
company_list = companies[companies[0].notnull()][0].tolist()

corpus = ""
files = os.listdir(corpus_directory)
for file in files:
    with open(corpus_directory+'/' + file , 'r', encoding='latin1') as f:
        current = f.read()
        corpus += current
        
sentences = nltk.sent_tokenize(corpus)
        
#Training data generated by Exploration Page

### CEO and Company Names

Assessment: From the training data we generated before, we will use 80% for training and 20% for testing. Since we know that all sentences contain at least one CEO/Company, we will be looking at whether the trained model is correct in identifying in those

Result Generation: After getting the assessment of the model, we will generate all named entities by training on all of the training data and running through the entire corpus

In [3]:
total_training = pd.read_csv('/mnt/c/Course Data/Business Insider/total_training.csv', header=None)

In [6]:
total_training[1] = total_training.apply(lambda row: json.loads(row[1].replace('\'', '"').replace('(', '[').replace(')', ']')), axis=1)
total_training[1] = total_training.apply(lambda row: {"entities": [tuple(x) for x in row[1]['entities']]}, axis=1)

all_examples = total_training.apply(lambda row: (row[0], row[1]), axis=1).tolist()
training = all_examples[0:math.floor(.8*len(all_examples))]
testing = all_examples[math.floor(.8*len(all_examples))::]

#### Assessment Model (trained only on 80% of the data and testing on the other 20%)

In [8]:
nlp_assess = spacy.blank('en')
ner = nlp_assess.add_pipe("ner")

In [9]:
examples = []
for text, annots in training:
    examples.append(Example.from_dict(nlp_assess.make_doc(text), annots))
nlp_assess.initialize(lambda: examples)

<thinc.optimizers.Optimizer at 0x7f535b59e720>

In [12]:
# iter = 10
unaffected_pipes = [pipe for pipe in nlp_assess.pipe_names if pipe != 'ner']


with nlp_assess.disable_pipes(*unaffected_pipes):
    optimizer = nlp_assess.begin_training()
    
    for item in range(iter):
        losses = {}
        random.shuffle(training)
        batches = minibatch(examples, size=compounding(4.0, 16.0, 1.01))
        for batch in batches:
            nlp_assess.update(batch, drop=.35, losses=losses)
        print("Losses for epoch {}/{}".format(item+1, iter), losses)         

Losses for epoch 1/10 {'ner': 34441.79836689816}
Losses for epoch 2/10 {'ner': 16445.6952037105}
Losses for epoch 3/10 {'ner': 13504.237734976228}
Losses for epoch 4/10 {'ner': 12303.370895562504}
Losses for epoch 5/10 {'ner': 11300.860001635621}
Losses for epoch 6/10 {'ner': 10518.765316585468}
Losses for epoch 7/10 {'ner': 9974.616454796493}
Losses for epoch 8/10 {'ner': 9565.271979551659}
Losses for epoch 9/10 {'ner': 9233.096911021883}
Losses for epoch 10/10 {'ner': 8892.362471087916}


In [14]:
eval_results = {
    "correct_CEO": 0,
    "total_CEO": 0,
    "correct_company": 0,
    "total_company":0
}

for i in tqdm(range(len(testing))):
    curr = testing[i]
    text = curr[0]
    entities = curr[1]["entities"]
    for entity in entities:
        if entity[2] == 'CEO':
            eval_results['total_CEO'] += 1
        elif entity[2] == 'Company':
            eval_results['total_company'] += 1
        doc = nlp_assess(text)
        correct = text[entity[0]:entity[1]]
        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct:
                if ent.label_ == 'CEO':
                    eval_results['correct_CEO'] += 1
                elif ent.label_ == 'Company':
                    eval_results['correct_company'] += 1
                break

100%|██████████| 13152/13152 [01:01<00:00, 212.51it/s]


In [15]:
eval_results

{'correct_CEO': 2184,
 'total_CEO': 2326,
 'correct_company': 13356,
 'total_company': 13811}

#### Real Model (trained on all of the data and generates the final output)

In [16]:
nlp_real = spacy.blank('en')
ner = nlp_real.add_pipe("ner")

examples = []
for text, annots in all_examples:
    examples.append(Example.from_dict(nlp_real.make_doc(text), annots))
nlp_real.initialize(lambda: examples)

<thinc.optimizers.Optimizer at 0x7f5325e28400>

In [None]:
iter = 10
unaffected_pipes = [pipe for pipe in nlp_real.pipe_names if pipe != 'ner']


with nlp_real.disable_pipes(*unaffected_pipes):
    optimizer = nlp_real.begin_training()
    
    for item in range(iter):
        losses = {}
        random.shuffle(examples)
        batches = minibatch(examples, size=compounding(4.0, 16.0, 1.01))
        for batch in batches:
            nlp_real.update(batch, drop=.35, losses=losses)
        print("Losses for epoch {}/{}".format(item+1, iter), losses)         

Losses for epoch 1/10 {'ner': 32684.711174758726}
Losses for epoch 2/10 {'ner': 18145.963492346895}
Losses for epoch 3/10 {'ner': 15261.413774527062}
Losses for epoch 4/10 {'ner': 13685.645772086618}
Losses for epoch 5/10 {'ner': 12899.775864021622}
Losses for epoch 6/10 {'ner': 12066.761543256081}
Losses for epoch 7/10 {'ner': 11579.892368793162}
Losses for epoch 8/10 {'ner': 10910.373842086083}
Losses for epoch 9/10 {'ner': 10672.123514381165}


In [None]:
CEOs = set()
Companies = set()
    
for i in tqdm(range(len(sentences))):
    sentence = sentences[i]
    doc = nlp_real(sentence)
    for ent in doc.ents:
        if ent.label_ == 'CEO':
            CEOs.add(ent.text)
        elif ent.label_ == 'Company':
            Companies.add(ent.text)


### Percentage

Use regex for percentage
We are focusing on a couple of cases here:
- purely numeric cases, like 33%, 3.3%, 0.33%, or 100%)
- a combination of numeric and textual cases, like 33 percent, 33percent, 1 percentile, 2 percentage point, or 3 percentage
- textual percentages: like ten percent, seventy-eight percent



In [468]:
def percentage_regex(percentage_corpus):
    numeric = re.findall("(\-*\d*\.*\,*\-*\d*\/*\\d*s*%)", percentage_corpus)
    text_numeric_percent = re.findall("(\-*\d*\.*\,*\-*\d*\/*\d*\s*percent)", percentage_corpus)
    text_numeric_percent_point = re.findall("(\-*\d*\.*\,*\-*\d*\s*percentage point)", percentage_corpus)
    text_numeric_percentage = re.findall("(\-*\d*\.*\,*\-*\d*\s*percentage)", percentage_corpus)
    text_numeric_percentile = re.findall("(\-*\d*\.*\,*\-*\d*\s*percentile)", percentage_corpus)
    raw_text = [x[0] for x in re.findall('((one|two|twe|thr|thi|for|fou|fif|fiv|six|sev|eig|nin|ten|zer|hun|hal)[a-z|-]*\spercent)', percentage_corpus)]
    hundred = re.findall('one hundred percent', percentage_corpus)
    
    return numeric+text_numeric_percent+text_numeric_percent_point+text_numeric_percentage+text_numeric_percentile+raw_text+hundred

Assessment of the model: use a set difference to assess what we got and what we missed

In [469]:
# Preparing for the test corpus:
'''
- no white space
- no internal quotation mark
- no parens
- joined into a large string for matching
'''
test_corpus = [x.lower() for x in percentage[0].tolist()]
test_corpus_string = ' '.join(test_corpus)
results = percentage_regex(test_corpus_string)

In [470]:
'''
    Since we are taking a precise set difference, we only want to look at the positive samples (at least has percent sign or the prefix "perc")
    we will also take out most symbols for the corpus set such that our extracted results can match with the embedded percentages.
'''
positive_only = []
for text in test_corpus:
    if ("%" in  text) or ('perc' in text):
        positive_only.append(text)
        
result_set = set(results)
test_corpus_set = set([x.strip().replace(',', '').replace('"', '').replace('(', '').replace(')', '').strip('\.\?\*') for x in positive_only])
set_difference = test_corpus_set.difference(result_set)

In [471]:
len(result_set)/len(test_corpus_set)

0.9939315127871695

In [472]:
set_difference

{'+3%',
 '+86%',
 '0.00-0.25%',
 '0.2 percentage points',
 '0.7 percent ±1.3%',
 '04%',
 '1 percentile point',
 '1 to 2 percentage points',
 '1.4 percent ±0.8%',
 '1000 percent',
 '14.0-percent',
 '1400 percent',
 '16 1%',
 '2 71%',
 '2 75%',
 '2 percentile points',
 '2%-2_%',
 '2.2 percentage points',
 '3.0 percent ±3.0%',
 '3000 percent',
 '3641 percent',
 '4.3-percent',
 '5000 percent',
 '516%',
 '80 to 90 percent',
 '_ percentage point',
 'a half of a percentage point',
 'a quarter of a percentage point',
 'eight percentage',
 'half a percent',
 'half percentage',
 'one percentage point',
 'quarter percent',
 'three-quarters of a percentage point',
 'three-tenths of a percentage point',
 'two percentage',
 'two-percent',
 'two-tenths of a percentage point',
 'zero%',
 '~4.22%'}

Generating output

In [473]:
full_corpus = corpus.lower()
all_percentages = percentage_regex(full_corpus)
pd.DataFrame(all_percentages).to_csv('percentage_results.csv')