Roger Mei

In [655]:
# import libraries
import os
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np

# Reading in the data:

In [29]:
path_13 = './2013'
path_14 = './2014'
path_train = './training/'

# load training set
text_2013 = []
text_2014 = []

# reading 2013 texts
for file in os.listdir(path_13):
    text_2013.append(open(os.path.join(path_13,file),'rb').read().decode("utf-8",errors="replace"))
    
# reading 2014 texts
for file in os.listdir(path_14):
    text_2014.append(open(os.path.join(path_14,file),'rb').read().decode("utf-8",errors="replace"))
    
# Combining both 2013 and 2014 texts into one list
all_text = text_2013 + text_2014
    
# reading training data
ceo = pd.read_csv(path_train + 'ceo.csv',header=None, encoding='latin-1')
companies = pd.read_csv(path_train + 'companies.csv',header=None, encoding='latin-1')
percentage = pd.read_csv(path_train + 'percentage.csv',header=None, encoding='latin-1')

Sentence tokenizing:

In [219]:
# list of all sentence tokenized articles
sentences = []
for txt in all_text:
    sent = sent_tokenize(txt)
    sentences.append(sent)

# Extracting percentages

Percentage regular expressions:

In [529]:
# one percent/1.0 percent/one.0 percent/1,000,000.00 percent/ (-5,000,000/2) percentage/ 1 - 2 percentage(ile)
percentage_pat1 = r'((?:\(?-?\w*[,\d\/]*(\.\d+)? (to |- ))?\(?-?\w*[,\d\/]*(\.\d+)? ?percent(?:age)?(?:ile)?(?: points?)?\)?)'
# one %/1.0 %/-5,000,000.0%/(50.0%)/(5,000,000%)/ one to two %/(-1/2 to 2/232,23013.2 %)/ 1% to 2%
percentage_pat2 = r'((?:\(?-?\w*[,\d\/%]*(\.\d+)?%? (to |- ))?\(?-?\w*[,\d\/]*(\.\d+)? ?%\)?)'

#captures pat1 and pat2 without spaces
#0.00-0.25%/5,000,000-6,000,000%/6-1/2 %
percentage_pat3 = r'\w+[,\w\/\.]*-\w+[,\w\/\.]* ?%'
# forty-one percent/single-digit percent/50-80 percent/0.00-0.25 percent[age][ile]
percentage_pat4 = r'\w+[,\w\/\.]*-\w+[,\w\/\.]* ?percent(?:age)?(?:ile)?(?: points?)?\)?)'

# point five percent[age][ile]
percentage_pat5 = r'point \w+ percent(?:age)?(?:ile)?'
# half of a percentage point/three-quarters of a percentage point
percentage_pat6 = r'\w+[\.\/\w,]* of a percent(?:age)?(?:ile)? ?(?:point)?'

def get_percent(txt):
    '''
    input:
    txt - string: piece of text 
    return:
    p - list: list of matches
    Given a piece of text, finds all instances of numbers involving percentages
    '''
    p = re.findall(percentage_pat1,txt)
    p = [group[0] for group in p]
    
    p2 = re.findall(percentage_pat2,txt)
    # only take the full group
    p2 = [group[0] for group in p2]
    p += p2
    
    p3 = re.findall(percentage_pat3,txt)
    p += p3
    
    p4 = re.findall(percentage_pat4,txt)
    p += p4
    
    p5 = re.findall(percentage_pat5,txt)
    p += p5
    
    p6 = re.findall(percentage_pat6,txt)
    p += p6
    
    
    return p

data exploration:

In [532]:
examples = []
for article in sentences:
    for sentence in article:
        matches = get_percent(sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                examples.append(result)

In [254]:
examples[407] # "the" (a stop word) begins a negative example

('Sober Look As the percentage of money-losing firms rises, corporate default rates should follow.',
 'the percentage')

In [275]:
examples[425] # "the" precedes a negative example as in "the 16-week percent"

('This chart (via John Brynjolfsson) puts it all in context, showing the 16-week percent change in 10-Year yields over the last few years.',
 '16-week percent')

In [290]:
examples[5000] # down/up precedes a positive example

('The unemployment rate for July to September 2013 was 7.6% of the economically active population, down 0.2 percentage points from April to June 2013.',
 '0.2 percentage points')

In [469]:
examples[6] # a noun, not a cardinal number, begins a negative example

('Bernanke ended up doing the opposite – he essentially encouraged the rise in yields, saying it was for good reasons – and investors began dumping bonds, sending the yield on the 10-year Treasury note to 2.35% at the close, up 14 basis points from where it was before the release of the FOMC monetary policy statement and subsequent Bernanke presser this afternoon.',
 'note to 2.35%')

Out of all matches that are two words in length, throw away any matches that begin with a stop word. Also remove any matches that begin with an adjective ('JJ') such as same, fixed, small, and big. Matches usually begin with a cardinal number ('CD'). However, adjectives with '-' in it such as 'fifty-five' are kept.

In [533]:
stop_words = set(stopwords.words('english'))
nltk.pos_tag(tks)

filtered_examples = []
for example in examples:
    match = example[1]
    # for matches with 2 words
    # examples: same percent, a percent
    if len(match.split()) == 2:
        # remove matches that begin with any adjectives (small, big, fixed, same) that is not a cardinal number 'CD'
        # remove matches that begin with a verb
        pos_of_first_word = nltk.pos_tag(match)[0][1]
        # remove matches that begin with a stop word
        if (match.split()[0].lower() not in stop_words) and (pos_of_first_word == 'CD'):
            filtered_examples.append(example)
        # fifty-five is an adjective ('JJ') but is kept since it contains a hyphen ('-')
        elif (match.split()[0].lower() not in stop_words) and (pos_of_first_word == 'JJ') and ('-' in match.split()[0]):
            filtered_examples.append(example)
            # if false, don't do anything
    # for matches with 3 words
    # example: get to 2%
    elif len(match.split()) == 3:
        pos_of_first_word = nltk.pos_tag(match)[0][1]
        if (match.split()[0].lower() not in stop_words) and (pos_of_first_word == 'CD'):
            filtered_examples.append(example)
        elif (match.split()[0].lower() not in stop_words) and (pos_of_first_word == 'JJ') and ('-' in match.split()[0]):
            filtered_examples.append(example)
        else:
            # i.e. only get '2%' in 'get to 2%'
            filtered_examples.append((example[0],example[1].split()[-1]))
    else:
        filtered_examples.append(example)

In [850]:
# writing percentages to a txt file
percentage_file = 'all_percentages.txt'
with open(percentage_file, 'a+') as the_file:
    for example in filtered_examples:
        the_file.write('("' + example[0] + '", "' + example[1] + '")\n')

# Capitalized words regular expressions

In [612]:
capitalized_regex = r'([A-Z][A-Za-z\.]+(?=\s[A-Z])(?:\s[A-Z][A-Za-z\.]+)+)'
capitalized_examples = []
for article in sentences:
    for sentence in article:
        matches = re.findall(capitalized_regex,sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                capitalized_examples.append(result)

# remove any words that do not have a proper noun
filtered_capitalized_examples = []
for example in capitalized_examples:
    has_NNP = False
    for tag in nltk.pos_tag(word_tokenize(example[1])):
        if tag[1] == 'NNP':
            has_NNP = True
    if has_NNP:
        filtered_capitalized_examples.append(example)

single_capitalized = r'([A-Z][A-Za-z\.]+)(?:(?:\s[a-z]+)|\.)'
single_cap_examples = []
for article in sentences:
    for sentence in article:
        matches = re.findall(single_capitalized,sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                single_cap_examples.append(result)

filtered_single_cap_examples = []
for example in single_cap_examples:
    tag = nltk.pos_tag([example[1]])[0][1]
    # if the single word is not a stop word and is not a pronoun
    if (example[1].lower() not in stop_words) and ('PRP' not in tag):
        filtered_single_cap_examples.append(example)

# Extracting CEO names

data exploration to look for features of CEO names:

In [644]:
# stop after 100000 samples, do not need every example
ceo_examples = []

# loop through all rows
for i in range(ceo.shape[0]):
    # create regex
    first_name = ceo[0][i]
    last_exists = (type(ceo[1][i]) == str)
    first_exists = (type(ceo[0][i]) == str)
    if last_exists and first_exists:
        last_name = ceo[1][i]
        # concatenate to create full name (first last)
        full_name = first_name + ' ' + last_name
        regex1 = re.escape(full_name)
        # concatenante to create full name ver. 2 (last, first)
        last_first = last_name + ', ' + first_name
        regex2 = re.escape(last_first)
    elif first_exists:
        regex3 = re.escape(first_name)
    
    for article in sentences:
        for sentence in article:
            if last_exists and first_exists:
                matches = re.findall(regex1,sentence)
                if len(matches):
                    for m in matches:
                        result = (sentence,m)
                        ceo_examples.append(result)
                matches2 = re.findall(regex2,sentence)
                if len(matches2):
                    for m in matches2:
                        result = (sentence,m)
                        ceo_examples.append(result)
            elif first_exists:
                matches3 = re.findall(regex3,sentence)
                if len(matches3):
                    for m in matches3:
                        result = (sentence,m)
                        ceo_examples.append(result)
    if len(ceo_examples) > 100000:
        break

In [544]:
ceo_examples[100] # verb follows ceo name

("Thinkorswim\r\nReutersAs a manager, JP Morgan CEO Jamie Dimon doesn't like when colleagues throw each other under the bus, MarketWatch's Sital Patel reports.",
 'Jamie Dimon')

In [636]:
ceo_examples[1011] # verb follows ceo name

('Like many California residents, Elon Musk hates driving on Interstate 405 at rush hour.',
 'Elon Musk')

In [639]:
ceo_examples[106] # another proper noun (NNP) "Chase" is in sentence

("Incidentally, I don't remember hearing anything from Jamie Dimon at the time Chase was acquiring these banks about any reluctance to buy up two firms that had just spent years helping to blow up the world economic system with phony loans.",
 'Jamie Dimon')

In [888]:
for i in range(100):
    print(ceo_examples[i])

("American's CEO, Tom Horton, will serve as chairman of the new company until mid-2014, these people said.", 'Tom Horton')
("In the meantime, reports of IGT CEO Patti Hart's management style have made it beyond the Las Vegas press and into the mainstream media.", 'Patti Hart')
('“I recently read a magazine article in which my CEO, Patti Hart, was featured.', 'Patti Hart')
('It just shows how far removed Patti Hart is from daily life at IGT.', 'Patti Hart')
('Patti Hart once told her employees in a recorded video, ‘It is too expensive and difficult to reclaim market share.’ She also once remarked in an internal online Q &amp; A , ‘Executives need business class travel, because when they arrive at a destination, they are expected to hit the ground running and be immediately productive’.', 'Patti Hart')
("On February 1st IGT CEO Patti Hart and Chairman Phil Satre wrote a letter bashing Chuck Mathewson, one of Ader's board picks and a former Chairman of IGT.", 'Patti Hart')
("In fact, back

creating positive examples:

In [877]:
import random
pos_samples = random.sample(ceo_examples,255)

true_pos_samples = []
for sample in pos_samples:
    if len(sample[1].split()) > 1:
        true_pos_samples.append(sample)

len(true_pos_samples)

196

In [907]:
# features: NNP proper noun, number of proper noun tags in sentence that are not in match,contains 'CEO' in sentence, is a stop word in the match?, length of match, number of words in match,"company" in sentence verb follows it?, number of verb tags in sentence, contains a cardinal digit tag in the sentence, contains 'head' in sentence
colnames = ['is_NNP','num_NNP','contains_CEO','is_stop','match_len','num_words','followed_by_verb','contains_digit','contains_company','contains_head','contains_illion','contains_pay','contains_investor','contains_quarter','contains_boss','contains_executive','result']
ceo_training_data = pd.DataFrame(columns=colnames)

# positive examples
for example in true_pos_samples:
    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    is_NNP = 1
    for tag in match_tags:
        is_NNP = is_NNP and (tag[1] == 'NNP')
    if is_NNP:
        is_NNP = 1
    else:
        is_NNP = 0
    
    num_NNP = 0
    subtract_this = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            subtract_this += 1
    for tag in sentence_tags:
        if (tag[1] == 'NNP'):
            num_NNP += 1
    num_NNP = num_NNP - subtract_this
    
    
    contains_CEO = ('CEO' in sentence)
    if contains_CEO:
        contains_CEO = 1
    else:
        contains_CEO = 0
    
    is_stop = 0
    for word in match.split():
        if word.lower() in stop_words:
            is_stop = 1
            break
    
    match_len = 0
    for word in match.split():
        match_len += len(word)
    
    num_words = len(match.split())
    
    for word in word_tokenize(sentence):
        if word_tokenize(match)[-1] in word:
            following_index = word_tokenize(sentence).index(word) + 1
    if following_index >= len(sentence_tags):
        followed_by_verb = 0
    else:
        followed_by_verb = 'VB' in sentence_tags[following_index][1] 
    if followed_by_verb:
        followed_by_verb = 1
    else:
        followed_by_verb = 0
    
    contains_digit = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            contains_digit = 1
            break
            
    contains_company = ('compan' in sentence.lower()) 
    if contains_company:
        contains_company = 1
    else:
        contains_company = 0
    
    contains_head = ('head' in sentence.lower()) 
    if contains_head:
        contains_head = 1
    else:
        contains_head = 0
    
    if ('illion' in sentence.lower()):
        contains_illion = 1
    else:
        contains_illion = 0
    
    contains_pay = ('pay' in sentence.lower()) 
    if contains_pay:
        contains_pay = 1
    else:
        contains_pay = 0
    
    contains_investor = ('investor' in sentence.lower()) 
    if contains_investor:
        contains_investor = 1
    else:
        contains_investor = 0
    
    contains_quarter = ('quarter' in sentence.lower()) 
    if contains_quarter:
        contains_quarter = 1
    else:
        contains_quarter = 0
    
    contains_boss = ('boss' in sentence.lower()) 
    if contains_boss:
        contains_boss = 1
    else:
        contains_boss = 0
    
    contains_executive = ('executive' in sentence.lower()) 
    if contains_executive:
        contains_executive = 1
    else:
        contains_executive = 0
    
    result = 1
    
    ceo_training_data = ceo_training_data.append({
     'is_NNP': is_NNP,
     'num_NNP': num_NNP,
     'contains_CEO': contains_CEO,
     'is_stop': is_stop,
     'match_len': match_len,
     'num_words': num_words,
     'followed_by_verb':followed_by_verb,
     'contains_digit': contains_digit, 
     'contains_company': contains_company,
     'contains_head': contains_head,
     'contains_illion': contains_illion,
     'contains_pay': contains_pay,
     'contains_investor': contains_investor,
     'contains_quarter': contains_quarter,
     'contains_boss': contains_boss,
     'contains_executive': contains_executive,
     'result': result
      }, ignore_index=True)

creating negative examples:

In [868]:
# manually extract negative examples from corpus and saved into txt file
# 196 negative samples extracted
text_neg = []
path_neg = './ceo_negative_examples'
for file in os.listdir(path_neg):
    text_neg.append(open(os.path.join(path_neg,file),'rb').read().decode("utf-8",errors="replace"))

# list of all sentence tokenized articles
sentences_neg = []
for txt in text_neg:
    sentences_neg.append(sent_tokenize(txt))
    
capitalized_regex = r'([A-Z][A-Za-z\.]+(?=\s[A-Z])(?:\s[A-Z][A-Za-z\.]+)+)'
true_negative_samples = []
for article in sentences_neg:
    for sentence in article:
        matches = re.findall(capitalized_regex,sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                true_negative_samples.append(result)
                
len(true_negative_samples)

196

In [908]:
for example in true_negative_samples:
    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    is_NNP = 1
    for tag in match_tags:
        is_NNP = is_NNP and (tag[1] == 'NNP')
    if is_NNP:
        is_NNP = 1
    else:
        is_NNP = 0
    
    num_NNP = 0
    subtract_this = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            subtract_this += 1
    for tag in sentence_tags:
        if (tag[1] == 'NNP'):
            num_NNP += 1
    num_NNP = num_NNP - subtract_this
    
    
    contains_CEO = 'CEO' in sentence
    if contains_CEO:
        contains_CEO = 1
    else:
        contains_CEO = 0
    
    is_stop = 0
    for word in match.split():
        if word.lower() in stop_words:
            is_stop = 1
            break
    
    match_len = 0
    for word in match.split():
        match_len += len(word)
    
    num_words = len(match.split())
    
    for word in word_tokenize(sentence):
        if word_tokenize(match)[-1] in word:
            following_index = word_tokenize(sentence).index(word) + 1
    if following_index >= len(sentence_tags):
        followed_by_verb = 0
    else:
        followed_by_verb = 'VB' in sentence_tags[following_index][1] 
    if followed_by_verb:
        followed_by_verb = 1
    else:
        followed_by_verb = 0
    
    contains_digit = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            contains_digit = 1
            break
            
    contains_company = ('compan' in sentence.lower()) 
    if contains_company:
        contains_company = 1
    else:
        contains_company = 0
    
    contains_head = ('head' in sentence.lower()) 
    if contains_head:
        contains_head = 1
    else:
        contains_head = 0
    
    if ('illion' in sentence.lower()):
        contains_illion = 1
    else:
        contains_illion = 0
    
    contains_pay = ('pay' in sentence.lower()) 
    if contains_pay:
        contains_pay = 1
    else:
        contains_pay = 0
    
    contains_investor = ('investor' in sentence.lower()) 
    if contains_investor:
        contains_investor = 1
    else:
        contains_investor = 0
    
    contains_quarter = ('quarter' in sentence.lower()) 
    if contains_quarter:
        contains_quarter = 1
    else:
        contains_quarter = 0
    
    contains_boss = ('boss' in sentence.lower()) 
    if contains_boss:
        contains_boss = 1
    else:
        contains_boss = 0
    
    contains_executive = ('executive' in sentence.lower()) 
    if contains_executive:
        contains_executive = 1
    else:
        contains_executive = 0
    
    result = 0
    
    ceo_training_data = ceo_training_data.append({
     'is_NNP': is_NNP,
     'num_NNP': num_NNP,
     'contains_CEO': contains_CEO,
     'is_stop': is_stop,
     'match_len': match_len,
     'num_words': num_words,
     'followed_by_verb':followed_by_verb,
     'contains_digit': contains_digit, 
     'contains_company': contains_company,
     'contains_head': contains_head,
     'contains_illion': contains_illion,
     'contains_pay': contains_pay,
     'contains_investor': contains_investor,
     'contains_quarter': contains_quarter,
     'contains_boss': contains_boss,
     'contains_executive': contains_executive,
     'result': result
      }, ignore_index=True)
    
ceo_training_data.shape

(392, 17)

classification with logistic regression:

In [909]:
# training data: 145 positive, 149 negative, 294 total
ceo_training_data = ceo_training_data.apply(pd.to_numeric)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X = ceo_training_data.iloc[:,:-1]
y = ceo_training_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Normalization
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [911]:
# test data: 47 positive, 51 negative, 98 total
# Normalization
X_test_scaled = scaler.transform(X_test)

y_pred = classifier.predict(X_test_scaled)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test_scaled, y_test)))
print('Precision of logistic regression classifier on test set: {:.2f}'.format(50/(50+20)))
print('Recall of logistic regression classifier on test set: {:.2f}'.format(50/(50+1)))
# results based on random initialization of positive examples

[[27 20]
 [ 1 50]]
Accuracy of logistic regression classifier on test set: 0.79
Precision of logistic regression classifier on test set: 0.71
Recall of logistic regression classifier on test set: 0.98


classifying the capitalized words as ceo names or not:

In [912]:
# features: NNP proper noun, number of proper noun tags in sentence that are not in match,contains 'CEO' in sentence, is a stop word in the match?, length of match, number of words in match, verb follows it?, number of verb tags in sentence
colnames = ['is_NNP','num_NNP','contains_CEO','is_stop','match_len','num_words','followed_by_verb','contains_digit','contains_head','contains_illion','contains_company','contains_pay','contains_investor','contains_quarter','contains_boss','contains_executive']
classified_ceo_names = []

for example in filtered_capitalized_examples:
    # initialize feature vector of a single example
    feature_vector = pd.DataFrame(columns=colnames)
    
    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    is_NNP = 1
    for tag in match_tags:
        is_NNP = is_NNP and (tag[1] == 'NNP')
    if is_NNP:
        is_NNP = 1
    else:
        is_NNP = 0
    
    num_NNP = 0
    subtract_this = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            subtract_this += 1
    for tag in sentence_tags:
        if (tag[1] == 'NNP'):
            num_NNP += 1
    num_NNP = num_NNP - subtract_this
    
    
    contains_CEO = 'CEO' in sentence
    if contains_CEO:
        contains_CEO = 1
    else:
        contains_CEO = 0
    
    is_stop = 0
    for word in match.split():
        if word.lower() in stop_words:
            is_stop = 1
            break
    
    match_len = 0
    for word in match.split():
        match_len += len(word)
    
    num_words = len(match.split())
    
    for word in word_tokenize(sentence):
        if word_tokenize(match)[-1] in word:
            following_index = word_tokenize(sentence).index(word) + 1
    if following_index >= len(sentence_tags):
        followed_by_verb = 0
    else:
        followed_by_verb = 'VB' in sentence_tags[following_index][1] 
    if followed_by_verb:
        followed_by_verb = 1
    else:
        followed_by_verb = 0
            
    contains_digit = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            contains_digit = 1
            break
    
    contains_company = ('compan' in sentence.lower())
    if contains_company:
        contains_company = 1
    else:
        contains_company = 0
    
    contains_head = ('head' in sentence.lower()) 
    if contains_head:
        contains_head = 1
    else:
        contains_head = 0
    
    if ('illion' in sentence.lower()):
        contains_illion = 1
    else:
        contains_illion = 0
    
    contains_pay = ('pay' in sentence.lower()) 
    if contains_pay:
        contains_pay = 1
    else:
        contains_pay = 0
    
    contains_investor = ('investor' in sentence.lower()) 
    if contains_investor:
        contains_investor = 1
    else:
        contains_investor = 0
    
    contains_quarter = ('quarter' in sentence.lower()) 
    if contains_quarter:
        contains_quarter = 1
    else:
        contains_quarter = 0
    
    contains_boss = ('boss' in sentence.lower()) 
    if contains_boss:
        contains_boss = 1
    else:
        contains_boss = 0
    
    contains_executive = ('executive' in sentence.lower()) 
    if contains_executive:
        contains_executive = 1
    else:
        contains_executive = 0
    
    contains_percent = ('%' in sentence) or ('percent' in sentence.lower())
    if contains_percent:
        contains_percent = 1
    else:
        contains_percent = 0
    
    contains_dollar_sign = ('$' in sentence) 
    if contains_dollar_sign:
        contains_dollar_sign = 1
    else:
        contains_dollar_sign = 0
    
    feature_vector = feature_vector.append({
     'is_NNP': is_NNP,
     'num_NNP': num_NNP,
     'contains_CEO': contains_CEO,
     'is_stop': is_stop,
     'match_len': match_len,
     'num_words': num_words,
     'followed_by_verb':followed_by_verb,
     'contains_digit': contains_digit,
     'contains_company': contains_company,
     'contains_head': contains_head,
     'contains_illion': contains_illion,
     'contains_pay': contains_pay,
     'contains_investor': contains_investor,
     'contains_quarter': contains_quarter,
     'contains_boss': contains_boss,
     'contains_executive': contains_executive
      }, ignore_index=True)
    
    # Normalization
    feature_scaled = scaler.transform(feature_vector)
    predicted_result = classifier.predict(feature_scaled)
    if predicted_result:
        classified_ceo_names.append(example)

In [913]:
# writing ceo names to a file
ceo_file = 'all_ceo_names.txt'
with open(ceo_file, 'a+') as the_file:
    for example in classified_ceo_names:
        the_file.write('("' + example[0] + '", "' + example[1] + '")\n')

# Extracting Companies

creating positive examples:

In [841]:
# stop after 300,000 samples, do not need every example
company_examples = []

# loop through all rows
for i in range(companies.shape[0]):
    # create regex
    company_name = companies[0][i]
    co_regex = re.escape(company_name)
    
    for article in sentences:
        for sentence in article:
            matches = re.findall(co_regex,sentence)
            if len(matches):
                for m in matches:
                    result = (sentence,m)
                    company_examples.append(result)
                    
    if len(company_examples) > 300000:
        break

comp_pos_samples = random.sample(company_examples,151)
len(comp_pos_samples)

151

In [842]:
#features
#contains inc/co in sentence, contains inc/co in match, contains '$' in sentence, contains 'percent' or '%' in sentence, contains year (4 digit number regex) in sentence, number of CD's in sentence tags, comtains 'compan', number of proper nouns 'NNP' in match, contains 'stock' in sentence, contains 'quarter' in sentence, contains 'revenue' in sentence, contains 'share' in sentence, contains 'illion' (as in million/billion) in sentence
colnames = ['inc_co_sent','inc_co_match','dollar_sign_sent','percent_sent','year_sent','num_CD','compan_sent','num_NNP_match','stock_sent','quarter_sent','revenue_sent', 'share_sent', 'illion_sent','result']
comp_training_data = pd.DataFrame(columns=colnames)

year_regex = r'\d{4}'
# positive examples
for example in comp_pos_samples:
    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    if ('inc' in sentence.lower()) or ('co' in sentence.lower()):
        inc_co_sent = 1
    else:
        inc_co_sent = 0
       
    if ('inc' in match.lower()) or ('co' in match.lower()):
        inc_co_match = 1
    else:
        inc_co_match = 0
    
    if '$' in sentence:
        dollar_sign_sent = 1
    else:
        dollar_sign_sent = 0
    
    if ('percent' in sentence.lower()) or ('%' in sentence.lower()):
        percent_sent = 1
    else:
        percent_sent = 0
    
    if len(re.findall(year_regex,sentence)):
        year_sent = 1
    else:
        year_sent = 0
    
    num_CD = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            num_verbs += 1
    
    if ('compan' in sentence.lower()):
        compan_sent = 1
    else:
        compan_sent = 0
        
    num_NNP_match = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            num_NNP_match += 1

    if ('stock' in sentence.lower()):
        stock_sent = 1
    else:
        stock_sent = 0
    
    if ('quarter' in sentence.lower()):
        quarter_sent = 1
    else:
        quarter_sent = 0
    
    if ('revenue' in sentence.lower()):
        revenue_sent = 1
    else:
        revenue_sent = 0
    
    if ('share' in sentence.lower()):
        share_sent = 1
    else:
        share_sent = 0
    
    if ('illion' in sentence.lower()):
        illion_sent = 1
    else:
        illion_sent = 0
    
    result = 1
    
    comp_training_data = comp_training_data.append({
     'inc_co_sent': inc_co_sent,
     'inc_co_match': inc_co_match,
     'dollar_sign_sent': dollar_sign_sent,
     'percent_sent': percent_sent,
     'year_sent': year_sent,
     'num_CD': num_CD,
     'compan_sent': compan_sent,
     'num_NNP_match': num_NNP_match,
     'stock_sent': stock_sent,
     'quarter_sent': quarter_sent,
     'revenue_sent': revenue_sent,
     'share_sent': share_sent,
     'illion_sent': illion_sent,
     'result': result
      }, ignore_index=True)

creating negative examples:

In [840]:
# manually extract negative examples from corpus and saved into txt file
comp_text_neg = []
path_neg = './company_negative_examples'
for file in os.listdir(path_neg):
    comp_text_neg.append(open(os.path.join(path_neg,file),'rb').read().decode("utf-8",errors="replace"))

# list of all sentence tokenized articles
comp_sentences_neg = []
for txt in comp_text_neg:
    comp_sentences_neg.append(sent_tokenize(txt))
    
capitalized_regex = r'([A-Z][A-Za-z\.]+(?=\s[A-Z])(?:\s[A-Z][A-Za-z\.]+)+)'
single_capitalized = r'([A-Z][A-Za-z\.]+)(?:(?:\s[a-z]+)|\.)'
comp_neg_samples = []
for article in comp_sentences_neg:
    for sentence in article:
        
        matches = re.findall(capitalized_regex,sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                comp_neg_samples.append(result)
        
        matches = re.findall(single_capitalized,sentence)
        # if there are matches, then append (sentence, match) to list of examples
        if len(matches):
            for m in matches:
                result = (sentence,m)
                single_cap_examples.append(result)

len(comp_neg_samples)

151

In [843]:
# negative examples
for example in comp_neg_samples:
    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    if ('inc' in sentence.lower()) or ('co' in sentence.lower()):
        inc_co_sent = 1
    else:
        inc_co_sent = 0
       
    if ('inc' in match.lower()) or ('co' in match.lower()):
        inc_co_match = 1
    else:
        inc_co_match = 0
    
    if '$' in sentence:
        dollar_sign_sent = 1
    else:
        dollar_sign_sent = 0
    
    if ('percent' in sentence.lower()) or ('%' in sentence.lower()):
        percent_sent = 1
    else:
        percent_sent = 0
    
    if len(re.findall(year_regex,sentence)):
        year_sent = 1
    else:
        year_sent = 0
    
    num_CD = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            num_verbs += 1
    
    if ('compan' in sentence.lower()):
        compan_sent = 1
    else:
        compan_sent = 0
        
    num_NNP_match = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            num_NNP_match += 1

    if ('stock' in sentence.lower()):
        stock_sent = 1
    else:
        stock_sent = 0
    
    if ('quarter' in sentence.lower()):
        quarter_sent = 1
    else:
        quarter_sent = 0
    
    if ('revenue' in sentence.lower()):
        revenue_sent = 1
    else:
        revenue_sent = 0
    
    if ('share' in sentence.lower()):
        share_sent = 1
    else:
        share_sent = 0
    
    if ('illion' in sentence.lower()):
        illion_sent = 1
    else:
        illion_sent = 0
    
    result = 0
    
    comp_training_data = comp_training_data.append({
     'inc_co_sent': inc_co_sent,
     'inc_co_match': inc_co_match,
     'dollar_sign_sent': dollar_sign_sent,
     'percent_sent': percent_sent,
     'year_sent': year_sent,
     'num_CD': num_CD,
     'compan_sent': compan_sent,
     'num_NNP_match': num_NNP_match,
     'stock_sent': stock_sent,
     'quarter_sent': quarter_sent,
     'revenue_sent': revenue_sent,
     'share_sent': share_sent,
     'illion_sent': illion_sent,
     'result': result
      }, ignore_index=True)

comp_training_data.shape

(302, 14)

classification with logisitic regression:

In [914]:
# training data: 113 positive, 113 negative, 226 total
comp_training_data = comp_training_data.apply(pd.to_numeric)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X = comp_training_data.iloc[:,:-1]
y = comp_training_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Normalization
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [915]:
# test data: 38 positive, 38 negative, 76 total
# Normalization
X_test_scaled = scaler.transform(X_test)

y_pred = classifier.predict(X_test_scaled)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test_scaled, y_test)))
print('Precision of logistic regression classifier on test set: {:.2f}'.format(34/39))
print('Recall of logistic regression classifier on test set: {:.2f}'.format(34/38))
# results based on random initialization of positive examples

[[33  5]
 [ 4 34]]
Accuracy of logistic regression classifier on test set: 0.88
Precision of logistic regression classifier on test set: 0.87
Recall of logistic regression classifier on test set: 0.89


classifiying capitalized words as company or not:

In [916]:
colnames = ['inc_co_sent','inc_co_match','dollar_sign_sent','percent_sent','year_sent','num_CD','compan_sent','num_NNP_match','stock_sent','quarter_sent','revenue_sent', 'share_sent', 'illion_sent']
classified_company_names = []

# multi-word capitalized examples
for example in filtered_capitalized_examples:
    # initialize feature vector of a single example
    feature_vector = pd.DataFrame(columns=colnames)

    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    if ('inc' in sentence.lower()) or ('co' in sentence.lower()):
        inc_co_sent = 1
    else:
        inc_co_sent = 0
       
    if ('inc' in match.lower()) or ('co' in match.lower()):
        inc_co_match = 1
    else:
        inc_co_match = 0
    
    if '$' in sentence:
        dollar_sign_sent = 1
    else:
        dollar_sign_sent = 0
    
    if ('percent' in sentence.lower()) or ('%' in sentence.lower()):
        percent_sent = 1
    else:
        percent_sent = 0
    
    if len(re.findall(year_regex,sentence)):
        year_sent = 1
    else:
        year_sent = 0
    
    num_CD = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            num_verbs += 1
    
    if ('compan' in sentence.lower()):
        compan_sent = 1
    else:
        compan_sent = 0
        
    num_NNP_match = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            num_NNP_match += 1

    if ('stock' in sentence.lower()):
        stock_sent = 1
    else:
        stock_sent = 0
    
    if ('quarter' in sentence.lower()):
        quarter_sent = 1
    else:
        quarter_sent = 0
    
    if ('revenue' in sentence.lower()):
        revenue_sent = 1
    else:
        revenue_sent = 0
    
    if ('share' in sentence.lower()):
        share_sent = 1
    else:
        share_sent = 0
    
    if ('illion' in sentence.lower()):
        illion_sent = 1
    else:
        illion_sent = 0
    
    feature_vector = feature_vector.append({
     'inc_co_sent': inc_co_sent,
     'inc_co_match': inc_co_match,
     'dollar_sign_sent': dollar_sign_sent,
     'percent_sent': percent_sent,
     'year_sent': year_sent,
     'num_CD': num_CD,
     'compan_sent': compan_sent,
     'num_NNP_match': num_NNP_match,
     'stock_sent': stock_sent,
     'quarter_sent': quarter_sent,
     'revenue_sent': revenue_sent,
     'share_sent': share_sent,
     'illion_sent': illion_sent
      }, ignore_index=True)
    
    # Normalization
    feature_scaled = scaler.transform(feature_vector)
    
    predicted_result = classifier.predict(feature_scaled)
    if predicted_result:
        classified_company_names.append(example)

# single word capitalized examples
for example in filtered_single_cap_examples:
    # initialize feature vector of a single example
    feature_vector = pd.DataFrame(columns=colnames)

    sentence = example[0]
    match = example[1]
    match_tags = nltk.pos_tag(word_tokenize(match))
    sentence_tags = nltk.pos_tag(word_tokenize(sentence))
    
    if ('inc' in sentence.lower()) or ('co' in sentence.lower()):
        inc_co_sent = 1
    else:
        inc_co_sent = 0
       
    if ('inc' in match.lower()) or ('co' in match.lower()):
        inc_co_match = 1
    else:
        inc_co_match = 0
    
    if '$' in sentence:
        dollar_sign_sent = 1
    else:
        dollar_sign_sent = 0
    
    if ('percent' in sentence.lower()) or ('%' in sentence.lower()):
        percent_sent = 1
    else:
        percent_sent = 0
    
    if len(re.findall(year_regex,sentence)):
        year_sent = 1
    else:
        year_sent = 0
    
    num_CD = 0
    for tag in sentence_tags:
        if (tag[1] == 'CD'):
            num_verbs += 1
    
    if ('compan' in sentence.lower()):
        compan_sent = 1
    else:
        compan_sent = 0
        
    num_NNP_match = 0
    for tag in match_tags:
        if (tag[1] == 'NNP'):
            num_NNP_match += 1

    if ('stock' in sentence.lower()):
        stock_sent = 1
    else:
        stock_sent = 0
    
    if ('quarter' in sentence.lower()):
        quarter_sent = 1
    else:
        quarter_sent = 0
    
    if ('revenue' in sentence.lower()):
        revenue_sent = 1
    else:
        revenue_sent = 0
    
    if ('share' in sentence.lower()):
        share_sent = 1
    else:
        share_sent = 0
    
    if ('illion' in sentence.lower()):
        illion_sent = 1
    else:
        illion_sent = 0
    
    feature_vector = feature_vector.append({
     'inc_co_sent': inc_co_sent,
     'inc_co_match': inc_co_match,
     'dollar_sign_sent': dollar_sign_sent,
     'percent_sent': percent_sent,
     'year_sent': year_sent,
     'num_CD': num_CD,
     'compan_sent': compan_sent,
     'num_NNP_match': num_NNP_match,
     'stock_sent': stock_sent,
     'quarter_sent': quarter_sent,
     'revenue_sent': revenue_sent,
     'share_sent': share_sent,
     'illion_sent': illion_sent
      }, ignore_index=True)
    
    # Normalization
    feature_scaled = scaler.transform(feature_vector)
    
    predicted_result = classifier.predict(feature_scaled)
    if predicted_result:
        classified_company_names.append(example)

len(classified_company_names)

In [921]:
# writing company names to a file
company_file = 'all_company_names.txt'
company_file2 = 'all_company_names2.txt'

cnt = 0
with open(company_file, 'a+') as the_file, open(company_file2, 'a+') as the_file2:
    for example in classified_company_names:
        if cnt < 450000:
            the_file.write('("' + example[0] + '", "' + example[1] + '")\n')
            cnt += 1
        else:
            the_file2.write('("' + example[0] + '", "' + example[1] + '")\n')